Installing & importing the necessary libraries

In [2]:
!pip install spacy -q
!python -m spacy download en_core_web_sm   # download spacy english dictionary

import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import metrics

import string  # package used for punctuations
import spacy  # package for NLP/text processing
np.random.seed(42)

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 6.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [64]:
data = pd.read_csv("train.csv")
data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [65]:
print(len(data))
print((data.shape))

159571
(159571, 8)


Filter out only first 1000 rows & select only 'toxic' & 'comment_text' columns

In [66]:
data = data[:1000][['comment_text', 'toxic']]
print(len(data))
print((data.shape))

1000
(1000, 2)


In [67]:
data.head(5)

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [68]:
# Load nlp spacy english dictionary & add stop words

nlp = spacy.load("en_core_web_sm")  # _sm for small
stop_words = nlp.Defaults.stop_words
print(stop_words) # view all stop words

{'own', 'may', 'has', 'must', 'here', 'an', 'seeming', 'is', 'below', 'why', 'across', 'her', 'upon', 'formerly', 'whatever', 'whenever', 'someone', 'do', 'amount', 'one', 'him', 'therefore', 'along', 'those', 'from', 'around', 'am', 'forty', 'hundred', "'d", 'enough', 'since', 'former', 'anything', 'thus', 'empty', 'somehow', 'make', 'unless', 'see', 'although', 'give', 'sometimes', 'does', 'three', 'five', 'together', 'some', 'alone', 'hereupon', 'whose', 'himself', 'which', 'anywhere', 'once', 'now', 'everywhere', 'only', 'also', 'never', 'yourselves', 'being', 'beforehand', 'while', 'sometime', 'otherwise', 'less', 'hence', 'a', 'show', 'others', 'about', 'becomes', 'moreover', 'whither', 'ten', '‘s', 'always', 'if', 'made', 'please', 'doing', 'themselves', 'neither', 'had', 'its', 'down', 'top', 'side', 'several', 'were', 'their', 'they', 'are', 'ca', 'in', 'wherein', 'without', 'mine', 'move', 'back', '‘ve', 'nothing', 'into', 'last', 'out', 'you', '’ll', 'we', 'where', 'over', '

In [69]:
# load the punctuations from string package
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [70]:
# Creating our tokenizer function
def spacy_tokenizer_sample(sentence):
    # Creating token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)
    print('The raw text: ')
    print('--'*50)
    print(doc)
    print(type(doc))  # this is a doc object
    print('--'*50)

    # Lemmatizing each token and converting each token into lowercase, strip() to remove white spaces
    mytokens = [ word.lemma_.lower().strip() for word in doc ]
    print('The tokenized text: ')
    print(mytokens)
    print('--'*50)

    # Removing stop words & punctuations from each of the sentences in our dataset
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    print('New tokens post removal of stop words & punctuations... ')
    print('--'*50)
    # return preprocessed list of tokens
    return mytokens

In [71]:
# Check the function working for sample sentence

sentence = "Are you gone nuts?? !! }"
spacy_tokenizer_sample(sentence)

The raw text: 
----------------------------------------------------------------------------------------------------
Are you gone nuts?? !! }
<class 'spacy.tokens.doc.Doc'>
----------------------------------------------------------------------------------------------------
The tokenized text: 
['be', 'you', 'go', 'nuts', '?', '?', '!', '!', '}']
----------------------------------------------------------------------------------------------------
New tokens post removal of stop words & punctuations... 
----------------------------------------------------------------------------------------------------


['nuts']

### Count Vectorizer

In [72]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer_sample)

In [73]:
count_vector.fit_transform(["I am eating apple, I like apple","I am playing cricket"]).toarray()

The raw text: 
----------------------------------------------------------------------------------------------------
i am eating apple, i like apple
<class 'spacy.tokens.doc.Doc'>
----------------------------------------------------------------------------------------------------
The tokenized text: 
['i', 'be', 'eat', 'apple', ',', 'i', 'like', 'apple']
----------------------------------------------------------------------------------------------------
New tokens post removal of stop words & punctuations... 
----------------------------------------------------------------------------------------------------
The raw text: 
----------------------------------------------------------------------------------------------------
i am playing cricket
<class 'spacy.tokens.doc.Doc'>
----------------------------------------------------------------------------------------------------
The tokenized text: 
['i', 'be', 'play', 'cricket']
----------------------------------------------------------------

array([[2, 0, 1, 1, 0],
       [0, 1, 0, 0, 1]], dtype=int64)

The numbers inside the array depict the no of times each word has appeared in the sentence/document

In [74]:
count_vector.get_feature_names_out()  # all feature names used/extracted

array(['apple', 'cricket', 'eat', 'like', 'play'], dtype=object)

In [75]:
count_vector.vocabulary_   # feature extracted along with its index

{'eat': 2, 'apple': 0, 'like': 3, 'play': 4, 'cricket': 1}

### Model with Count Vectorizer

In [76]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)
#     print('The raw text: ')
#     print('--'*50)
#     print(doc)
#     print(type(doc))  # this is a doc object
#     print('--'*50)

    # Lemmatizing each token and converting each token into lowercase, strip() to remove white spaces
    mytokens = [ word.lemma_.lower().strip() for word in doc ]
#     print('The tokenized text: ')
#     print(mytokens)
#     print('--'*50)

    # Removing stop words & punctuations from each of the sentences in our dataset
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
#     print('New tokens post removal of stop words & punctuations... ')
#     print('--'*50)
    # return preprocessed list of tokens
    return mytokens

In [77]:
from sklearn.model_selection import train_test_split

X = data['comment_text'] # the features
ylabels = data['toxic'] # the labels

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3,stratify=ylabels)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
count_vector = CountVectorizer(tokenizer = spacy_tokenizer)

#Extracting features for train and test data
X_train_vectors= count_vector.fit_transform(X_train)
X_test_vectors= count_vector.transform(X_test)


print(X_train_vectors.shape)  #4062 words/features are created
print(X_test_vectors.shape)

(700, 6620)
(300, 6620)


In [78]:
X_train_vectors.toarray()  # visualizing the matrix/representation created for the text data --its very sparse

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [79]:
classifier.fit(X_train_vectors,y_train)  # fitting the model

In [80]:
predicted = classifier.predict(X_test_vectors)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9166666666666666
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.21875


### Model with TFIDF 

In [81]:
# In TFIDF--> term freq means freq of word/term in a document, doc freq means freq of the word in the entire corpus
# if a word appears more frequently within document and also in the corpus --> more freq words --> less important (for ex: stop words)
# if a word appears more frequently within document and less in the corpus --> less freq or rare words --> more important & more weightage to such words
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

#Extracting features for train and test data
X_train_vector= tfidf_vector.fit_transform(X_train)
X_test_vector= tfidf_vector.transform(X_test)

print(X_train_vector.shape)  #4062 words/features are created
print(X_test_vector.shape)

(700, 6620)
(300, 6620)


In [82]:
classifier = LogisticRegression()
classifier.fit(X_train_vector,y_train)
predicted = classifier.predict(X_test_vector)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.8966666666666666
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.03125
