<a href="https://colab.research.google.com/github/panditamey1/nlp_refresher/blob/main/nlp_refresher_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn import metrics

import string
import spacy
np.random.seed(42)

In [5]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/jigsaw-toxic-comment-train.csv")

In [6]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [25]:
data = data[:5000]

In [26]:
nlp = spacy.load("en_core_web_sm")


In [27]:
stop_words = nlp.Defaults.stop_words

In [28]:
print(stop_words)

{'nor', 'by', 'whether', 'anyway', 'your', 'together', 'elsewhere', 'rather', 'bottom', 'seem', 'using', 'very', 'all', '‘ve', 'yourself', 'has', 'four', 'been', 'nevertheless', 'everyone', 'might', 'what', 'still', "'ll", 'must', 'perhaps', 'per', 'other', 'are', 'us', 'hereupon', 'themselves', 'moreover', 'also', 'whenever', 'fifteen', 'few', 'same', 'part', 'becoming', 'she', 'quite', '‘m', 'nothing', 'put', 'as', 'next', 'nine', 'please', 'for', 'seeming', 'whatever', 'full', 'behind', 'get', 'unless', 'ca', 'somewhere', 'anyone', 'whoever', 'this', '‘ll', 'is', 'two', 'these', 'last', 'there', 'regarding', 'could', 'here', 'our', 'whereby', 'whereafter', 'why', 'against', 'ourselves', 'fifty', 'had', 'became', '‘re', 'give', 'one', 'nowhere', 'third', 'whither', 'first', 'else', 'neither', 'someone', 'go', 'i', '‘d', 'seems', 'whence', 'after', 'yet', 'already', 'either', 'yours', 'now', 'at', 'much', 'them', 'so', 'serious', 'cannot', "'ve", 'its', 'too', 'otherwise', 'when', 'si

In [29]:
punctuation = string.punctuation

In [30]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [31]:
# Tokenizer function
#s = "Harry is a ,triwizard champion."
#doc = nlp(s)
#my_words = [word.lemma_.lower().strip() for word in doc]
#print(my_words)
#final_words = [word for word in my_words if word not in stop_words and word not in punctuation]
#print(final_words)

def spacy_tokenizer(sentence):
  doc = nlp(sentence)
  #lemma_ = root word 
  my_tokens = [word.lemma_.lower().strip() for word in doc]

  final_words = [word for word in my_tokens if word not in stop_words and word not in punctuation]

  return final_words


In [32]:
s = "Harry is a ,triwizard champion."
spacy_tokenizer(s)

['harry', 'triwizard', 'champion']

In [33]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer)


In [34]:
count_vector.fit_transform(["I can be harry Potter.; YOu know", "My passport works better in Canada"])

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [35]:
count_vector.fit_transform(["I can be harry Potter.; YOu know", "My passport works better in Canada"]).toarray()

array([[0, 1, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 1]])

In [36]:
count_vector.get_feature_names_out()

array(['canada', 'harry', 'know', 'passport', 'potter', 'work'],
      dtype=object)

In [37]:
count_vector.vocabulary_

{'harry': 1, 'potter': 4, 'know': 2, 'passport': 3, 'work': 5, 'canada': 0}

### Split data

In [38]:
from sklearn.model_selection import train_test_split

x = data["comment_text"]
y = data["toxic"]



In [39]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, stratify = y)

In [40]:
x_train.head()

553     iPad talk page \n\nWhy did you remove that IP'...
2483                                Bold text Italic text
1796    No big deal , essentially you have re-added a ...
4952    (posted on the discussion page at 00:31 UTC on...
1030    Proposed approach \n\nThis section is intended...
Name: comment_text, dtype: object

In [41]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()


In [42]:
x_train_vectors = count_vector.fit_transform(x_train)
x_test_vectors = count_vector.transform(x_test)

In [43]:
x_train_vectors.shape

(4000, 18933)

In [44]:
x_test_vectors.shape


(1000, 18933)

In [45]:
classifier.fit(x_train_vectors, y_train)

LogisticRegression()

In [46]:
predicted = classifier.predict(x_test_vectors)


In [47]:
print("accuracy : ", metrics.accuracy_score(y_test,predicted) )
print("precision : ", metrics.precision_score(y_test,predicted))
print("recall : ", metrics.recall_score(y_test,predicted))

accuracy :  0.931
precision :  0.8478260869565217
recall :  0.38613861386138615


### tf idf

In [48]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)





In [49]:
xtrain_tfidf_vectors = tfidf_vector.fit_transform(x_train)
xtest_tfidf_vectors = tfidf_vector.transform(x_test)

In [53]:
xtrain_tfidf_vectors.shape
y_train.shape

(4000,)

In [51]:
tfidf_classifier = LogisticRegression()

tfidf_classifier.fit(xtrain_tfidf_vectors,y_train)

predicted_tfidf = tfidf_classifier.predict(xtest_tfidf_vectors)


In [52]:
print("tfidf accuracy  : ", metrics.accuracy_score(y_test, predicted_tfidf))
print("tfidf precision  : ", metrics.precision_score(y_test, predicted_tfidf))
print("tfidf recall  : ", metrics.recall_score(y_test, predicted_tfidf))


tfidf accuracy  :  0.916
tfidf precision  :  1.0
tfidf recall  :  0.16831683168316833
