<a href="https://colab.research.google.com/github/panditamey1/nlp_refresher/blob/main/nlp_refresher_5_text_classification_sentenceTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [8]:
np.random.seed(2022)


In [11]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/jigsaw-toxic-comment-train.csv")

In [10]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [16]:
import spacy
import string


nlp = spacy.load("en_core_web_sm")

punctuations = string.punctuation

In [17]:
stop_words = nlp.Defaults.stop_words

In [27]:
def spacy_tokenizer(sentence):
  my_tokens = nlp(sentence)
  my_tokens = [word.lemma_.strip().lower() for word in my_tokens]
  my_tokens = [word for word in my_tokens if word not in stop_words and word not in punctuations]
  sentence = " ".join(my_tokens)
  return sentence

In [28]:
spacy_tokenizer("I have a football, but like to play cricket")

'football like play cricket'

In [32]:
data = data[:5000]

In [33]:
data["tokenize"] = data['comment_text'].apply(spacy_tokenizer)

In [34]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenize
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit username hardcore metallica f...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww match background colour seemingly stuck ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man try edit war guy constantly remove rel...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,real suggestion improvement wonder section sta...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page


In [35]:
data['sentence_embedding'] = data['tokenize'].apply(model.encode)

In [39]:
type(data['sentence_embedding'])


pandas.core.series.Series

In [40]:
type(data['toxic'])

pandas.core.series.Series

In [43]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokenize,sentence_embedding
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit username hardcore metallica f...,"[-0.030692955, 0.041847356, 0.04161457, 0.0137..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww match background colour seemingly stuck ...,"[-0.054325636, 0.052362308, 0.048880484, -0.02..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man try edit war guy constantly remove rel...,"[-0.021047872, 0.026711842, 0.003510159, 0.035..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,real suggestion improvement wonder section sta...,"[-0.04789183, -0.03722783, 0.04027121, 0.05869..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page,"[-0.09303556, 0.008844923, -0.07133498, -0.007..."


In [41]:
X = data['sentence_embedding'].to_list()
y = data['toxic'].to_list()

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
classifier = LogisticRegression()

classifier.fit(X_train, y_train)

preds = classifier.predict(X_test)

In [50]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       899
           1       0.91      0.43      0.58       101

    accuracy                           0.94      1000
   macro avg       0.93      0.71      0.77      1000
weighted avg       0.94      0.94      0.93      1000



In [51]:
print("Logistic Regression Accuracy:",accuracy_score(y_test, preds))
print("Logistic Regression Precision:",precision_score(y_test, preds))
print("Logistic Regression Recall:",recall_score(y_test, preds))

Logistic Regression Accuracy: 0.938
Logistic Regression Precision: 0.9148936170212766
Logistic Regression Recall: 0.42574257425742573
