In [1]:
import pandas as pd
import numpy as np
np.random.seed(2022)

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange


In [3]:
data = pd.read_csv("D:\\Data\\NLP\\pradip\\toxic_commnets.csv",on_bad_lines='error',skiprows = range(1000, 159570), engine="python")
data.tail()

Unnamed: 0,comment_text,toxic
996,It is common knowledge that Karaims (but not K...,0
997,", 12 April 2006 (UTC)\nThen rewrite and expand...",0
998,"""I was trying to inject some humour (as eviden...",0
999,And it looks like it was actually you who put ...,0
1000,"""\nAnd ... I really don't think you understand...",0


In [4]:
import spacy
import string
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'another', 'enough', 'does', 'this', 'make', 'made', 'most', 'may', 'onto', 'down', 'anywhere', 'we', 'whereby', 'amongst', 'that', 'eleven', 'have', 'since', 'twenty', 'alone', 'latter', 'less', 'latterly', 'be', 'what', 'not', 'side', 'under', 'its', 'when', 'were', 'some', 'as', 'show', 'thus', 'many', 'besides', 'become', 'least', 'eight', 'last', 'five', 'two', 'sometime', 'n’t', 'elsewhere', 'used', 'do', 'nor', 'few', 'been', 'empty', '’ve', 'hereafter', 'using', 'too', 'more', 'own', 'namely', 'former', 'four', 'already', 'anyway', 'either', 'him', 'each', 'an', 'afterwards', 'ours', 'these', 'much', 'somehow', 'say', 're', 'something', 'perhaps', 'sixty', 'about', 'from', 'towards', 'myself', 'nine', 'i', 'both', 'whether', "n't", 'along', 'beforehand', 'hers', 'also', 'often', 'neither', 'anything', 'next', 'you', 'whoever', 'sometimes', 'it', 'any', 'at', 'among', 'over', 'top', 'together', 'full', 'with', 'across', 'now', 'so', 'yours', 'whose', 'my', 'a', 'hereupon', 'mor

In [5]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)



    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    sentence = " ".join(mytokens)
    # return preprocessed list of tokens
    return sentence

In [7]:
data['tokenize'] = data['comment_text'].apply(spacy_tokenizer)
data.head()

Unnamed: 0,comment_text,toxic,tokenize
0,Explanation\nWhy the edits made under my usern...,0,explanation edit username hardcore metallica f...
1,D'aww! He matches this background colour I'm s...,0,d'aww match background colour seemingly stick ...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man try edit war guy constantly remove rel...
3,"""\nMore\nI can't make any real suggestions on ...",0,real suggestion improvement wonder section sta...
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chance remember page


In [8]:
data['embeddings'] = data['tokenize'].apply(model.encode)
data.head()

Unnamed: 0,comment_text,toxic,tokenize,embeddings
0,Explanation\nWhy the edits made under my usern...,0,explanation edit username hardcore metallica f...,"[-0.030692976, 0.04184737, 0.041614547, 0.0137..."
1,D'aww! He matches this background colour I'm s...,0,d'aww match background colour seemingly stick ...,"[-0.053062137, 0.058449294, 0.04401034, -0.005..."
2,"Hey man, I'm really not trying to edit war. It...",0,hey man try edit war guy constantly remove rel...,"[-0.0210479, 0.026711857, 0.0035101357, 0.0359..."
3,"""\nMore\nI can't make any real suggestions on ...",0,real suggestion improvement wonder section sta...,"[-0.047509685, -0.03601594, 0.04249648, 0.0576..."
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chance remember page,"[-0.09303554, 0.008844826, -0.07133498, -0.007..."


In [9]:
X = data['embeddings'].to_list()
y = data['toxic'].to_list()

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [11]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train,y_train)     

In [12]:
from sklearn import metrics
predicted = LR.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9203980099502488
Logistic Regression Precision: 0.8571428571428571
Logistic Regression Recall: 0.2857142857142857
