Install & import necessary packages

In [1]:
!pip install -U sentence-transformers -q

[K     |████████████████████████████████| 85 kB 2.6 MB/s 
[K     |████████████████████████████████| 4.9 MB 34.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 53.6 MB/s 
[K     |████████████████████████████████| 120 kB 45.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 64.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
np.random.seed(2022)

In [3]:
from sentence_transformers import SentenceTransformer

# There are various models available under SentenceTransformer, let us pick 'all-MiniLM-L6-v2' for our example & download the same
model = SentenceTransformer('all-MiniLM-L6-v2')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [15]:
data = pd.read_csv("train.csv", error_bad_lines=False, usecols = ['comment_text', 'toxic'],
                   engine='python')
data = data[:1000]  #pick only 1000 rows
print(len(data))
print((data.shape))

1000
(1000, 2)


Skipping line 72531: unexpected end of data


In [16]:
data.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [17]:
import spacy
import string
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words  # download english stop words
print(stop_words)

{'hereby', 'mostly', 'used', '‘ll', 'first', 'their', 'already', 'him', 'nine', 'under', 'between', 'during', '‘s', 'six', 'around', 'via', '’ll', 'whither', 'or', 'hundred', 'was', 'n‘t', 'i', 'side', 'have', 'himself', 'how', 'nowhere', 'also', 'even', '‘m', 'yours', 'to', 'no', 'amongst', 'per', 'me', 'throughout', 'full', 'rather', 'back', 'anyone', 'still', "'s", 'indeed', 'might', 'before', 'say', 'until', 'whom', 'from', 'it', 'her', 'is', '’m', 'bottom', "'re", 'my', 'four', 'whether', 'others', 'an', 'go', 'could', 'our', 'formerly', 'may', 'nobody', 'yet', 'less', 'own', 'whereby', 'am', 'again', 'noone', 'why', 'a', 'she', 'become', 'herself', 'had', 'however', "'m", 'put', 'almost', 'keep', 'any', 'he', 'because', 'mine', 'beside', 'up', 'onto', 'over', 'two', 'the', 'although', 'whole', 'more', 'somehow', 'then', 'his', 'next', 'so', 'while', 'such', 'must', 'part', 'something', 'alone', 'every', 'several', 'really', 'anyhow', 'wherein', 'please', 'whenever', 'we', 'of', '

In [18]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Function to tokenize sentences

In [19]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]


    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    sentence = " ".join(mytokens)
    # return preprocessed list of tokens
    return sentence

In [20]:
data['tokenize'] = data['comment_text'].apply(spacy_tokenizer)

In [21]:
data['embeddings'] = data['tokenize'].apply(model.encode) #apply sentence transformer model to create single embedding for whole sentence

In [22]:
data.head(5)

Unnamed: 0,comment_text,toxic,tokenize,embeddings
0,Explanation\nWhy the edits made under my usern...,0,explanation edit username hardcore metallica f...,"[-0.030692955, 0.041847356, 0.04161457, 0.0137..."
1,D'aww! He matches this background colour I'm s...,0,d'aww match background colour seemingly stuck ...,"[-0.054325636, 0.052362308, 0.048880484, -0.02..."
2,"Hey man, I'm really not trying to edit war. It...",0,hey man try edit war guy constantly remove rel...,"[-0.021047872, 0.026711842, 0.003510159, 0.035..."
3,"""\nMore\nI can't make any real suggestions on ...",0,real suggestion improvement wonder section sta...,"[-0.04789183, -0.03722783, 0.04027121, 0.05869..."
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chance remember page,"[-0.09303556, 0.008844923, -0.07133498, -0.007..."


* The column 'embeddings' contains sentence embeddings for each of the sentences
* These embeddings can be used to check similarity between sentences
* Thus, sentence transformers can be used to generate feature embeddings which can be used directly as features to classical models too

In [23]:
X = data['embeddings'].to_list()
y = data['toxic'].to_list()

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train,y_train)

LogisticRegression()

In [25]:
from sklearn import metrics
predicted = LR.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.92
Logistic Regression Precision: 0.8571428571428571
Logistic Regression Recall: 0.2857142857142857
