In [1]:
import sys
# Put the parent directory on the module search path so that the `src`
# package imported below can be resolved from this notebook's location.
# This must run before the `from src...` import.
sys.path.append(r"../")

# Project-local text preprocessing helper, applied to the whole corpus later on.
from src.data.data_preprocessing import preprocess_text_parallel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Read the tab-separated parallel corpus; the first row holds the column names.
corpus_path = '../data/interim/combined.tsv'
df = pd.read_csv(corpus_path, sep='\t', header=0)
df.head()

Unnamed: 0,toxic-en,neutral-en
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."
4,I have orders to kill her.,I've got orders to put her down.


In [4]:
# Split the parallel corpus into two labelled frames that share a single
# `text` column: toxic sentences get label 1, their neutral counterparts 0.
#
# `rename` and `assign` each return a brand-new frame, so no column is ever
# set on a selection taken from `df`. The previous pattern
# (`sub = df[[col]]; sub['toxic'] = 1`) is what makes pandas emit
# SettingWithCopyWarning.
toxic_df = (
    df[['toxic-en']]
    .rename(columns={'toxic-en': 'text'})
    .assign(toxic=1)
)

neutral_df = (
    df[['neutral-en']]
    .rename(columns={'neutral-en': 'text'})
    .assign(toxic=0)
)

In [5]:
# Stack the toxic rows on top of the neutral rows and renumber the index
# from 0 so that row labels stay unique in the combined frame.
labelled_frames = [toxic_df, neutral_df]
df = pd.concat(labelled_frames, ignore_index=True)
df.head()

Unnamed: 0,text,toxic
0,"if Alkar floods her with her mental waste, it ...",1
1,you're becoming disgusting.,1
2,"well, we can spare your life.",1
3,"monkey, you have to wake up.",1
4,I have orders to kill her.,1


In [6]:
# Run every sentence through the project's preprocessing helper and write the
# result back into the `text` column.
# NOTE(review): judging by the rendered output, the helper lowercases, removes
# stopwords/punctuation and stems - confirm against src/data/data_preprocessing.
cleaned_text = preprocess_text_parallel(df['text'])
df['text'] = cleaned_text

Processing:   0%|          | 0/1262462 [00:00<?, ?it/s]

In [7]:
# Keep only the rows whose text is still non-empty after preprocessing.
has_text = df['text'].str.len() > 0
df = df[has_text]
df.head()

Unnamed: 0,text,toxic
0,alkar flood mental wast would explain high lev...,1
1,your becom disgust,1
2,well spare life,1
3,monkey wake,1
4,order kill,1


In [8]:
# Class balance check: number of rows per label (1 = toxic, 0 = neutral).
label_counts = df['toxic'].value_counts()
label_counts

toxic
1    631210
0    627771
Name: count, dtype: int64

In [43]:
# Features are the preprocessed sentences, targets are the 0/1 toxicity labels.
X = df['text'].to_numpy()
y = df['toxic'].to_numpy()

# Hold out 20% of the rows for evaluation, keeping the class ratio identical in
# both splits. The split is seeded so the reported metrics are reproducible
# across kernel restarts (previously only the classifier was seeded, so the
# train/test partition - and therefore every score - changed on each run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Bag-of-words counts capped at a 10,000-term vocabulary. The vocabulary is
# learnt from the training split only, then reused unchanged on the test split
# so that no information from held-out rows leaks into the features.
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# L2-regularised logistic regression fitted with the newton-cg solver.
clf = LogisticRegression(solver='newton-cg', verbose=1, random_state=42, penalty='l2')
clf.fit(X_train, y_train)

In [44]:
# Score the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.82      0.79    125555
           1       0.81      0.75      0.78    126242

    accuracy                           0.79    251797
   macro avg       0.79      0.79      0.79    251797
weighted avg       0.79      0.79      0.79    251797



### Running some tests

In [37]:
# Hand-written sentences used as a quick sanity check of the fitted model.
tests = [
    'Fuck you',
    'I will kill you',
    'You are stupid clown',
    'You are very cool',
    'It is impossible to complete this',
    'Are you ok?'
]

# The vectorizer was fitted on text that had already gone through
# `preprocess_text_parallel`, so raw sentences must take the same route before
# being vectorized; otherwise any word whose preprocessed form differs from its
# surface form is silently dropped as out-of-vocabulary.
# NOTE(review): assumes the helper accepts a small Series and yields one string
# per input, in order - it is called with a Series on the training data.
tests_clean = list(preprocess_text_parallel(pd.Series(tests)))

# Vectorize and classify all sentences in a single batch.
predictions = clf.predict(vectorizer.transform(tests_clean))

for case, label in zip(tests, predictions):
    print(case + ':', 'toxic' if label == 1 else 'neutral')

Fuck you: toxic
I will kill you: toxic
You are stupid clown: toxic
You are very cool: neutral
It is impossible to complete this: neutral
Are you ok?: neutral


### Calculating cross-entropy loss

In [45]:
# Mean binary cross-entropy (log loss) of the classifier on the held-out split.
y_pred_proba = clf.predict_proba(X_test)
pred = y_pred_proba[:, 1]  # predicted probability of the positive (toxic) class

# Clip probabilities away from exactly 0 and 1 so that BOTH log terms stay
# finite. Previously only log(1 - pred) was guarded with an epsilon, so a
# positive example with pred == 0 would have produced -inf / nan.
eps = 1e-10
pred_safe = np.clip(pred, eps, 1 - eps)
-np.mean(y_test * np.log(pred_safe) + (1 - y_test) * np.log(1 - pred_safe))

0.4381693579519746