# Exploration of the given data

In [1]:
import sys
# Add the parent directory to the import path so the `src.*` imports used in later cells resolve.
# NOTE(review): "../" is resolved against the kernel's working directory — presumably the
# notebook is launched from its own folder, one level below the project root; confirm.
sys.path.append(r"../")

In [2]:
from src.data.data_reading import read_raw_data

# Load the raw dataset through the project helper and preview the first rows.
df = read_raw_data()
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


## Analyze the dataframe

### Check columns and their types

In [3]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 577777 entries, 0 to 577776
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   reference    577777 non-null  object 
 1   translation  577777 non-null  object 
 2   similarity   577777 non-null  float64
 3   lenght_diff  577777 non-null  float64
 4   ref_tox      577777 non-null  float64
 5   trn_tox      577777 non-null  float64
dtypes: float64(4), object(2)
memory usage: 30.9+ MB


### Check values of the columns

In [4]:
# Summary statistics (count, mean, std, quantiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,similarity,lenght_diff,ref_tox,trn_tox
count,577777.0,577777.0,577777.0,577777.0
mean,0.758469,0.157652,0.541372,0.43449
std,0.092695,0.108057,0.457571,0.458904
min,0.600001,0.0,3.3e-05,3.3e-05
25%,0.681105,0.066667,0.012171,0.000707
50%,0.754439,0.141791,0.806795,0.085133
75%,0.831244,0.238095,0.990469,0.973739
max,0.95,0.4,0.999724,0.99973


### Check for missing values

In [5]:
# Number of missing entries in each column.
df.isna().sum()

reference      0
translation    0
similarity     0
lenght_diff    0
ref_tox        0
trn_tox        0
dtype: int64

### Swap reference and translation (and their toxicity scores) when ref_tox < trn_tox, so that reference is always the more toxic text

In [6]:
# Rows in which the "translation" side is the more toxic one. The mask is computed once,
# before any assignment, so both swaps below operate on exactly the same rows.
swap_mask = df['ref_tox'] < df['trn_tox']

# Swap the text columns and the score columns separately. Pulling all four columns through a
# single array would mix strings and floats into one object array, which risks writing
# object-typed values back into the float toxicity columns.
df.loc[swap_mask, ['reference', 'translation']] = \
    df.loc[swap_mask, ['translation', 'reference']].to_numpy()
df.loc[swap_mask, ['ref_tox', 'trn_tox']] = \
    df.loc[swap_mask, ['trn_tox', 'ref_tox']].to_numpy()

In [7]:
# Preview the frame after the swap: ref_tox should now be >= trn_tox in every row shown.
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.785171,0.010309,0.981983,0.014195
1,you're becoming disgusting.,Now you're getting nasty.,0.749687,0.071429,0.999039,0.065473
2,"well, we can spare your life.","Well, we could spare your life, for one.",0.919051,0.268293,0.985068,0.213313
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.664333,0.309524,0.994215,0.053362
4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.181818,0.999348,0.009402


### Modify dataset for training

In [8]:
# Keep only the sentence pair, under the column names used for training.
# NOTE: this rebinds `df`; the similarity / length / toxicity columns are dropped from it.
column_map = {"reference": "toxic-en", "translation": "neutral-en"}
df = df.rename(columns=column_map).loc[:, list(column_map.values())]
df.head()

Unnamed: 0,toxic-en,neutral-en
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."
4,I have orders to kill her.,I've got orders to put her down.


### Add result to combined dataset (data/interim/combined.tsv)

In [9]:
from src.data.data_collector import add_to_combined

# Hand the toxic/neutral pairs to the project helper (flagged as the initial batch) and
# report the length of whatever it returns.
combined_pairs = add_to_combined(df, is_initial=True)
print('Total combined size:', len(combined_pairs))

Unnamed: 0,toxic-en,neutral-en
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."
4,I have orders to kill her.,I've got orders to put her down.
...,...,...
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...
577773,It'il suck the life out of you!,you'd be sucked out of your life!
577774,"I can't fuckin' take that, bruv.",I really can't take this.
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care."


In [10]:
# NOTE(review): unconditional failure — this halts "Run All" at this point. Presumably a
# deliberate stop separating the executed data-preparation cells above from the unexecuted
# experiments below; confirm, and remove it once the cells below are meant to run.
assert False

AssertionError: 

In [None]:
import pandas as pd

# `df` was reduced to the toxic-en / neutral-en columns earlier in the notebook, so the
# original column names and toxicity scores are no longer on it. Reload the raw data
# instead of relying on kernel state left over from an earlier cell.
raw_df = read_raw_data()

# Stack both sides of every pair into one long (text, toxicity) table.
part1 = raw_df[['reference', 'ref_tox']].rename(columns={'reference': 'text', 'ref_tox': 'toxicity'})
part2 = raw_df[['translation', 'trn_tox']].rename(columns={'translation': 'text', 'trn_tox': 'toxicity'})
# ignore_index avoids every row label appearing twice in the stacked frame.
combined = pd.concat([part1, part2], ignore_index=True)

In [None]:
# Inspect the stacked (text, toxicity) frame.
combined

In [None]:
import re

In [None]:
import nltk

# Resources needed by the preprocessing below:
#   - 'stopwords': the English stop-word list used for filtering tokens;
#   - 'punkt' / 'punkt_tab': the Punkt models that word_tokenize loads for sentence
#     splitting (newer NLTK releases look for 'punkt_tab', older ones for 'punkt').
# nltk.download reports a failure for a package it does not know instead of raising,
# so requesting both names is safe across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm.notebook import tqdm
import numpy as np
from joblib import Parallel, delayed

# Shared, module-level helpers used by preprocess_text.
stemmer = PorterStemmer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

# Pre-compiled patterns used by preprocess_text.
# Runs of digits.
regex_digit = re.compile(r'\d+')
# Runs of anything that is not a lowercase ASCII letter or whitespace. Inside a character
# class `|` is a literal pipe, not alternation, so it must not be listed here — otherwise
# pipe characters would survive the cleaning step.
regex_non_alpha = re.compile(r'[^a-z\s]+')
# Runs of whitespace (raw string, so `\s` reaches the regex engine without an
# invalid-escape warning).
regex_spaces = re.compile(r'\s+')

def preprocess_text(text):
    """Normalise one raw sentence into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the input; delete digit runs (``regex_digit``);
    delete the characters matched by ``regex_non_alpha``; collapse whitespace
    runs to single spaces and trim the ends; tokenize with ``word_tokenize``;
    drop tokens found in ``stop_words``; stem the remaining tokens with
    ``stemmer``.

    Args:
        text: The sentence to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    cleaned = regex_spaces.sub(
        ' ',
        regex_non_alpha.sub('', regex_digit.sub('', text.lower())),
    ).strip()

    stems = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

def preprocess_text_parallel(df: pd.DataFrame, n_jobs=-1):
    """Apply ``preprocess_text`` to every row of ``df['text']`` in parallel.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Frame with a ``text`` column of strings.
        n_jobs: Worker count forwarded to ``joblib.Parallel`` (``-1`` is
            joblib's "use all CPUs" setting).

    Returns:
        A copy of ``df`` whose ``text`` column holds the preprocessed strings,
        without the rows whose preprocessed text is empty.
    """
    text_data = df['text'].to_numpy()

    with Parallel(n_jobs=n_jobs, backend='loky') as parallel:
        result = parallel(delayed(preprocess_text)(text) for text in tqdm(text_data, desc="Processing"))

    # assign() builds a new frame, so the caller's DataFrame keeps its raw text
    # instead of being overwritten as a side effect of this call.
    processed = df.assign(text=result)
    return processed[processed['text'].str.len() > 0]

In [None]:
# Peek at the text column as a NumPy array — the same form preprocess_text_parallel iterates over.
text_data = combined['text'].to_numpy()
text_data

In [None]:
# Preprocess every sentence of the stacked frame; rows whose text ends up empty are dropped.
# NOTE: this rebinds `df` — from here on it is the (text, toxicity) frame used by the baselines.
df = preprocess_text_parallel(combined)

In [None]:
# Inspect the preprocessed (text, toxicity) frame.
df

In [None]:
# baseline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = df['text'].to_numpy()
y = df['toxicity'].to_numpy()

# Fixed seed so the split — and therefore every metric below — is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words features restricted to the 10,000 most frequent terms; the vocabulary is
# learned on the training split only so nothing leaks from the test split.
vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# The toxicity score is continuous; binarise it at 0.5 to get classification targets.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train > 0.5)

y_pred = clf.predict(X_test)
print(classification_report(y_test > 0.5, y_pred))

# Sanity check on an arbitrary sentence. It goes through the same preprocessing as the
# training text, because the vectorizer's vocabulary consists of stemmed, stop-word-free tokens.
print(clf.predict_proba(vectorizer.transform([preprocess_text('This is a test')])))

In [None]:
y_pred_proba = clf.predict_proba(X_test)
pred = y_pred_proba[:, 1]  # probability of the positive (toxic) class

# Mean binary cross entropy of the predicted probabilities against the continuous
# toxicity scores. Probabilities are clipped away from 0 and 1 so that neither log term
# can evaluate to -inf; `pred` itself is left unclipped for later cells.
eps = 1e-8
pred_safe = np.clip(pred, eps, 1 - eps)
-np.mean(y_test * np.log(pred_safe) + (1 - y_test) * np.log(1 - pred_safe))

In [None]:
# Sum of the predicted probabilities that exceed 0.9999.
# NOTE(review): this adds up the probability values themselves; if the intent was to count
# the near-certain predictions, `(pred > 0.9999).sum()` is what is needed — confirm.
np.sum(pred[pred > 0.9999])

In [None]:
# baseline with tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = df['text'].to_numpy()
y = df['toxicity'].to_numpy()

# Fixed seed so the split — and therefore every metric below — is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features over the full vocabulary, learned on the training split only so nothing
# leaks from the test split.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# The toxicity score is continuous; binarise it at 0.5 to get classification targets.
clf = LogisticRegression()
clf.fit(X_train, y_train > 0.5)

y_pred = clf.predict(X_test)
print(classification_report(y_test > 0.5, y_pred))

In [None]:
y_pred_proba = clf.predict_proba(X_test)
pred = y_pred_proba[:, 1]  # probability of the positive (toxic) class

# Mean binary cross entropy of the predicted probabilities against the continuous
# toxicity scores. Probabilities are clipped away from 0 and 1 so that a saturated
# prediction cannot turn a log term into -inf (or the whole result into nan).
eps = 1e-8
pred_safe = np.clip(pred, eps, 1 - eps)
-np.mean(y_test * np.log(pred_safe) + (1 - y_test) * np.log(1 - pred_safe))