In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Column with the preprocessed comment text; this is what the vectorizers consume.
INPUT_COL = 'Processed comment_text'

# Label columns of the dataset, in the order used to build the combined target.
OUTPUT_COL = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Two levels above the kernel's working directory (despite the name, this is
# not the working directory itself). The data path below is relative to it.
CWD = Path.cwd().parent.parent
DATA_DIR = Path('data') / 'preprocessed' / 'jigsaw_toxic_comments'

In [3]:
# Anchor the relative data directory at CWD before touching the filesystem.
DATA_DIR = CWD.joinpath(DATA_DIR)

# 'train.csv' becomes the training frame; 'test.csv' is used as the
# validation frame throughout the notebook.
train_data, validation_data = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Preview (first 5) training rows that have a missing value in at least one column
train_data.loc[train_data.isna().any(axis=1)].head()

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
2098,05ac7a7a83e4c63a,"No, it doesn´t.80.228.65.162",,0,0,0,0,0,0
2407,067638a445ccd93b,"Here, here and here.",,0,0,0,0,0,0
3990,0aa6f3529219b37e,From here\n\nFrom here 160.80.2.8,,0,0,0,0,0,0
4482,0bed2196c873636d,1993\n\n1994\n\n1995\n\n1996\n\n1997\n\n1998\n...,,0,0,0,0,0,0
6063,10304d5db5fb5821,n i g g e r f a g g o t,,1,1,0,0,0,1


In [5]:
# Drop rows whose preprocessed text is missing. The filter is keyed on
# INPUT_COL (rather than a repeated string literal) so it always targets the
# same column that is later fed to the vectorizers.
train_data = train_data.dropna(subset=[INPUT_COL])
train_data.head()

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,daww match background colour seemingli stuck t...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",hey man realli tri edit war guy constantli rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",cant make real suggest improv wonder section s...,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",sir hero chanc rememb page that,0,0,0,0,0,0


# Target Feature Engineering

In [6]:
# Name of the single-column target derived from the label columns.
TARGET_COL = 'target_1'

# Collapse each row's labels into one integer class id: the label values are
# written side by side as digits (in OUTPUT_COL order) and the resulting
# string is parsed as an int, e.g. labels 1,1,0,0,0,1 -> '110001' -> 110001.
train_data[TARGET_COL] = train_data[OUTPUT_COL].apply(
    lambda label_row: ''.join(label_row.astype(str)),
    axis=1,
).map(int)

In [7]:
# Show the first five rows, now including the derived target column.
train_data.head(n=5)

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,target_1
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,daww match background colour seemingli stuck t...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",hey man realli tri edit war guy constantli rem...,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",cant make real suggest improv wonder section s...,0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",sir hero chanc rememb page that,0,0,0,0,0,0,0


In [8]:
# Keep only rows that carry usable labels. -1 is treated as a sentinel
# (presumably "not scored" -- confirm against the dataset description).
# Every label column is checked, not just 'toxic': a stray -1 in any other
# column would otherwise survive and break the digit-string target encoding.
is_labeled = (validation_data[OUTPUT_COL] != -1).all(axis=1)

# Also drop rows whose preprocessed text is missing; keyed on INPUT_COL so the
# filter matches the column that is fed to the vectorizers.
validation_data = validation_data[is_labeled].dropna(subset=[INPUT_COL])
validation_data.head()

Unnamed: 0,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,Thank you for understanding. I think very high...,thank understand think highli would revert wit...,0,0,0,0,0,0
7,:Dear god this site is horrible.,dear god site horribl,0,0,0,0,0,0
11,"""::: Somebody will invariably try to add Relig...",somebodi invari tri add religion realli mean w...,0,0,0,0,0,0
13,""" \n\n It says it right there that it IS a typ...",say right type type institut need case three l...,0,0,0,0,0,0
14,""" \n\n == Before adding a new product to the l...",ad new product list make sure relev ad new pro...,0,0,0,0,0,0


In [9]:
# Same encoding as for the training frame: write each row's label values side
# by side as digits (in OUTPUT_COL order) and parse the string as an int,
# e.g. labels 1,1,0,0,0,1 -> '110001' -> 110001.
validation_data[TARGET_COL] = validation_data[OUTPUT_COL].apply(
    lambda label_row: ''.join(label_row.astype(str)),
    axis=1,
).map(int)
validation_data.head()

Unnamed: 0,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,target_1
5,Thank you for understanding. I think very high...,thank understand think highli would revert wit...,0,0,0,0,0,0,0
7,:Dear god this site is horrible.,dear god site horribl,0,0,0,0,0,0,0
11,"""::: Somebody will invariably try to add Relig...",somebodi invari tri add religion realli mean w...,0,0,0,0,0,0,0
13,""" \n\n It says it right there that it IS a typ...",say right type type institut need case three l...,0,0,0,0,0,0,0
14,""" \n\n == Before adding a new product to the l...",ad new product list make sure relev ad new pro...,0,0,0,0,0,0,0


In [10]:
def test_train(test_size=0.2, random_state=42):
    """Split the training frame into train and hold-out arrays.

    Reads the module-level ``train_data`` frame and splits its text column
    (``INPUT_COL``) and its label columns (``OUTPUT_COL``) with
    ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. A fixed default makes the
        split (and therefore any score computed on it) reproducible across
        kernel restarts; pass ``None`` to get a different split on each call.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` where the ``X_*`` entries are
        arrays of text values and the ``y_*`` entries are arrays with one
        column per entry of ``OUTPUT_COL``.
    """
    # Imported lazily so the split dependency is only needed when used.
    from sklearn.model_selection import train_test_split

    return train_test_split(
        train_data[INPUT_COL].values,
        train_data[OUTPUT_COL].values,
        test_size=test_size,
        random_state=random_state,
    )

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: vocabulary and IDF weights are learned on the training
# text only, then reused to project the validation text.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = tfidf_vectorizer.transform(validation_data[INPUT_COL])

# Multinomial logistic regression on the combined single-column target.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm
# the installed version before removing the argument.
lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, train_data[TARGET_COL])

# Mean accuracy on the validation frame.
lr.score(X_test, validation_data[TARGET_COL])

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: vocabulary is learned on the training text only, then
# reused for the validation text. (No random train/test split is made here:
# the model is fit on all of train_data and scored on validation_data, so a
# test_train() call would only produce values that are never used.)
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = count_vectorizer.transform(validation_data[INPUT_COL])

# Candidate smoothing values. Not consumed in this cell -- presumably kept
# for a later hyper-parameter search (TODO confirm or remove).
alpha_ranges = {
    "alpha": [0.001, 0.01, 0.1, 1, 10.0, 100]
}

# Naive Bayes on the combined single-column target with smoothing alpha=1.
mnb = MultinomialNB(alpha=1)
mnb = mnb.fit(X_train, train_data[TARGET_COL])

# Mean accuracy on the validation frame.
mnb.score(X_test, validation_data[TARGET_COL])

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: fitted on the training text, applied to the validation text.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = tfidf_vectorizer.transform(validation_data[INPUT_COL])

# Unlike the cells that fit on TARGET_COL, the tree is trained directly on
# all label columns at once (multi-output target).
# NOTE(review): no random_state is set, so the score may vary between runs.
dt = DecisionTreeClassifier().fit(X_train, train_data[OUTPUT_COL])

# Score on the validation frame against the same label columns.
dt.score(X_test, validation_data[OUTPUT_COL])

0.8547655068078669

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# This cell evaluates on a random hold-out carved out of train_data by
# test_train(), not on validation_data, so its score is not directly
# comparable with the cells that score against validation_data.
train_text, test_text, y_train, y_test = test_train()

# TF-IDF features: fitted on the training part of the split only.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_text)
X_test = tfidf_vectorizer.transform(test_text)

# k is set to the number of label columns.
# NOTE(review): that coupling looks incidental -- confirm the intended k.
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=len(OUTPUT_COL))
knn = knn.fit(X_train, y_train)

# Score on the hold-out part of the split (all label columns at once).
knn.score(X_test, y_test)

0.9043453724604966