In [2]:
import os
from pathlib import Path

import pandas as pd

In [3]:
INPUT_COL = 'Processed comment_text'
OUTPUT_COL = 'toxic'

CWD = Path(os.getcwd()).parent.parent.parent
DATA_DIR = Path('data/preprocessed/jigsaw_toxic_comments')

In [4]:
DATA_DIR = CWD / DATA_DIR

train_data = pd.read_csv(DATA_DIR / 'train.csv')
validation_data = pd.read_csv(DATA_DIR / 'test.csv')

In [5]:
# Count of NA values in Processed comment
train_data[train_data.isnull().any(axis=1)].head()

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
2098,05ac7a7a83e4c63a,"No, it doesn´t.80.228.65.162",,0,0,0,0,0,0
2407,067638a445ccd93b,"Here, here and here.",,0,0,0,0,0,0
3990,0aa6f3529219b37e,From here\n\nFrom here 160.80.2.8,,0,0,0,0,0,0
4482,0bed2196c873636d,1993\n\n1994\n\n1995\n\n1996\n\n1997\n\n1998\n...,,0,0,0,0,0,0
6063,10304d5db5fb5821,n i g g e r f a g g o t,,1,1,0,0,0,1


In [6]:
import re
import string

def clean_text(text):

    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

validation_data = validation_data[validation_data.toxic != -1]

train_data['Processed comment_text'] = train_data['comment_text'].apply(clean_text)
validation_data['Processed comment_text'] = validation_data['comment_text'].apply(clean_text)

In [7]:
# train_data = train_data.dropna(subset=[INPUT_COL])
train_data.head()

Unnamed: 0,id,comment_text,Processed comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,ExplanationWhy the edits made under my usernam...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",MoreI cant make any real suggestions on improv...,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",You sir are my hero Any chance you remember wh...,0,0,0,0,0,0


# Target Feature Engineering

In [8]:
def test_train(test_size=0.2):
    from sklearn.model_selection import train_test_split

    return train_test_split(
        train_data[INPUT_COL].values,
        train_data[OUTPUT_COL].values,
        test_size=test_size
    )

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [15]:
train_data[OUTPUT_COL].values

array([0, 0, 0, ..., 0, 0, 0])

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, accuracy_score, auc, f1_score

# X_train, X_test, y_train, y_test = test_train()
tfidf_vectorizer = TfidfVectorizer() 
X_train = tfidf_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = tfidf_vectorizer.transform(validation_data[INPUT_COL])

scores = {
    'train_accuracy': [],
    'test_accuracy': [],
    'f1_score': [],
    'auc_score': [],
}

for col in train_data.columns[3:]:
    OUTPUT_COL = col
    lr = LogisticRegression(max_iter=200)
    lr.fit(X_train, train_data[OUTPUT_COL].values)
    
    yt_pred = lr.predict(X_train).reshape(-1, 1)
    y_pred = lr.predict(X_test).reshape(-1, 1)
    y_pred_proba = lr.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(validation_data[OUTPUT_COL].values, y_pred_proba)
    
    scores['train_accuracy'] += accuracy_score(yt_pred, train_data[OUTPUT_COL].values),
    scores['test_accuracy'] += accuracy_score(y_pred, validation_data[OUTPUT_COL].values),
    scores['auc_score'] += auc(fpr, tpr),
    scores['f1_score'] += f1_score(validation_data[OUTPUT_COL].values, y_pred),

In [47]:
train_data.columns[3:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [52]:
scores

{'train_accuracy': [0.9610894210100833,
  0.99106353911425,
  0.978717937469841,
  0.997355409190893,
  0.973077814891177,
  0.9923231664901517],
 'test_accuracy': [0.9375566601019101,
  0.9933883522460846,
  0.966941761230423,
  0.9969520772765638,
  0.963362405827003,
  0.9901841257932414],
 'f1_score': [0.673370942686616,
  0.3380281690140845,
  0.6759613911444768,
  0.3344709897610922,
  0.5927727588603197,
  0.3319148936170213],
 'auc_score': [0.9572003261379656,
  0.9760217255451296,
  0.971577058633521,
  0.9862120960662697,
  0.9642095192183587,
  0.9753675248291768]}

In [77]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train, X_test, y_train, y_test = test_train()
tfidf_vectorizer = TfidfVectorizer() 
X_train = tfidf_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = tfidf_vectorizer.transform(validation_data[INPUT_COL])

dt = DecisionTreeClassifier()
dt = dt.fit(X_train, train_data[OUTPUT_COL])
dt.score(X_test, validation_data[OUTPUT_COL])
roc_auc_score(validation_data[OUTPUT_COL], lr.predict_proba(X_test)[:,1])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = test_train()
tfidf_vectorizer = TfidfVectorizer() 
X_train = tfidf_vectorizer.fit_transform(train_data[INPUT_COL])
X_test = tfidf_vectorizer.transform(validation_data[INPUT_COL])

knn = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1
)

knn = knn.fit(X_train, train_data[OUTPUT_COL])
knn.score(X_test, validation_data[OUTPUT_COL])
roc_auc_score(validation_data[OUTPUT_COL], lr.predict_proba(X_test)[:,1])

NameError: name 'test_train' is not defined