In [36]:
# Data source (Kaggle, Jigsaw Toxic Comment Classification Challenge):
# https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

#==================================================
# List of all packages used
#==================================================
import pandas as pd # data grooming
import numpy as np 
import matplotlib.mlab as mlab # viz
import matplotlib.pyplot as plt # viz

from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf

from sklearn.linear_model import LogisticRegression # regression model
from sklearn.metrics import roc_auc_score # confirm regression
from sklearn.model_selection import cross_val_score # cross-validated scoring (used in the SGD section)

from sklearn.svm import SVC #SVM
# Public import path; the private module sklearn.linear_model.stochastic_gradient
# is not available in newer scikit-learn releases.
from sklearn.linear_model import SGDClassifier #confirm SVM

import warnings
warnings.filterwarnings('ignore') # these hide any update warnings

In [3]:
#==================================================
# Data Prep, Exploration, and Visualization
#==================================================
# Read the training set from disk, then preview its first five rows to
# confirm that it loaded correctly.
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [23]:
# Read the test comments and the accompanying test-labels file up front so
# later cells can use them.
test, test_labels = (
    pd.read_csv(csv_name) for csv_name in ('test.csv', 'test_labels.csv')
)

In [4]:
# Review summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training data.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Confirm the 'toxic' label is binary by listing its distinct values.
train['toxic'].unique()

array([0, 1])

In [6]:
# List the distinct values of the 'threat' label to confirm it is binary too.
train['threat'].unique()

array([0, 1])

In [22]:
# Investigate the distribution of comment types.
# Each label column is a 0/1 flag, so its column sum is the number of comments
# carrying that label. All six columns are summed in one vectorised call
# instead of six copy-pasted statements; the Total_* names are kept because
# the following cells display them one by one.
label_totals = train[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum()

# Unpack in the same order as the column list above.
(Total_Toxic, Total_SevereToxic, Total_Obscene,
 Total_Threat, Total_Insult, Total_IdentityHate) = label_totals

In [16]:
# Show the total for the 'toxic' label column.
Total_Toxic

15294

In [17]:
# Show the total for the 'severe_toxic' label column.
Total_SevereToxic

1595

In [18]:
# Show the total for the 'obscene' label column.
Total_Obscene

8449

In [19]:
# Show the total for the 'threat' label column.
Total_Threat

478

In [20]:
# Show the total for the 'insult' label column.
Total_Insult

7877

In [21]:
# Show the total for the 'identity_hate' label column.
Total_IdentityHate

1405

In [26]:
# ==========================================
# Logistic Regression
# ===========================================
# Stack the train and test comment text (train rows first) into one Series
# for the regression model; missing comments become the literal "unknown".
df = pd.concat([train['comment_text'], test['comment_text']], axis=0)
df = df.fillna("unknown")

# Confirm the data was concatenated correctly: no rows lost or added.
# (A bare `df.head()` here is never displayed because it is not the cell's
# last expression, so an explicit assertion is used instead.)
assert len(df) == len(train) + len(test), "concatenated length mismatch"

# Summary of the combined text: row count, number of unique comments, and
# the most frequent comment.
df.describe()

count                                                312735
unique                                               312735
top       "\n\n Requested move \n\nSuggestion 1: Creatio...
freq                                                      1
Name: comment_text, dtype: object

In [25]:
# Number of rows in the training frame, kept so later cells can use it as a
# row offset.
nrow_train = len(train)
nrow_train

159571

In [31]:
# Build the TF-IDF matrix over the combined comment text: English stop words
# are dropped and the vocabulary is capped at 50,000 terms.
vectorizer = TfidfVectorizer(
    max_features=50000,
    stop_words='english',
)
X = vectorizer.fit_transform(df)

In [32]:
# Names of the six label columns, in the order used for the prediction matrix.
col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [33]:
# Pre-allocate the prediction matrix: one row per test comment and one column
# per label, initialised to zero and filled in by the model-fitting loop.
n_test_rows = len(test)
preds = np.zeros(shape=(n_test_rows, len(col)))

In [34]:
# Fit one independent logistic-regression model per label, store the
# test-set probabilities in `preds`, and record a ROC AUC for each label.
# NOTE: the AUC is computed on the same rows the model was fitted on, so it
# is an optimistic in-sample figure rather than a validation score.
loss = []  # per-label training ROC AUC (these are scores, not losses; name kept as-is)

# Slice the combined TF-IDF matrix once instead of three times per label:
# the first nrow_train rows are training comments, the rest are test comments.
X_train, X_test = X[:nrow_train], X[nrow_train:]

for idx, label in enumerate(col):
    print('===Fit '+label)
    model = LogisticRegression()
    model.fit(X_train, train[label])
    # Column 1 of predict_proba holds the probability of the positive class.
    preds[:,idx] = model.predict_proba(X_test)[:,1]

    pred_train = model.predict_proba(X_train)[:,1]
    # Compute the score once and reuse it for both the printout and the list.
    train_auc = roc_auc_score(train[label], pred_train)
    print('ROC AUC:', train_auc)
    loss.append(train_auc)

print('mean column-wise ROC AUC:', np.mean(loss))

===Fit toxic
ROC AUC: 0.9840021886922526
===Fit severe_toxic
ROC AUC: 0.9922863010182253
===Fit obscene
ROC AUC: 0.9930138882616191
===Fit threat
ROC AUC: 0.9952870254805043
===Fit insult
ROC AUC: 0.9873523688474103
===Fit identity_hate
ROC AUC: 0.9900204312573443
mean column-wise ROC AUC: 0.9903270339262259


In [37]:
# ===========================================
# SVM
# ===========================================
# NOTE: the model below is a linear classifier trained with stochastic
# gradient descent using the modified Huber loss, not a hinge-loss SVM.
# scikit-learn only exposes predict_proba on SGDClassifier for the log and
# modified Huber losses, and predict_proba is needed for the submission.

# Fill missing comments the same way as the logistic-regression section so a
# NaN can never reach the vectorizer (which rejects NaN documents).
train_comments = train['comment_text'].fillna("unknown")
test_comments = test['comment_text'].fillna("unknown")

all_comments = pd.concat([train_comments, test_comments]) #df


labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] #col

# Fit the shared TF-IDF vectorizer on train + test text; fit() returns the
# vectorizer itself, so `tfidf` is just another name for `vectorizer`.
tfidf = vectorizer.fit(all_comments)

test_comment_features = vectorizer.transform(test_comments)

train_comment_features = vectorizer.transform(train_comments)

scores = []  # mean 3-fold cross-validated ROC AUC, one entry per label
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in labels:
    train_target = train[class_name]
    # tol=None disables early stopping, so training always runs max_iter
    # epochs; random_state pins the shuffling for reproducible results.
    classifier = SGDClassifier(loss='modified_huber', penalty='l2', alpha=0.01, random_state=42, max_iter=20, tol=None)

    # Out-of-fold estimate of ranking quality for this label.
    cv_score = np.mean(cross_val_score(classifier, train_comment_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set, then store the positive-class
    # probability for every test comment.
    classifier.fit(train_comment_features, train_target)
    submission[class_name] = classifier.predict_proba(test_comment_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9445398608763157
CV score for class severe_toxic is 0.9829580286228278
CV score for class obscene is 0.9766593896806093
CV score for class threat is 0.976651259036732
CV score for class insult is 0.9653311270359103
CV score for class identity_hate is 0.9677941040399404
Total CV score is 0.9689889615487227
