# Toxic Comments

### Import and refine data

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
import os
import time

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [3]:
# Load the training set.
# The data folder is read from the TOXIC_DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the original
# local folder is used.
data_dir = os.environ.get('TOXIC_DATA_DIR', 'C:/Users/mtrig/Documents/data science/toxic comments/data')
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

In [4]:
# Load the test set.
# The data folder is read from the TOXIC_DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the original
# local folder is used.
data_dir = os.environ.get('TOXIC_DATA_DIR', 'C:/Users/mtrig/Documents/data science/toxic comments/data')
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [5]:
# Peek at five random training rows. The sample is seeded so the same rows are
# displayed every time the notebook is re-run from a fresh kernel.
train.sample(5, random_state=2018)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
19117,326f2b7604034d4d,". I am a stupid bitch for blocking this user, ...",1,0,1,0,1,0
149035,560a81db5d1dbf47,"""\nI would suggest that is is you that needs t...",0,0,0,0,0,0
100643,1aaa1e613d60f26e,"""==Notability of Nashaa Weiss==\nA tag has bee...",0,0,0,0,0,0
40766,6ccc0788d6023e0b,The deletion thing does not say you are unimpo...,0,0,0,0,0,0
40438,6be7ab1a9ba990b6,You honestly think it should be merged? Now I ...,0,0,0,0,0,0


In [6]:
# (rows, columns) of the test frame as loaded from the CSV.
test.shape

(153164, 2)

In [7]:
# (rows, columns) of the training frame as loaded from the CSV.
train.shape

(159571, 8)

In [8]:
# Replace missing comment text with a placeholder string.
# Only `comment_text` is filled: a frame-wide fillna("unknown") would also
# write the string into any missing label cell, turning a numeric target
# column into a mix of numbers and text.
train = train.assign(comment_text=train['comment_text'].fillna("unknown"))
test = test.assign(comment_text=test['comment_text'].fillna("unknown"))

### Set up X and Y arrays for test and train

In [9]:
# Raw comment text of the training rows (input to the vectorizer).
X_train = train['comment_text']

In [10]:
# Raw comment text of the test rows.
X_test = test['comment_text']

In [11]:
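# Disabled experiment (kept for reference): stack the train and test comments
# into one Series so a vectorizer could be fitted on both.
#frames = [X_train, X_test]
#all_X = pd.concat(frames)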

In [12]:
# Fit a TF-IDF vectorizer (default settings) on the training comments and build
# the training document-term matrix in the same pass.
vect = TfidfVectorizer()
X_train_dtm = vect.fit_transform(X_train)

In [13]:
#all_X_vec = vect.fit(all_X)

In [14]:
#X_dtm = vect.transform(X_train)

In [15]:
# Display the sparse-matrix summary (dimensions and stored element count) of
# the training document-term matrix.
X_train_dtm

<159571x189775 sparse matrix of type '<class 'numpy.float64'>'
	with 6949691 stored elements in Compressed Sparse Row format>

In [16]:
# Transform the test comments with the vectorizer already fitted on the
# training comments (no refit), so both matrices have the same columns.
X_test_dtm = vect.transform(X_test)

In [17]:
# Display the sparse-matrix summary (dimensions and stored element count) of
# the test document-term matrix.
X_test_dtm

<153164x189775 sparse matrix of type '<class 'numpy.float64'>'
	with 5738810 stored elements in Compressed Sparse Row format>

### Import, initialise and fit model

In [18]:
# Label columns to predict; one model is fitted per entry and the submission
# columns follow this order.
cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [19]:
# Label columns of the training frame, one column per entry in `cols`.
target_y = train.loc[:, cols]

In [20]:
# Logistic regression with the library's default hyperparameters. This single
# estimator object is refitted once per label by the loops in the cells below.
model = LogisticRegression()

In [22]:
# Hold out a validation set, fit the model once per label and report the
# train/validation log loss for each label plus the column-wise means.
#
# NOTE(review): `vect` was fitted on all training comments before this split,
# so the held-out rows contributed to the TF-IDF vocabulary and IDF weights;
# the validation losses may be slightly optimistic.
start_time = time.time()
# The training part of the split is bound to `X_train_split`, not `X_train`:
# `X_train` holds the raw comment text that the vectorizer cell reads, and
# rebinding it to a sparse matrix would make that cell fit on the wrong object
# if it were re-run.
X_train_split, X_valid, y_train, y_valid = train_test_split(
    X_train_dtm, target_y, test_size=0.33, random_state=2018)
train_loss = []
valid_loss = []
importance = []  # coefficient array of each fitted per-label model, in `cols` order
preds_train = np.zeros((X_train_split.shape[0], len(cols)))
preds_valid = np.zeros((X_valid.shape[0], len(cols)))
for i, j in enumerate(cols):
    print('Class:= '+j)
    model.fit(X_train_split, y_train[j])
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (label 1, given the 0/1 targets shown in the data peek).
    preds_valid[:,i] = model.predict_proba(X_valid)[:,1]
    preds_train[:,i] = model.predict_proba(X_train_split)[:,1]
    train_loss_class = log_loss(y_train[j], preds_train[:,i])
    valid_loss_class = log_loss(y_valid[j], preds_valid[:,i])
    print('Trainloss=log loss:', train_loss_class)
    print('Validloss=log loss:', valid_loss_class)
    importance.append(model.coef_)
    train_loss.append(train_loss_class)
    valid_loss.append(valid_loss_class)
print('mean column-wise log loss:Train dataset', np.mean(train_loss))
print('mean column-wise log loss:Validation dataset', np.mean(valid_loss))

end_time=time.time()
print("total time for model",end_time-start_time)

Class:= toxic
Trainloss=log loss: 0.104805693271
Validloss=log loss: 0.120602527002
Class:= severe_toxic
Trainloss=log loss: 0.0238088444372
Validloss=log loss: 0.0278490462775
Class:= obscene
Trainloss=log loss: 0.0594044378828
Validloss=log loss: 0.0690554987667
Class:= threat
Trainloss=log loss: 0.00940643030504
Validloss=log loss: 0.0112640507173
Class:= insult
Trainloss=log loss: 0.0687199136058
Validloss=log loss: 0.0836718078672
Class:= identity_hate
Trainloss=log loss: 0.0236732166029
Validloss=log loss: 0.0295354562586
mean column-wise log loss:Train dataset 0.0483030893508
mean column-wise log loss:Validation dataset 0.0569963978149
total time for model 94.66555428504944


In [21]:
# Refit the model on the full training matrix for each label, then store the
# predicted probability (column 1 of predict_proba) for every test comment.
start_time = time.time()
n_test_rows = test.shape[0]
preds_test = np.zeros((n_test_rows, len(cols)))
for col_idx, label in enumerate(cols):
    model.fit(X_train_dtm, target_y[label])
    print('Class:= '+label)
    label_proba = model.predict_proba(X_test_dtm)[:,1]
    preds_test[:,col_idx] = label_proba

end_time = time.time()
elapsed = end_time-start_time
print("total time:",elapsed)

Class:= toxic
Class:= severe_toxic
Class:= obscene
Class:= threat
Class:= insult
Class:= identity_hate
total time till Indirect feat model 127.06583976745605


In [22]:
# Display the predicted probabilities: one row per test comment, one column
# per label in `cols`.
preds_test

array([[  9.98337160e-01,   1.79390376e-01,   9.94814648e-01,
          4.62070355e-02,   9.61402431e-01,   2.80958967e-01],
       [  6.59956592e-03,   1.56288482e-03,   3.99668472e-03,
          4.33037669e-04,   5.72391538e-03,   3.05811246e-03],
       [  4.24095470e-02,   6.40249584e-03,   2.08041452e-02,
          1.98726264e-03,   2.34432325e-02,   6.13588420e-03],
       ..., 
       [  6.04189206e-03,   1.43522807e-03,   5.59173323e-03,
          7.42073665e-04,   4.37652894e-03,   1.84344656e-03],
       [  2.29514324e-02,   1.46851636e-03,   1.11333139e-02,
          9.43469210e-04,   1.14498139e-02,   8.98091020e-03],
       [  9.47327731e-01,   3.31298371e-03,   6.26434369e-01,
          3.95935140e-03,   3.57220469e-01,   1.20274264e-02]])

In [23]:
# Sanity check: (number of test rows, number of labels).
preds_test.shape

(153164, 6)

### Create submission file

In [24]:
# Assemble the submission frame: the test ids followed by one probability
# column per label in `cols`.
submid = pd.DataFrame({'id': test["id"]})
# Give the prediction frame the same index as the id frame. pd.concat(axis=1)
# aligns on index labels, so a `test` index that is not 0..n-1 would otherwise
# produce misaligned, NaN-padded rows instead of a positional pairing.
preds_frame = pd.DataFrame(preds_test, columns=cols, index=submid.index)
submission = pd.concat([submid, preds_frame], axis=1)

In [25]:
# Store the ids as strings before writing the file.
submission['id'] = submission['id'].astype(str)

In [26]:
# Write the submission CSV without the index column.
# The output folder is read from the TOXIC_DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the original
# local folder is used.
data_dir = os.environ.get('TOXIC_DATA_DIR', 'C:/Users/mtrig/Documents/data science/toxic comments/data')
submission.to_csv(os.path.join(data_dir, 'submission.csv'), index = False)

In [27]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998337,0.17939,0.994815,0.046207,0.961402,0.280959
1,0000247867823ef7,0.0066,0.001563,0.003997,0.000433,0.005724,0.003058
2,00013b17ad220c46,0.04241,0.006402,0.020804,0.001987,0.023443,0.006136
3,00017563c3f7919a,0.003176,0.001637,0.003099,0.001038,0.003708,0.00093
4,00017695ad8997eb,0.034391,0.004016,0.01118,0.001724,0.011583,0.00361
