# Toxic Comments

### Import and refine data

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
# Read the training comments CSV into a DataFrame
# NOTE: absolute, machine-specific path -- point it at your own copy of train.csv
train_csv = 'C:/Users/mtrig/Documents/data science/toxic comments/data/train.csv'
train = pd.read_csv(train_csv)

In [3]:
# Read the test comments CSV into a DataFrame
# NOTE: absolute, machine-specific path -- point it at your own copy of test.csv
test_csv = 'C:/Users/mtrig/Documents/data science/toxic comments/data/test.csv'
test = pd.read_csv(test_csv)

In [4]:
# Peek at five randomly chosen training rows; the fixed seed keeps the
# displayed sample identical on every Restart & Run All
train.sample(5, random_state=0)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6103,104c768ad7aed401,"""\n\n May 16 2006 \n\nPlease stop. If you cont...",0,0,0,0,0,0
130,00537730daf8c5f1,"Sorry about that. I had checked, but had only...",0,0,0,0,0,0
145158,180c9b29c0d8c8e2,Sabata's Counterattack vs. Sabata's Revenge.\n...,0,0,0,0,0,0
55617,9499ce37a5b5862d,Lawrencekhoo please keep your insult to yourse...,0,0,0,0,0,0
3469,0959d4fdaf74a050,THIS DUDE THAT BLOCKED ME IS A FUCKING COCK SU...,1,1,1,0,1,1


In [5]:
# (rows, columns) of the test frame
test.shape

(153164, 2)

In [6]:
# (rows, columns) of the training frame
train.shape

(159571, 8)

In [7]:
# Replace every missing value in both frames with the placeholder string "unknown"
train, test = (frame.fillna("unknown") for frame in (train, test))

### Set up X and Y arrays for test and train

In [8]:
# Raw comment text of the training rows (input to the vectorizer)
X_train = train['comment_text']

In [9]:
# Raw comment text of the test rows (input to the vectorizer)
X_test = test['comment_text']

In [10]:
# Learn a bag-of-words vocabulary from the training comments and encode them
# as a sparse document-term matrix of token counts
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)

In [11]:
# Display the summary repr of the training document-term matrix
X_train_dtm

<159571x189775 sparse matrix of type '<class 'numpy.int64'>'
	with 6949691 stored elements in Compressed Sparse Row format>

In [12]:
# Encode the test comments with the vocabulary already fitted on the training
# comments (transform only, no re-fit), so both matrices share the same columns
X_test_dtm = vect.transform(X_test)

In [13]:
# Display the summary repr of the test document-term matrix
X_test_dtm

<153164x189775 sparse matrix of type '<class 'numpy.int64'>'
	with 5738810 stored elements in Compressed Sparse Row format>

### Import, initialise and fit model

In [14]:
# Names of the label columns to predict, one target per entry
col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [15]:
# Label matrix: all training rows, restricted to the target columns
y_train = train.loc[:, col]

In [16]:
# Create a logistic regression classifier with scikit-learn's default settings;
# the same estimator object is re-fitted for each target column further down
logreg = LogisticRegression()

In [17]:
# Allocate a zero-filled (test rows x target columns) array for the predictions
n_test_rows = len(test)
n_labels = len(col)
preds_test = np.zeros((n_test_rows, n_labels))
preds_test

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [18]:
# For each target column: fit the classifier on the training matrix, then store
# the predicted probability of class 1 for every test row in the matching column
for column_idx, label in enumerate(col):
    print('Class:= ' + label)
    targets = y_train[label]
    logreg.fit(X_train_dtm, targets)
    class_probabilities = logreg.predict_proba(X_test_dtm)
    preds_test[:, column_idx] = class_probabilities[:, 1]
    

Class:= toxic
Class:= severe_toxic
Class:= obscene
Class:= threat
Class:= insult
Class:= identity_hate


In [19]:
# Display the filled prediction array (one row per test comment, one column per target)
preds_test

array([[  9.99995829e-01,   2.33734936e-01,   9.97250416e-01,
          7.24942438e-02,   9.89918711e-01,   3.63543301e-01],
       [  4.27607371e-02,   7.69055369e-03,   3.25128602e-02,
          1.25494604e-03,   3.54327413e-02,   1.07624051e-02],
       [  7.69549831e-02,   1.53152941e-02,   5.21660435e-02,
          5.17191714e-03,   6.43774139e-02,   1.41426437e-02],
       ..., 
       [  2.18496203e-03,   7.41423556e-04,   6.54198285e-03,
          1.60162108e-04,   4.29144620e-03,   1.58141307e-03],
       [  9.82627520e-03,   8.26729231e-04,   9.17588083e-03,
          2.17545769e-04,   6.00997448e-03,   6.74334432e-03],
       [  5.87115122e-01,   1.03843067e-02,   1.01775701e-01,
          4.06029051e-03,   1.06415140e-01,   1.88416760e-02]])

In [20]:
# Sanity check: (number of test rows, number of target columns)
preds_test.shape

(153164, 6)

### Create submission file

In [21]:
# Assemble the submission: the id column followed by one probability column per target.
# pd.concat(axis=1) aligns rows by index label, so the prediction frame is given
# test's index explicitly; a fresh 0..n-1 index only lines up with the ids while
# test still carries the default RangeIndex, and would otherwise misalign rows
# and introduce NaNs.
submid = pd.DataFrame({'id': test["id"]})
pred_frame = pd.DataFrame(preds_test, columns=col, index=test.index)
submission = pd.concat([submid, pred_frame], axis=1)

In [22]:
# Cast the id column to str
submission['id'] = submission['id'].astype(str)

In [23]:
# Write the submission frame to CSV without the row index
# NOTE: absolute, machine-specific path -- point it at your own output location
submission_csv = 'C:/Users/mtrig/Documents/data science/toxic comments/data/submission.csv'
submission.to_csv(submission_csv, index=False)

In [24]:
# Show the last five rows of the submission frame as a final visual check
submission.tail()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
153159,fffcd0960ee309b5,0.364032,0.015513,0.092734,0.002984,0.076505,0.015186
153160,fffd7a9a6eb32c16,0.073349,0.005386,0.082327,0.002471,0.057624,0.018337
153161,fffda9e8d6fafa9e,0.002185,0.000741,0.006542,0.00016,0.004291,0.001581
153162,fffe8f1340a79fc2,0.009826,0.000827,0.009176,0.000218,0.00601,0.006743
153163,ffffce3fb183ee80,0.587115,0.010384,0.101776,0.00406,0.106415,0.018842
