In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("../data/train.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
print("Percent of all comments:")

# how many comments are toxic?
percent_toxic = np.sum(train["toxic"] == 1) / len(train) * 100.0
print("Toxic:              {:05.2f}%".format(percent_toxic))

percent_severe_toxic = np.sum(train["severe_toxic"] == 1) / len(train) * 100.0
print("Severe Toxic:       {:05.2f}%".format(percent_severe_toxic))

percent_obscene = np.sum(train["obscene"] == 1) / len(train) * 100.0
print("Obscene:            {:05.2f}%".format(percent_obscene))

percent_threat = np.sum(train["threat"] == 1) / len(train) * 100.0
print("Threat:             {:05.2f}%".format(percent_threat))

percent_insult = np.sum(train["insult"] == 1) / len(train) * 100.0
print("Insult:             {:05.2f}%".format(percent_insult))

percent_identity_hate = np.sum(train["identity_hate"] == 1) / len(train) * 100.0
print("Identity Hate:      {:05.2f}%".format(percent_identity_hate))

Percent of all comments:
Toxic:              09.58%
Severe Toxic:       01.00%
Obscene:            05.29%
Threat:             00.30%
Insult:             04.94%
Identity Hate:      00.88%


In [5]:
import gensim

### doc2vec implementation

In [12]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Doc2Vec analysis - tokenization, cleanup, and vectorization

In [7]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
import random

np.random.seed(42)

In [8]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))
#This function does all cleaning of data using two objects above
def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        dlist = list(set(dlist).difference(stopword_set))
        new_data.append(dlist)
    return new_data

In [9]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield TaggedDocument(doc, [self.labels_list[idx]])

In [10]:
data = train['comment_text']

In [11]:
data = nlp_clean(data)

In [13]:
labels = train['id']

In [14]:
labeled_data = LabeledLineSentence(data, labels)

In [15]:
model = Doc2Vec(vector_size=300, min_count=0, alpha=0.025, min_alpha=0.025, workers=4)
model.build_vocab(labeled_data)

In [16]:
xx = model.epochs

In [17]:
trained = model.train(labeled_data, total_words=model.corpus_count, total_examples=len(data), epochs=xx)

In [18]:
vectorized_comments = [model.docvecs[label] for label in labels]

In [19]:
len(model.wv.vocab)

190186

In [20]:
test = pd.read_csv('../data/test.csv')

In [21]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [22]:
test_com = [comm for comm in test['comment_text']]

In [23]:
test_com = nlp_clean(test_com)

In [24]:
test_vectors = [model.infer_vector(com) for com in test_com]

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack

import time

In [28]:
scores = []
NUM_FOLDS = 10

train_features = vectorized_comments
# submission = pd.DataFrame.from_dict({'id': test['id']})

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
    
#     results = cross_val_score(classifier, train_features, train_target, cv=5, n_jobs=-1, scoring='roc_auc')
    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')
    
    print('CV Spread for class "{}":'.format(class_name))
    for result in results:
        print("    {:0.4f}".format(result), end=" ")
        
    print(" ")
        
    cv_score = np.mean(results)
    scores.append(cv_score)
    
    print('    CV score for class "{}" is {:0.4}\n'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
#     submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {:0.4f}'.format(np.mean(scores)))

write_model_timestamp('logistic regression', NUM_FOLDS, scores, "first model: logistic regression, word to vec max 5k features, kfold=10")

CV Spread for class "toxic":
    0.8913     0.8841     0.8850     0.8794     0.8869     0.8844     0.8808     0.8924     0.8840     0.8857  
    CV score for class "toxic" is 0.8854

CV Spread for class "severe_toxic":
    0.8441     0.8199     0.8670     0.8388     0.8136     0.8355     0.8293     0.8348     0.8160     0.8177  
    CV score for class "severe_toxic" is 0.8317

CV Spread for class "obscene":
    0.8919     0.8813     0.8874     0.8834     0.8853     0.8884     0.8798     0.8852     0.8791     0.8783  
    CV score for class "obscene" is 0.884

CV Spread for class "threat":
    0.8954     0.9038     0.8923     0.8689     0.8366     0.8864     0.8853     0.8655     0.8833     0.8822  
    CV score for class "threat" is 0.88

CV Spread for class "insult":
    0.8796     0.8750     0.8861     0.8755     0.8827     0.8713     0.8759     0.8781     0.8780     0.8795  
    CV score for class "insult" is 0.8782

CV Spread for class "identity_hate":
    0.8662     0.8854     0.8

NameError: name 'write_model_timestamp' is not defined

In [29]:
# model.save("toxic_comments_gs_model")