In [1]:
import numpy as np
import pandas as pd

import re
import pickle

import sklearn
from  sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv(r'cleaned_train.csv').fillna(' ')
test = pd.read_csv(r'cleaned_test.csv').fillna(' ')

list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

In [4]:
cl_path = 'cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"iâ€™m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))

In [5]:
from time import time

print("Extracting tf-idf features for NMF...")
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)


t0 = time()
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 27.017s.


In [6]:
#pickle the file
with open('tfidf_word_01.pkl', 'wb') as picklefile:
    pickle.dump(word_vectorizer, picklefile)

In [7]:
print("Extracting tf-idf features for NMF...")
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000)

t0 = time()
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 667.102s.


In [9]:
#pickle the file
with open('tfidf_char_01.pkl', 'wb') as picklefile:
    pickle.dump(char_vectorizer, picklefile)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

losses = []
predictions = {'id': test['id']}
t0=time()
for class_name in class_names:
    train_target = train[class_name]
    test_target = test[class_name]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(train_features, train_target)
    classifier.fit(test_features, test_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    cv_loss_train = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc'))
    losses.append(cv_loss_train)
    print('train CV score for class {} is {}'.format(class_name, cv_loss_train))
    cv_loss_test = roc_auc_score(test_target,predictions[class_name])
    print('test score for class {} is {}'.format(class_name, cv_loss_test))
print("done in %0.3fs." % (time() - t0))

train CV score for class toxic is 0.9791055078036223
test score for class toxic is 0.9868693813136472
train CV score for class severe_toxic is 0.9883363997947457
test score for class severe_toxic is 0.9955662904600198
train CV score for class obscene is 0.9905734182993309
test score for class obscene is 0.9931271697189329
train CV score for class threat is 0.9899720067669303
test score for class threat is 0.9987019538029334
train CV score for class insult is 0.9830951343298064
test score for class insult is 0.9908470855839644
train CV score for class identity_hate is 0.9829026955457236
test score for class identity_hate is 0.997002956484428
done in 2114.837s.


In [13]:
with open('classifier_02.pkl', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)