In [15]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc
from scipy.sparse import hstack
from collections import defaultdict

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [18]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [19]:
pd.options.display.max_colwidth = 9000

In [20]:
predictClasses = [ u'toxic', u'severe_toxic', u'obscene',
       u'threat', u'insult', u'identity_hate']

In [21]:
replacement_patterns = [
  (r'won\'t', 'will not'),
  (r'can\'t', 'cannot'),
  (r'don\'t', 'do not'),
  (r'i\'m', 'i am'),
  (r'ain\'t', 'is not'),
  (r'(\w+)\'ll', '\g<1> will'),
  (r'(\w+)n\'t', '\g<1> not'),
  (r'(\w+)\'ve', '\g<1> have'),
  (r'(\w+)\'s', '\g<1> is'),
  (r'(\w+)\'re', '\g<1> are'),
  (r'(\w+)\'d', '\g<1> would')
]
patterns = [(re.compile(regex), repl) for (regex, repl) in replacement_patterns]

In [22]:
def cleanup(s):
    # Replace Contractions
    
    for (reg,rep) in patterns:
        
        cleaned = re.sub(reg,rep,s.lower())
    
    # Remove numbers,?,\n ,",: and -
    #cleaned = re.sub('[\d|/\n|/"|/:|/-]+'," ",cleaned)
    cleaned = re.sub('[\d|/\n]+'," ",cleaned)
    # Remove . and comma
    #cleaned = re.sub("[/.|/,|/=]+"," ",cleaned)
    return cleaned.strip()

data_train['comment_text'] = data_train['comment_text'].map(cleanup)
data_test['comment_text'] = data_test['comment_text'].map(cleanup)

In [23]:
master_comment_data = pd.concat([data_train['comment_text'],data_test['comment_text']])

In [24]:
word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 3),
            max_features=20000)
word_vectorizer.fit(master_comment_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
train_word_features = word_vectorizer.transform(data_train['comment_text'])
test_word_features = word_vectorizer.transform(data_test['comment_text'])

In [26]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(master_comment_data)

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(2, 6), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [27]:
train_char_features = char_vectorizer.transform(data_train['comment_text'])
test_char_features = char_vectorizer.transform(data_test['comment_text'])

In [28]:
csr_train = hstack(
            [
                train_word_features,
                train_char_features
            ]
        ).tocsr()

csr_test = hstack(
            [
                test_word_features,
                test_char_features
            ]
        ).tocsr()

In [29]:
scores = []
submission = pd.DataFrame.from_dict({'id': data_test['id']})
for class_name in predictClasses:
    train_target = data_train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(classifier, csr_train, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(csr_train, train_target)
    submission[class_name] = classifier.predict_proba(csr_test)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.978360686735
CV score for class severe_toxic is 0.988328512721
CV score for class obscene is 0.990149783973
CV score for class threat is 0.989564721415
CV score for class insult is 0.982674962066
CV score for class identity_hate is 0.982392633589
Total CV score is 0.98524521675
