In [120]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc
from scipy.sparse import hstack
from collections import defaultdict

In [92]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [4]:
pd.options.display.max_colwidth = 9000

In [121]:
predictClasses = [ u'toxic', u'severe_toxic', u'obscene',
       u'threat', u'insult', u'identity_hate']

In [93]:
replacement_patterns = [
  (r'won\'t', 'will not'),
  (r'can\'t', 'cannot'),
  (r'don\'t', 'do not'),
  (r'i\'m', 'i am'),
  (r'ain\'t', 'is not'),
  (r'(\w+)\'ll', '\g<1> will'),
  (r'(\w+)n\'t', '\g<1> not'),
  (r'(\w+)\'ve', '\g<1> have'),
  (r'(\w+)\'s', '\g<1> is'),
  (r'(\w+)\'re', '\g<1> are'),
  (r'(\w+)\'d', '\g<1> would')
]
patterns = [(re.compile(regex), repl) for (regex, repl) in replacement_patterns]

In [100]:
def cleanup(s):
    # Replace Contractions
    
    for (reg,rep) in patterns:
        
        cleaned = re.sub(reg,rep,s.lower())
    
    # Remove numbers,?,\n ,",: and -
    cleaned = re.sub('[\d|/\n|/"|/:|/-]+'," ",cleaned)
    # Remove . and comma
    cleaned = re.sub("[/.|/,|/=]+"," ",cleaned)
    return cleaned.strip()

data_train['comment_text'] = data_train['comment_text'].map(cleanup)
data_test['comment_text'] = data_test['comment_text'].map(cleanup)

In [103]:
master_comment_data = pd.concat([data_train['comment_text'],data_test['comment_text']])

In [104]:
len(master_comment_data)

312735

In [106]:
word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=20000)
word_vectorizer.fit(master_comment_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [108]:
train_word_features = word_vectorizer.transform(data_train['comment_text'])
test_word_features = word_vectorizer.transform(data_test['comment_text'])

In [145]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(master_comment_data)

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(2, 6), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [148]:
train_char_features = char_vectorizer.transform(data_train['comment_text'])
test_char_features = char_vectorizer.transform(data_test['comment_text'])

In [151]:
print "sss1"

sss1


In [150]:
csr_train = hstack(
            [
                train_word_features,
                train_char_features
            ]
        ).tocsr()

csr_test = hstack(
            [
                test_word_features,
                test_char_features
            ]
        ).tocsr()

In [152]:
# Set LGBM parameters
params = {
        "objective": "binary",
        'metric': {'auc'},
        "boosting_type": "gbdt",
        "verbosity": -1,
        "num_threads": 4,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        "learning_rate": 0.1,
        "num_leaves": 31,
        "verbose": -1,
        "min_split_gain": .1,
        "reg_alpha": .1
    }

In [None]:
scores = []
class_pred = np.zeros(len(data_train))
folds = KFold(n_splits=4, shuffle=True, random_state=1)
lgb_round_dict = defaultdict(int)
trn_lgbset = lgb.Dataset(csr_train, free_raw_data=False)        
for class_name in predictClasses:
    print "class - {0} scores".format(class_name)
    train_target = data_train[class_name]
    trn_lgbset.set_label(train_target.values)
    lgb_rounds = 500
    
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data_train, train_target)):
            watchlist = [
                    trn_lgbset.subset(trn_idx),
                    trn_lgbset.subset(val_idx)
                ]
            # Train lgb l1
            model = lgb.train(
                    params=params,
                    train_set=watchlist[0],
                    num_boost_round=lgb_rounds,
                    valid_sets=watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=0
                )
            class_pred[val_idx] = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)
            score = roc_auc_score(train_target.values[val_idx], class_pred[val_idx])
                
            # Compute mean rounds over folds for each class
            # So that it can be re-used for test predictions
            lgb_round_dict[class_name] += model.best_iteration
            print("\t Fold %d : %.6f in %3d rounds" % (n_fold + 1, score, model.best_iteration))
                   

class - toxic scores


In [138]:
np.mean(score)

0.9628734535827312

In [140]:
submission = pd.DataFrame.from_dict({'id': data_test['id']})

In [143]:
for class_name in predictClasses:
    
    train_target = data_train[class_name]
    trn_lgbset.set_label(train_target.values)
    # Train lgb
    model = lgb.train(
                    params=params,
                    train_set=trn_lgbset,
                    num_boost_round=int(lgb_round_dict[class_name] / folds.n_splits)
                )
    submission[class_name] = model.predict(csr_test, num_iteration=model.best_iteration)

submission.to_csv("lgbm_trial1.csv", index=False, float_format="%.8f")

In [144]:
print "hai"

hai
