In [64]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csc_matrix
import numpy as np

In [65]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train['comment_text'] = train['comment_text'].astype(str)
test['comment_text'] = test['comment_text'].astype(str)
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(train.shape, test.shape)
len(test['comment_text'].head().iloc[:].tolist())

(95851, 8) (226998, 2)


5

In [66]:
documents = pd.concat([train['comment_text'], test['comment_text']], ignore_index=True)
documents.fillna('none')
print(documents.shape)
documents.head()

(322849,)


0    Nonsense?  kiss off, geek. what I said is true...
1    "\n\n Please do not vandalize pages, as you di...
2    "\n\n ""Points of interest"" \n\nI removed the...
3    Asking some his nationality is a Racial offenc...
4    The reader here is not going by my say so for ...
Name: comment_text, dtype: object

In [17]:
documents[148151]

'nan'

In [68]:
trans = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
trans.fit(documents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [69]:
#cls = XGBClassifier(n_estimators=300, silent=False, nthread=4, max_depth=10, objective='multi:softmax')
cls = XGBClassifier(n_estimators=300, silent=False, nthread=4, max_depth=10, objective='multi:softprob')

In [70]:
def get_numbered_label(data, categories):
    label = pd.Series([0]*data.shape[0])
    for i, category in enumerate(categories):
        label[data[category] == 1] = i+1
    data['label'] = label
get_numbered_label(train, categories)

In [None]:
def transform(data):
    X = data['comment_text']
    X = trans.transform(X)
    X = csc_matrix(X)
    Y = data['label'].values if 'label' in data.columns else None
    return X, Y

In [None]:
train = train.sample(frac=1.0)
#train = train.iloc[:10000, :]
train_size = int(train.shape[0]*0.8)
train, val = train.iloc[:train_size, :], train.iloc[train_size:, :]
train_X, train_Y = transform(train)
val_X, val_Y= transform(val)
print(train.shape, val.shape)

cls.fit(train_X, train_Y)
pred = cls.predict(val_X)
print(precision_score(val_Y, pred), recall_score(val_Y, pred), f1_score(val_Y, pred))

(61344, 9) (15336, 9)


In [72]:
train_X, train_Y = transform(train)
#val_X, val_Y= transform(val)
test_X, test_Y = transform(test)

In [73]:
cls.fit(train_X, train_Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=300, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [74]:
pred = cls.predict_proba(test_X)
pred.shape
#print(precision_score(val_Y, pred), recall_score(val_Y, pred), f1_score(val_Y, pred))

(226998, 7)

In [75]:
output = pd.DataFrame()
output['id'] = test['id']
for i in range(len(categories)):
    output[categories[i]] = pred[:, i+1]

In [76]:
output.to_csv('data/predict.csv', sep=',')