In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
import xgboost as xgb
from xgboost.sklearn import XGBClassifier # <3
from sklearn.model_selection import train_test_split
import gc

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('data/train_preprocessed.csv').fillna(' ')#.sample(1000)
test = pd.read_csv('data/test_preprocessed.csv').fillna(' ')#.sample(1000)

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

train = train.loc[:,class_names]

print("TFIDF")
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=5000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
train_features = train_word_features
test_features = test_word_features



TFIDF


In [2]:
train_word_features.dtype

dtype('float64')

In [3]:
print train_word_features

  (0, 4984)	0.2504784615710792
  (0, 4869)	0.26779664550409343
  (0, 4804)	0.283750129022613
  (0, 4706)	0.2296680841471418
  (0, 4455)	0.20224101642594247
  (0, 4416)	0.11765901554568048
  (0, 4400)	0.16079684889021442
  (0, 3817)	0.19451556902700642
  (0, 3804)	0.30131684044582985
  (0, 3724)	0.1801293482891167
  (0, 3157)	0.11064731681371362
  (0, 2960)	0.15571948114354867
  (0, 2670)	0.1298635275477737
  (0, 2431)	0.11640652722842207
  (0, 2015)	0.32206581951897956
  (0, 1865)	0.2929240202813838
  (0, 1665)	0.24865058879375296
  (0, 1629)	0.2829002671200444
  (0, 1605)	0.22456772307716327
  (0, 1428)	0.16305025055919967
  (0, 1353)	0.1236750115146054
  (1, 4715)	0.2286559739672488
  (1, 4483)	0.189162952368473
  (1, 4416)	0.15702708654024028
  (1, 4303)	0.37082022717735424
  :	:
  (159568, 61)	0.3730302182092655
  (159569, 4756)	0.42650879821632404
  (159569, 4204)	0.44187891020311637
  (159569, 2643)	0.4354434405082687
  (159569, 2640)	0.3374359504675947
  (159569, 2584)	0.2494916

In [4]:
print(train_features.type)
print(test_features.type)

AttributeError: type not found

In [5]:
d_test = xgb.DMatrix(test_features)
del test_features
gc.collect()

print("Modeling")
cv_scores = []
xgb_preds = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    # Split out a validation set
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_features, train_target, test_size=0.25, random_state=23)

    xgb_params = {'eta': 0.3, 
              'max_depth': 5, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
              'seed': 42
             }

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)

    watchlist = [(d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 200, watchlist, verbose_eval=False, early_stopping_rounds=30)
    print("class Name: {}".format(class_name))
    print(model.attributes()['best_msg'])
    cv_scores.append(float(model.attributes()['best_score']))
    submission[class_name] = model.predict(d_test)
    del X_train, X_valid, y_train, y_valid
    gc.collect()
print('Total CV score is {}'.format(np.mean(cv_scores)))

Modeling
class Name: toxic
[191]	valid-auc:0.963856
class Name: severe_toxic
[61]	valid-auc:0.986941
class Name: obscene
[192]	valid-auc:0.988981
class Name: threat
[38]	valid-auc:0.970612
class Name: insult
[109]	valid-auc:0.974105
class Name: identity_hate
[127]	valid-auc:0.965331
Total CV score is 0.974971


In [6]:
submission.to_csv('submissionMar16.csv', index=False)

In [7]:
submission.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,153164.0,153164.0,153164.0,153164.0,153164.0,153164.0
mean,0.201579,0.019184,0.125718,0.004649,0.103635,0.01759101
std,0.331414,0.080079,0.284637,0.040392,0.228471,0.0911985
min,4.3e-05,6e-06,7e-06,5e-06,3.7e-05,4.790903e-07
25%,0.010322,0.000435,0.001944,0.000291,0.004451,0.0003620256
50%,0.02932,0.000929,0.004784,0.000603,0.010642,0.001062594
75%,0.182033,0.001441,0.016337,0.000816,0.032143,0.002246641
max,1.0,0.994768,1.0,0.998434,0.999986,0.9999266


In [11]:
submission.loc[submission['id'] == '0114509409588767']

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
647,114509409588767,0.250396,0.000893,0.04317,0.000476,0.30317,0.002893


In [12]:
blend_submission = pd.read_csv('data/preprocessed_blend.csv')

In [13]:
blend_submission.loc[blend_submission['id'] == '0114509409588767']

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
647,114509409588767,0.402429,0.021176,0.103199,0.013444,0.230419,0.017473


In [14]:
a = submission 
b = blend_submission

In [15]:
del a['toxic']

In [16]:
del a['threat']
del a['insult']
del a['identity_hate']

del b['severe_toxic']
del b['obscene']
del b['id']

In [23]:
out = pd.concat([a, b], axis=1)

In [24]:
out.head()

Unnamed: 0,id,severe_toxic,obscene,toxic,threat,insult,identity_hate
0,00001cee341fdb12,0.144039,0.997231,0.907373,0.068772,0.824949,0.437925
1,0000247867823ef7,0.001215,0.003469,0.010137,0.006929,0.00632,0.015238
2,00013b17ad220c46,0.000929,0.006413,0.011127,0.008329,0.005232,0.012861
3,00017563c3f7919a,0.000546,0.001753,0.009389,0.008886,0.005881,0.013462
4,00017695ad8997eb,0.000831,0.006332,0.015261,0.006829,0.007079,0.01326


In [25]:
cols = list(out.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('severe_toxic')) #Remove b from list
cols.pop(cols.index('obscene')) #Remove x from list
cols.pop(cols.index('threat'))
cols.pop(cols.index('insult')) #Remove b from list
cols.pop(cols.index('identity_hate'))

out = out[cols+['severe_toxic','obscene','threat','insult','identity_hate']] #Create new dataframe with columns in the order you want

In [26]:
out.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.907373,0.144039,0.997231,0.068772,0.824949,0.437925
1,0000247867823ef7,0.010137,0.001215,0.003469,0.006929,0.00632,0.015238
2,00013b17ad220c46,0.011127,0.000929,0.006413,0.008329,0.005232,0.012861
3,00017563c3f7919a,0.009389,0.000546,0.001753,0.008886,0.005881,0.013462
4,00017695ad8997eb,0.015261,0.000831,0.006332,0.006829,0.007079,0.01326


In [28]:
out.to_csv('submissionMar16A.csv', index=False)