In [19]:
BUCKET = 'cpb100-200708.appspot.com'
PROJECT = 'cpb100'
REGION = 'us-central1'

In [20]:
import os
import pandas as pd
import numpy as np
import google.datalab as datalab
import google.datalab.ml as ml


In [None]:
# Loading Data
train_x = pd.read_csv('train_preprocessed.csv').fillna(" ")
test_x = pd.read_csv('test_preprocessed.csv').fillna(" ")

In [None]:
train_text = train_x['comment_text']
test_text = test_x['comment_text']
all_text = pd.concat([train_text, test_text])

In [52]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)


word_vectorizer.fit(all_text)
train_features = word_vectorizer.transform(train_text)
test_features = word_vectorizer.transform(test_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [None]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [41]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.10, solver='sag', penalty='l2', random_state=0)

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.9548277978230575
CV score for class severe_toxic is 0.9826409307079059
CV score for class obscene is 0.9726766421702265
CV score for class threat is 0.9765599064070264
CV score for class insult is 0.9668623185208057
CV score for class identity_hate is 0.9582955822617715
Total CV score is 0.968643862981799


In [53]:
params = {'C':[0.1, 1., 10., 100.], 'penalty':['l1', 'l2']}
from sklearn.model_selection import GridSearchCV


In [54]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
#for class_name in class_names:
train_target = train[class_name]
classifier = LogisticRegression(random_state=0)

clf = GridSearchCV(classifier, params, cv=2)
clf.fit(train_features, train_target)

GridSearchCV(cv=2, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 10.0, 100.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [57]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [67]:
!pip install fasttext

Collecting fasttext
  Using cached fasttext-0.8.3.tar.gz
Building wheels for collected packages: fasttext
  Running setup.py bdist_wheel for fasttext ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / done
[?25h  Stored in directory: /content/.cache/pip/wheels/55/0a/95/e23f773666d3487ee7456b220f7e8d37e99b74833b20dd06a0
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.8.3
[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [25]:
import fasttext

In [26]:
train_x.columns = ['id', 'comment_text', '__label__toxic', '__label__severe_toxic', '__label__obscene', 
                 '__label__threat', '__label__insult', '__label__identity_hate']

In [27]:
train_data_column = ['comment_text', '__label__toxic', '__label__severe_toxic', '__label__obscene', 
                 '__label__threat', '__label__insult', '__label__identity_hate']

In [28]:
train_data = train_x[train_data_column]

In [29]:
train_data.head()

Unnamed: 0,comment_text,__label__toxic,__label__severe_toxic,__label__obscene,__label__threat,__label__insult,__label__identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [39]:
test_data = test_x['comment_text']

In [33]:
fmt='%s, %d, %d, %d, %d, %d, %d'
np.savetxt('train.txt', train_data.values, fmt=fmt, delimiter="\t", header="comment_text\t__label__toxic\t__label__severe_toxic\t__label__obscene\t__label__threat\t__label__insult\t__label__identity_hat") 

In [40]:
np.savetxt('test.txt', test_data.values, fmt='%s', delimiter="\t", header="comment_text")

In [56]:
classifier = fasttext.supervised('train.txt', 'model', label_prefix='__label__', lr=0.1, loss="softmax", epoch=5, thread=10)

In [41]:
result = classifier.test('test.txt')

In [42]:
result.precision

nan

In [57]:
classifier.predict_proba('test.txt', 6)

[[('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('severe_toxic', 0.166016),
  ('toxic', 0.166016),
  ('threat', 0.166016),
  ('identity_hat', 0.166016)],
 [('obscene', 0.166016),
  ('insult', 0.166016),
  ('sever

In [52]:
labels

[[('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)],
 [('toxic', 0.166016)]]