In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
# Show which files are present in the input directory.
listing_bytes = check_output(["ls", "../input"])
print(listing_bytes.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training CSV into a DataFrame. Later cells read its 'comment_text'
# column and the six label columns ('toxic', 'severe_toxic', ...).
data = pd.read_csv("../input/train.csv")

### Head of Data

In [None]:
# Preview the first 25 rows of the loaded data.
data.head(25)

### Class Distribution

In [None]:
# Column-wise sum of the six label columns (i.e. the number of flagged
# comments per label, assuming the labels are 0/1 -- TODO confirm).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data[label_columns].sum()

In [None]:
import sklearn

In [None]:
# Boolean row mask for a roughly 80/20 train/test split: each row independently
# lands in train with probability 0.8. A dedicated, seeded RandomState makes the
# split (and therefore every metric below) identical across Restart & Run All,
# without touching NumPy's global random state.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(data)) < 0.8

In [None]:
# Rows where the mask is True form the training set; the complement is the test set.
train, test = data[msk], data[~msk]

In [None]:
# (rows, columns) of each partition, train first, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Mean of each label column within the training partition (the positive rate
# per label, assuming the labels are 0/1 -- TODO confirm).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train[label_columns].mean()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Token-count vectorizer constructed with all-default parameters; it is fitted
# on the training comments in the next cell.
count_vect = CountVectorizer()

In [None]:
# Fit the vectorizer on the training comments and build their count matrix.
# `Series.values` gives the 1-D array of comment strings directly; the previous
# `DataFrame.as_matrix()` call was removed in pandas 1.0 and raises
# AttributeError on current pandas.
X_train = count_vect.fit_transform(train['comment_text'].values)

In [None]:
# Dimensions of the training count matrix produced by the vectorizer.
X_train.shape

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Re-weight the raw counts with TF-IDF (default parameters). The transformer is
# fitted on the training counts only and reused later for the test set.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train)
X_train_tfidf = tfidf_transformer.transform(X_train)

In [None]:
# Dimensions of the TF-IDF-weighted training matrix.
X_train_tfidf.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fit a multinomial naive Bayes classifier on the TF-IDF features against the
# 'toxic' label only. `Series.values` replaces `DataFrame.as_matrix()`, which
# was removed in pandas 1.0.
clf = MultinomialNB().fit(X_train_tfidf, train['toxic'].values)

In [None]:
# Featurize the held-out comments with the vectorizer and TF-IDF transformer
# that were fitted on the training split (transform only -- no refitting).
# `Series.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
test_counts = count_vect.transform(test['comment_text'].values)
X_test_tfidf = tfidf_transformer.transform(test_counts)
X_test_tfidf.shape

In [None]:
# Score the held-out split: per-class probabilities and hard class predictions.
y_prob = clf.predict_proba(X_test_tfidf)
y_ = clf.predict(X_test_tfidf)

In [None]:
# Ground-truth 'toxic' labels for the held-out split, then plain accuracy
# (fraction of test rows whose predicted label matches the true label).
# `Series.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
y = test['toxic'].values
(y_ == y).mean()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
# Classification report followed by the confusion matrix.
print(classification_report(y, y_), confusion_matrix(y, y_), sep="\n")
# ROC AUC twice: first from the hard 0/1 predictions, then from the second
# column of predict_proba (the probability-based score).
hard_label_auc = roc_auc_score(y, y_)
print(hard_label_auc)
probability_auc = roc_auc_score(y, y_prob[:, 1])
print(probability_auc)

In [None]:
# Recall on the toxic class: the fraction of truly toxic test comments that the
# classifier flagged, TP / (TP + FN). It is derived from the current predictions
# so it stays in sync with the split and model.
# NOTE(review): this replaces the hard-coded `476/2552`, which appears to have
# been TP / (TP + FN) copied from an earlier run -- confirm.
# The class imbalance (sampling bias) is what skews this result.
tn, fp, fn, tp = confusion_matrix(y, y_, labels=[0, 1]).ravel()
tp / (tp + fn)

In [None]:
# False-negative example (predicted 0, labelled 1): this is definitely hate speech
#print(test[['comment_text']][(y_ == 0) & (y == 1)].as_matrix()[0][0])

In [None]:
# Build a class-balanced training subset by under-sampling the non-toxic rows
# (no class weights are involved).
# `Series.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
y_train = train['toxic'].values
pos_cases = (y_train == 1)
num_pos_cases = pos_cases.sum()
# Sort indices by label, high to low, so the positive cases come first. A stable
# sort (mergesort) keeps the original row order within each class, so the
# negatives that get selected are well-defined (the first ones in row order)
# rather than an artifact of the default, unstable sort implementation.
idx_desc = np.argsort(-y_train, kind="mergesort")
# Take all positives plus an equal number of negatives -> 50/50 (this assumes
# there are at least as many negatives as positives).
balance_samples = idx_desc[:2 * num_pos_cases]
print(y_train[balance_samples].mean())  # confirm the positive rate of the subset
y_train_bal = y_train[balance_samples]
X_train_tfidf_bal = X_train_tfidf[balance_samples]
print(y_train_bal.shape, X_train_tfidf_bal.shape)

In [None]:
# Refit naive Bayes on the balanced subset, then evaluate it on the same
# held-out split as before. Note this rebinds clf, y_ and y_prob.
clf = MultinomialNB()
clf.fit(X_train_tfidf_bal, y_train_bal)
y_prob = clf.predict_proba(X_test_tfidf)
y_ = clf.predict(X_test_tfidf)
# Classification report, confusion matrix, then probability-based ROC AUC.
print(classification_report(y, y_), confusion_matrix(y, y_), sep="\n")
print(roc_auc_score(y, y_prob[:, 1]))

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# Fit a 100-component truncated SVD on the full TF-IDF training matrix, with a
# fixed seed.
svd_params = {"n_components": 100, "random_state": 42}
svd = TruncatedSVD(**svd_params)
svd.fit(X_train_tfidf)

In [None]:
# Project the balanced training subset onto the SVD components fitted above.
X_train_tfidf_bal_svd = svd.transform(X_train_tfidf_bal)

In [None]:
# Inspect the SVD-reduced features of the balanced training subset.
print(X_train_tfidf_bal_svd)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier on the SVD-reduced, balanced training subset.
# probability=True enables predict_proba; scikit-learn fits those probability
# estimates with an internal shuffled cross-validation, so random_state is
# pinned (same seed as the SVD above) to keep the probabilities reproducible.
clf_svc = SVC(probability=True, random_state=42).fit(X_train_tfidf_bal_svd, y_train_bal)


In [None]:
# Reduce the held-out TF-IDF features with the fitted SVD, then score the SVC.
# Note this rebinds y_ and y_prob.
X_test_tfidf_svd = svd.transform(X_test_tfidf)
y_prob = clf_svc.predict_proba(X_test_tfidf_svd)
y_ = clf_svc.predict(X_test_tfidf_svd)

# Classification report, confusion matrix, then probability-based ROC AUC.
print(classification_report(y, y_), confusion_matrix(y, y_), sep="\n")
print(roc_auc_score(y, y_prob[:, 1]))