In [66]:
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
train_file = "train.csv"
test_file = "test.csv"

In [67]:
data = pd.read_csv(train_file, header=0)
data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
# https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer
# Above reference is for understanding the parameters of tf-idf.

tf = TfidfVectorizer(analyzer='word',lowercase=True, min_df = 2,max_df = 0.9,
                     strip_accents='unicode', stop_words = 'english', sublinear_tf=1)

X =  tf.fit_transform(data["comment_text"])
print("The number of features after applying TF-IDF are:")
print(len(tf.get_feature_names()))
print("The shape of the data is:")
print(X.shape)
print("Ten sample features are:")
tf.get_feature_names()[:10]

The number of features after applying TF-IDF are:
74583
The shape of the data is:
(159571, 74583)
Ten sample features are:


['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000z',
 '0001',
 '0003',
 '000_bucks',
 '000ft']

In [69]:
# Read the test file
test_data = pd.read_csv(test_file)
test_data.tail(5)

Unnamed: 0,id,comment_text
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."
153163,ffffce3fb183ee80,""" \n :::Stop already. Your bullshit is not wel..."


In [70]:
X_test = tf.transform(test_data["comment_text"])
print(X_test.shape)

(153164, 74583)


In [72]:
# Predictions for toxic  using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
toxic_labels = list(data["toxic"])
clf.fit(X, toxic_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(toxic_labels, ans_probs)))
accuracy = round(accuracy_score(toxic_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(toxic_labels, predictions)

Log Loss: 0.11185005179726798
Accuracy: 95.97


array([[144275,      2],
       [  6434,   8860]])

In [73]:
doc = 0
feature_names = tf.get_feature_names()
feature_index = X[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

205 0.22947222656189137
27 0.17834616234619227
38 0.20363420516848946
89 0.21413990924274262
closure 0.2487963967573361
dolls 0.28749813675315267
don 0.0914176988445954
edits 0.11758377204966428
explanation 0.16556591073812327
fac 0.21480258987499007
fan 0.1841914893111529
gas 0.22059674408811816
hardcore 0.2432209231302952
just 0.08624045494546462
metallica 0.2791738288407949
new 0.11554577850801136
page 0.07950718847788701
remove 0.1320049003396883
retired 0.22073355658683494
reverted 0.1423831296890881
talk 0.08044249121071409
template 0.15023574657404387
username 0.1694647006622534
vandalisms 0.2762286662160805
voted 0.21349206599775641
weren 0.20067679562710994
york 0.1865334584329208


In [74]:
toxic_test = clf.predict_proba(X_test)

test_data["toxic"] = pd.Series(toxic_test[:,1])

In [75]:
# Predictions for severe_toxic using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
severe_toxic_labels = list(data["severe_toxic"])
clf.fit(X, severe_toxic_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(severe_toxic_labels, ans_probs)))
accuracy = round(accuracy_score(severe_toxic_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(severe_toxic_labels, predictions)

Log Loss: 0.013691943145166004
Accuracy: 99.39


array([[157974,      2],
       [   972,    623]])

In [76]:
severe_toxic_test = clf.predict_proba(X_test)

test_data["severe_toxic"] = pd.Series(severe_toxic_test[:,1])

In [77]:
# Predictions for obscene using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
obscene_labels = list(data["obscene"])
clf.fit(X, obscene_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(obscene_labels, ans_probs)))
accuracy = round(accuracy_score(obscene_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(obscene_labels, predictions)

Log Loss: 0.05428907765810611
Accuracy: 98.14


array([[151121,      1],
       [  2961,   5488]])

In [78]:
obscene_test = clf.predict_proba(X_test)

test_data["obscene"] = pd.Series(obscene_test[:,1])

In [79]:
# Predictions for threat using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
threat_labels = list(data["threat"])
clf.fit(X, threat_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(threat_labels, ans_probs)))
accuracy = round(accuracy_score(threat_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(threat_labels, predictions)

Log Loss: 0.0030608096432204763
Accuracy: 99.9


array([[159090,      3],
       [   154,    324]])

In [80]:
threat_test = clf.predict_proba(X_test)
test_data["threat"] = pd.Series(threat_test[:,1])

In [81]:
# Predictions for insult using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
insult_labels = list(data["insult"])
clf.fit(X, insult_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(insult_labels, ans_probs)))
accuracy = round(accuracy_score(insult_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(insult_labels, predictions)

Log Loss: 0.060321716314508875
Accuracy: 97.75


array([[151691,      3],
       [  3590,   4287]])

In [82]:
insult_test = clf.predict_proba(X_test)
test_data["insult"] = pd.Series(insult_test[:,1])

In [83]:
# Predictions for identity_hate using Random Forest
clf = RandomForestClassifier(max_depth=150, n_estimators=100)
identity_hate_labels = list(data["identity_hate"])
clf.fit(X, identity_hate_labels)
ans_probs = clf.predict_proba(X)
predictions = clf.predict(X)
print("Log Loss: "+str(log_loss(identity_hate_labels, ans_probs)))
accuracy = round(accuracy_score(identity_hate_labels, predictions) *100,2)
print("Accuracy: "+str(accuracy))
confusion_matrix(identity_hate_labels, predictions)

Log Loss: 0.01258264766398848
Accuracy: 99.4


array([[158165,      1],
       [   952,    453]])

In [84]:
identity_hate_test = clf.predict_proba(X_test)
test_data["identity_hate"] = pd.Series(identity_hate_test[:,1])

In [85]:
del test_data["comment_text"]

In [86]:
test_data.to_csv("submission_RF.csv", index=False) #0.9640 private  0.9658 public