In [63]:
import os
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [64]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../../train.csv/train.csv')
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [65]:
# Translation table that deletes ASCII punctuation. Built once here instead
# of on every call to clean().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean(line):
    """Turn one raw comment into a list of lower-case word tokens.

    The text is stripped, lower-cased, has newlines replaced by spaces, and is
    split on runs of non-word characters. ASCII punctuation is then deleted
    from each piece and empty pieces are dropped.

    Parameters
    ----------
    line : str
        Raw comment text. Any non-string value (for example the float NaN that
        pandas produces for a missing CSV cell) is treated as an empty comment.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order. May be empty.
    """
    if not isinstance(line, str):
        # Missing values would otherwise fail on .strip().
        return []
    text = line.strip().lower().replace('\n', ' ')
    # After the \W+ split the only ASCII punctuation that can remain inside a
    # piece is the underscore (it counts as a word character), so the
    # translation step effectively removes underscores.
    stripped = (piece.translate(_PUNCTUATION_TABLE) for piece in re.split(r'\W+', text))
    return [token for token in stripped if token]

In [66]:
# Dimensions of the training frame as (rows, columns).
data.shape

(159571, 8)

In [67]:
# Tokenise every comment with clean(); each element of X_data is one token list.
X_data = data['comment_text'].apply(clean).values

In [68]:
# Every column except the raw text and the row id is a label column.
# Derive that column index once and reuse it to build the target matrix.
Y_data_columns = data.columns.drop(['comment_text', 'id'])
Y_data = data[Y_data_columns].values

In [69]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
trainX, testX, trainY, testY = train_test_split(X_data, Y_data, test_size=0.2, random_state=0)

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
# The inputs are already token lists (see clean), so the tokenizer and the
# preprocessor are identity functions that pass each list through unchanged.
vectorizer1 = CountVectorizer(tokenizer=lambda x:x, preprocessor=lambda x:x)
# Learn the vocabulary from the training rows only...
vectorized1_train_x = vectorizer1.fit_transform(trainX)
# ...and reuse that same vocabulary for the held-out rows.
vectorized1_test_x = vectorizer1.transform(testX)

In [71]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
class NaiveBayer(object):
    """Multi-label classifier made of one independent MultinomialNB per label.

    Column ``i`` of every target / prediction matrix handled by this class
    belongs to ``classes[i]``. Label values are expected to be binary 0/1
    indicators, with 1 meaning the label applies.
    """

    def __init__(self, classes):
        """Create one unfitted MultinomialNB for each entry of ``classes``.

        classes: ordered collection of label names; its order fixes the column
            order used by fit, predict and predict_prob.
        """
        self.models = {}
        self.classes = classes
        for cls in self.classes:
            model = MultinomialNB()
            self.models[cls] = model

    def fit(self, train_x, train_y):
        """Fit every per-label model on the same feature matrix.

        train_x: feature matrix accepted by ``MultinomialNB.fit``.
        train_y: 2-D array-like with one column per entry of ``classes``.

        Prints the class collection and each (index, name) pair as it goes.
        """
        # Accept lists / DataFrames as well as ndarrays for column slicing.
        train_y = np.asarray(train_y)
        print(self.classes)
        for idx, cls in enumerate(self.classes):
            print(idx, cls)
            class_labels = train_y[:, idx]
            self.models[cls].fit(train_x, class_labels)

    def fit_and_validate(self, train_x, train_y, validate_x, validate_y):
        """Fit on the training data and predict on the validation features.

        ``validate_y`` is accepted but not used. Returns a 2-tuple of
        (predictions for ``validate_x``, None).
        """
        self.fit(train_x, train_y)
        return self.predict(validate_x), None

    def predict(self, test_x):
        """Return a float array of shape (n_rows, n_classes) of predicted labels."""
        predictions = np.zeros((test_x.shape[0], len(self.classes)))
        for idx, cls in enumerate(self.classes):
            predictions[:, idx] = self.models[cls].predict(test_x)
        return predictions

    def predict_prob(self, test_x):
        """Return the probability of label value 1 for every row and class.

        The result has shape (n_rows, n_classes). If a model never saw the
        value 1 during training, its column is left at 0.0.
        """
        probs = np.zeros((test_x.shape[0], len(self.classes)))
        for idx, cls in enumerate(self.classes):
            model = self.models[cls]
            class_probs = model.predict_proba(test_x)
            # Look the positive class up in classes_ rather than assuming it
            # is column 1: a model trained on a single label value exposes
            # only one probability column.
            positive = np.flatnonzero(np.asarray(model.classes_) == 1)
            if positive.size:
                probs[:, idx] = class_probs[:, positive[0]]
        return probs

In [72]:
# One model per label column, in the order given by Y_data_columns.
classifier = NaiveBayer(Y_data_columns)

In [73]:
# Train on the count features; fit() prints each label as it is processed.
classifier.fit(vectorized1_train_x, trainY)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')
0 toxic
1 severe_toxic
2 obscene
3 threat
4 insult
5 identity_hate


In [74]:
# Inspect the container type returned by the vectorizer.
type(vectorized1_train_x)

scipy.sparse.csr.csr_matrix

In [75]:
# Predicted labels for the held-out rows, one column per class.
predictions = classifier.predict(vectorized1_test_x)

In [76]:
# NOTE(review): both arguments are 2-D multi-label matrices; scikit-learn
# documents accuracy_score as subset accuracy in that case (a row counts only
# when every label matches) -- confirm this is the intended metric.
accuracy = accuracy_score(testY, predictions)

In [77]:
# Display the score for the count-based model.
accuracy

0.900736330878897

In [78]:
# Per-label precision / recall / F1. zero_division=1 is the value reported
# where a metric's denominator is zero.
cls_report = classification_report(testY, predictions, zero_division=1)

In [79]:
# print() so the newlines embedded in the report render as a table.
print(cls_report)

              precision    recall  f1-score   support

           0       0.78      0.63      0.69      3101
           1       0.40      0.42      0.41       329
           2       0.72      0.63      0.67      1698
           3       0.15      0.04      0.07        91
           4       0.67      0.57      0.61      1594
           5       0.33      0.13      0.19       298

   micro avg       0.70      0.58      0.63      7111
   macro avg       0.51      0.40      0.44      7111
weighted avg       0.69      0.58      0.63      7111
 samples avg       0.97      0.95      0.93      7111



In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configured with identity tokenizer / preprocessor, so each
# input is passed through unchanged (the inputs are expected to be token lists).
vectorizer2 = TfidfVectorizer(
    #ngram_range = (1,3),
    # use_idf / smooth_idf are boolean options. Pass real booleans: recent
    # scikit-learn releases validate parameter types and deprecate or reject
    # the integer 1 here.
    use_idf=True,
    smooth_idf=True,
    #stop_words = 'english',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
)

In [81]:
# Fit the TF-IDF vocabulary and weights on the training rows only...
vectorized2_train_x = vectorizer2.fit_transform(trainX)
# ...then apply the already-fitted transform to the held-out rows.
vectorized2_test_x = vectorizer2.transform(testX)

In [82]:
# Fresh set of per-label models for the TF-IDF features.
classifier2 = NaiveBayer(Y_data_columns)

In [83]:
# Train on the TF-IDF features against the same targets as the count model.
classifier2.fit(vectorized2_train_x, trainY)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')
0 toxic
1 severe_toxic
2 obscene
3 threat
4 insult
5 identity_hate


In [84]:
# Predicted labels for the held-out rows from the TF-IDF model.
predictions2 = classifier2.predict(vectorized2_test_x)

In [85]:
# Same metric call as for the count-based model, for a like-for-like comparison.
accuracy2 = accuracy_score(testY, predictions2)

In [86]:
# Display the score for the TF-IDF model.
accuracy2

0.8971956760144133

In [87]:
# Per-label report for the TF-IDF model, with the same zero_division setting.
cls_report2 = classification_report(testY, predictions2, zero_division=1)

In [88]:
# print() so the newlines embedded in the report render as a table.
print(cls_report2)

              precision    recall  f1-score   support

           0       0.99      0.14      0.25      3101
           1       0.00      0.00      0.00       329
           2       0.99      0.08      0.15      1698
           3       1.00      0.01      0.02        91
           4       0.88      0.03      0.05      1594
           5       0.00      0.00      0.00       298

   micro avg       0.98      0.09      0.16      7111
   macro avg       0.64      0.04      0.08      7111
weighted avg       0.88      0.09      0.16      7111
 samples avg       1.00      0.90      0.90      7111



In [89]:
# Load the comments to score; the path is relative to the notebook's working directory.
predictData = pd.read_csv("../../test.csv/test.csv")

In [90]:
# Preview the first rows to check the column layout.
predictData.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [91]:
# Apply the same clean() tokenisation that was used on the training comments.
predictX = predictData['comment_text'].apply(clean).values

In [99]:
 def save_result(output_path, test_ids, probs):
     """Write per-label probabilities to a CSV file, with the row id first.

     output_path: destination file path; an existing file is overwritten.
     test_ids: sequence of row identifiers, one per row of ``probs``.
     probs: array exposing ``tolist()`` whose rows hold one value per label,
         in the column order of the header written below.

     Raises ValueError if ``test_ids`` and ``probs`` differ in length, since
     zipping them would otherwise silently drop the surplus rows.
     """
     # Imported locally so the function works regardless of whether some
     # other cell has already executed ``import csv``.
     import csv

     if len(test_ids) != len(probs):
         raise ValueError(
             "test_ids and probs must have the same length, got %d and %d"
             % (len(test_ids), len(probs))
         )
     header = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']
     # newline='' leaves line endings to the csv writer; without it each row
     # is followed by an empty line on Windows.
     with open(output_path, 'w', newline='') as output_csv_file:
         writer = csv.writer(output_csv_file)
         writer.writerow(header)
         for test_id, prob in zip(test_ids, probs.tolist()):
             writer.writerow([test_id] + prob)

In [100]:
# Row identifiers, in file order, to pair with the predicted probabilities.
test_ids = predictData.id.values

In [101]:
# Inspect the container type of the ids.
type(test_ids)

numpy.ndarray

In [102]:
# transform (not fit_transform): reuse the vocabulary already fitted on the training rows.
vectorized1_predict_x = vectorizer1.transform(predictX)
# Per-label probabilities from the count-based model.
probs = classifier.predict_prob(vectorized1_predict_x)

In [104]:
import csv
# Write the ids and their per-label probabilities to a CSV in the working directory.
save_result("./output.csv", test_ids, probs)

In [111]:
# Read the file back to check what was written.
output = pd.read_csv('./output.csv')

In [112]:
# Preview the first rows of the written file.
output.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,7.405289e-17,1.0,6.6233880000000005e-43,1.0,1.016841e-16
1,0000247867823ef7,4.360101e-05,6.542264e-12,8.086729e-07,3.013321e-16,3.592789e-07,4.062722e-11
2,00013b17ad220c46,0.06652962,0.001513413,0.05113488,0.0001065353,0.05097021,0.005731068
3,00017563c3f7919a,9.859798e-11,4.2195469999999995e-34,2.602332e-15,8.779592e-47,2.458794e-16,4.212849e-35
4,00017695ad8997eb,0.005006149,1.687772e-07,0.0005941361,3.538871e-10,0.0003362592,3.428668e-08
