In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.cross_validation import train_test_split



In [2]:
import re
from unidecode import unidecode

In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
df = pd.read_csv('toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [5]:
for i in range(df.shape[0]):
    df.iloc[i,0] = textcleaning(df.iloc[i,0])

In [10]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(df.iloc[:,0])
word_features = word_vectorizer.transform(df.iloc[:,0])

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(df.iloc[:,0])
char_features = char_vectorizer.transform(df.iloc[:,0])

features = hstack([char_features, word_features])

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(features, df, test_size = 0.2)

In [14]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

models = []
predicted = pd.DataFrame.from_dict({'id': np.arange(len(test_Y))})
for class_name in class_names:
    print(class_name)
    train_target = train_Y[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag')
    classifier.fit(train_X, train_target)
    predicted[class_name] = classifier.predict_proba(test_X)[:, 1]
    models.append(classifier)

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [21]:
import pickle
with open('logistics.pkl','wb') as fopen:
    pickle.dump(models, fopen)

In [18]:
from sklearn import metrics
print(metrics.classification_report(test_Y.iloc[:,1:],np.around(predicted.iloc[:,1:].values)))

             precision    recall  f1-score   support

          0       0.98      0.27      0.43       805
          1       0.50      0.02      0.04        88
          2       0.99      0.30      0.46       460
          3       0.00      0.00      0.00        32
          4       0.87      0.22      0.35       420
          5       0.00      0.00      0.00        68

avg / total       0.88      0.24      0.38      1873



  'precision', 'predicted', average, warn_for)


In [22]:
from sklearn.naive_bayes import MultinomialNB
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

models = []
predicted = pd.DataFrame.from_dict({'id': np.arange(len(test_Y))})
for class_name in class_names:
    print(class_name)
    train_target = train_Y[class_name]
    classifier = MultinomialNB()
    classifier.fit(train_X, train_target)
    predicted[class_name] = classifier.predict_proba(test_X)[:, 1]
    models.append(classifier)
    
print(metrics.classification_report(test_Y.iloc[:,1:],np.around(predicted.iloc[:,1:].values)))

toxic
severe_toxic
obscene
threat
insult
identity_hate
             precision    recall  f1-score   support

          0       0.81      0.52      0.63       805
          1       0.44      0.35      0.39        88
          2       0.76      0.49      0.59       460
          3       0.00      0.00      0.00        32
          4       0.68      0.47      0.56       420
          5       0.15      0.09      0.11        68

avg / total       0.71      0.47      0.56      1873



In [23]:
with open('multinomials.pkl','wb') as fopen:
    pickle.dump(models, fopen)

In [19]:
with open('vectorizer.pkl','wb') as fopen:
    pickle.dump({'word':word_vectorizer,'char':char_vectorizer}, fopen)