# Text classification tools

Links:

* [https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

* [https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/](https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/)

* [https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)


In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# Widen pandas' text output to 150 characters so wide frames are wrapped less often.
pd.options.display.width = 150


In [2]:
# Load the labelled training comments; the first CSV column is used as the index.
train = pd.read_csv("~/Downloads/toxic-comment/train.csv", index_col=0)
# test = pd.read_csv("~/Downloads/toxic-comment/test.csv", index_col=0)

# The first remaining column is the input fed to the vectoriser below;
# every column after it is treated as a target label.
X = train.iloc[:, 0]
Y = train.iloc[:, 1:]
print(Y.shape)


(95851, 6)


In [3]:
# Stage 1: raw token counts per document.
# NOTE(review): both transformers are fitted on all of X before the
# train/test split in the next cell, so held-out rows influence the
# vocabulary and IDF weights — confirm whether that is acceptable here.
count_vec = CountVectorizer()
X_counts = count_vec.fit_transform(X)

# Stage 2: re-weight the counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

X_tfidf.shape


(95851, 139175)

In [4]:
# Hold out part of the data for evaluation (train_test_split's default
# proportions are used). A fixed random_state makes the shuffle — and
# therefore every metric computed further down — reproducible across
# "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, Y, random_state=42)

# Quick sanity check of the feature matrix shape and the label layout.
print(X_train.shape, "\n")
print(y_train.head())


(71888, 139175) 

              toxic  severe_toxic  obscene  threat  insult  identity_hate
id                                                                       
614788949351      0             0        0       0       0              0
2783896542        0             0        0       0       0              0
893962102529      0             0        0       0       0              0
748513191481      0             0        0       0       0              0
709529004803      0             0        0       0       0              0


In [5]:
# Gaussian naive Bayes is the base estimator wrapped by the
# binary relevance multi-label classifier.
base_model = GaussianNB()
classifier = BinaryRelevance(base_model)

# Fit on the first 1000 training rows only.
# NOTE(review): presumably limited because the wrapper works on dense
# inputs (the displayed repr reports require_dense=[True, True]) — confirm.
classifier.fit(X_train[:1000], y_train[:1000])


BinaryRelevance(classifier=GaussianNB(priors=None),
        require_dense=[True, True])

In [6]:
# Predict labels for the first 1000 held-out rows.
X_eval = X_test[:1000]
predictions = classifier.predict(X_eval)

# NOTE(review): predictions is converted with .todense() in a later cell, so
# it is presumably a sparse matrix; if so, .size is the number of stored
# entries rather than rows * labels — confirm this is the intended figure.
print(predictions.size)


126


In [7]:
# Accuracy on the same 1000 held-out rows that were predicted above.
# With multi-label targets scikit-learn documents accuracy_score as subset
# accuracy: a row only counts as correct when all of its labels match.
accuracy_score(y_true=y_test[:1000], y_pred=predictions)


0.86299999999999999

In [8]:
# Label-wise accuracy: the share of individual (row, label) cells that were
# predicted correctly over the first 1000 held-out rows.
# Both sides are converted to plain ndarrays before comparing. Feeding a
# DataFrame and an np.matrix (what .todense() returns) through
# np.equal / np.mean can yield a per-column Series instead of one number,
# depending on the installed pandas / numpy versions.
y_true_arr = np.asarray(y_test[:1000])
y_pred_arr = predictions.toarray()
(y_true_arr == y_pred_arr).mean()

0.95833333333333337