# Text classification tools

Links:

* [https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

* [https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/](https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/)

* [https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)


## Initial configurations

In [36]:
import pandas as pd
import numpy as np

# metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# multilabel
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain

# classifiers
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


pd.set_option("display.width", 150)


## Preparing dataset

In [16]:
train = pd.read_csv("~/Downloads/toxic-comment/train.csv", index_col=0)
test = pd.read_csv("~/Downloads/toxic-comment/test.csv", index_col=0)

X, y = train.iloc[:, 0], train.iloc[:, 1:]
print("Train", X.shape)


Train (159571,)


In [17]:
vectorizer = TfidfVectorizer(stop_words="english")

X_tfidf = vectorizer.fit_transform(X)
print("Train", X_tfidf.shape)


Train (159571, 189460)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y)

print(X_train.shape, "\n")
print(y_train.head())


(119678, 189460) 

                  toxic  severe_toxic  obscene  threat  insult  identity_hate
id                                                                           
fd38c4eb084bd763      0             0        0       0       0              0
9a5a4eb29c89ad1d      0             0        0       0       0              0
539857e6234a0678      0             0        0       0       0              0
52573c72a52d15cf      0             0        0       0       0              0
6eb207a2f918269d      0             0        0       0       0              0


## Classification and tests

In [None]:
# initialize multi-label classifier

alg = SVC(probability=True)
# alg = GaussianNB()
# alg = MultinomialNB()
# alg = SGDClassifier(loss="log", max_iter=1000, tol=1e-3)

# classifier = BinaryRelevance(alg)
classifier = ClassifierChain(alg)

QTD_train = 2000
QTD_test = 1000

# train
classifier.fit(X_train[:QTD_train], y_train[:QTD_train])


In [30]:
predictions = classifier.predict(X_test[:QTD_test])
print(predictions.size)

58


In [31]:
print("Accuracy", accuracy_score(y_test[:QTD_test], predictions.toarray()))
print("Precision", precision_score(y_test[:QTD_test], predictions.toarray(), average="weighted"))
print("Recall", recall_score(y_test[:QTD_test], predictions.toarray(), average="weighted"))
print("Log loss", log_loss(y_test[:QTD_test], predictions.toarray()))


Accuracy 0.91
Precision 0.864478114478
Recall 0.242424242424
Log loss 0.589424088198


  'precision', 'predicted', average, warn_for)


In [32]:
np.mean(np.equal(y_test[:QTD_test], predictions.toarray()))


toxic            0.924
severe_toxic     0.992
obscene          0.965
threat           0.999
insult           0.967
identity_hate    0.993
dtype: float64

In [33]:
results = cross_val_score(classifier, X_tfidf[:QTD_train], y[:QTD_train], cv=KFold(n_splits=2, shuffle=True))
print("Baseline: %.3f%% (%.3f%%)" % (results.mean(), results.std()))


Baseline: 0.899% (0.004%)


## Submission

In [34]:
X_test_tfidf = vectorizer.transform(test.iloc[:, 0])
print("Test", X_test_tfidf.shape)


Test (153164, 189460)


ClassifierChain(classifier=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
        require_dense=[True, True])

In [35]:
subm = pd.DataFrame(columns=y_train.columns)
subm.index.names = ["id"]

for i in range(1000, test.shape[0]+999, 1000):
    tmp = X_test_tfidf[i-1000:min(i, test.shape[0])]
    predictions = classifier.predict_proba(tmp)
    print(i*100/test.shape[0], "%")

    tmp = pd.DataFrame(predictions.toarray(), columns=y_train.columns)
    tmp = tmp.set_index(test[i-1000:min(i, test.shape[0])].index)
    subm = subm.append(tmp)


AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [83]:
# tmp = X_tfidf[153000:]
# predictions = classifier.predict_proba(tmp)
# print(predictions.shape)

# tmp = pd.DataFrame(predictions.toarray(), columns=y_train.columns).set_index(X_test[153000:].index)
# subm = subm.append(tmp)

print(subm.shape)
subm.head()

(153164, 6)


Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.036834,0.001252,0.019624,0.000901,0.012901,0.005071
0000247867823ef7,0.10359,0.001278,0.062008,0.00092,0.024996,0.005153
00013b17ad220c46,0.067245,0.001296,0.03466,0.000934,0.020561,0.005224
00017563c3f7919a,0.037289,0.001146,0.036835,0.000887,0.008607,0.005713
00017695ad8997eb,0.099386,0.001374,0.074929,0.00088,0.046996,0.004783


In [84]:
subm.to_csv("subm.csv")