In [25]:
import arff
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from operator import attrgetter, itemgetter
from io import StringIO
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, confusion_matrix,classification_report,precision_recall_fscore_support as score, average_precision_score
from sklearn import metrics
from sklearn.model_selection import cross_validate
import pickle

## training model
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [57]:
# Load the OffComBR3 corpus. The context manager closes the file handle as
# soon as parsing is done instead of leaking it for the lifetime of the kernel.
# NOTE(review): the file is opened with the platform default encoding, as
# before -- confirm the corpus encoding if accented characters look garbled.
with open('../OffComBR3.arff') as arff_file:
    data = arff.load(arff_file)

# Each ARFF row is a (label, text) pair.
df = pd.DataFrame(data['data'], columns=['hate', 'sentence'])

# transforming 'yes' into 1 and 'no' into 0 (any value other than 'yes' maps to 0)
df['hate'] = (df['hate'] == 'yes').astype(int)

X = df['sentence'].tolist()
y = df['hate'].tolist()

# Fixed seed so the 67/33 hold-out split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Portuguese stop-word list from NLTK (the stopwords corpus must already be
# available locally).
pt_stop_words = nltk.corpus.stopwords.words('portuguese')

# Registry of (name, fitted pipeline) tuples filled in by the model cells below.
classifiers = []

# Document-frequency ceiling passed to the TfidfVectorizer steps that use it.
max_df = 0.6

In [58]:
# TF-IDF over word 1- to 4-grams feeding a 100-tree random forest.
RandomForest = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 4),
                              lowercase=True,
                              strip_accents='ascii',
                              max_df=max_df)),
    ('clf', RandomForestClassifier(n_estimators=100,
                                   max_depth=None,
                                   min_samples_leaf=1,
                                   min_samples_split=2,
                                   min_weight_fraction_leaf=0.0,
                                   # Seed the bootstrap sampling / feature
                                   # selection so reruns give the same forest
                                   # (same seed value as the train/test split).
                                   random_state=42)),
])
RandomForest.fit(X_train, y_train)
classifiers.append(('randomforest', RandomForest))

In [59]:
# Unigram TF-IDF feeding a multi-layer perceptron (logistic activation,
# L-BFGS solver, no L2 regularisation).
MLP = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1),
                              lowercase=True,
                              strip_accents='ascii',
                              max_df=max_df)),
    ('clf', MLPClassifier(activation='logistic',
                          alpha=0,
                          solver='lbfgs',
                          # Seed the random weight initialisation so the
                          # trained network is the same on every run.
                          random_state=42)),
])
MLP.fit(X_train, y_train)
classifiers.append(('mlp', MLP))

In [60]:
# Unigram TF-IDF feeding a linear-kernel support vector classifier.
SVCl = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1),
                              lowercase=True,
                              strip_accents='ascii',
                              max_df=max_df)),
    ('clf', SVC(C=4,
                kernel='linear',
                probability=True,
                shrinking=True,
                tol=1,
                # probability=True calibrates via an internal shuffled
                # cross-validation; seed it so predict_proba is reproducible.
                random_state=42)),
])
SVCl.fit(X_train, y_train)
classifiers.append(('svc', SVCl))

In [61]:
# Multinomial naive Bayes on unigram TF-IDF features; class priors are not
# learned from the data (fit_prior=False).
multiNB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),
    ('clf', MultinomialNB(fit_prior=False, alpha=0.1)),
])
# fit() returns the pipeline itself, so the fitted model is registered directly.
classifiers.append(('multinb', multiNB.fit(X_train, y_train)))

In [62]:
# Unigram TF-IDF feeding an unregularised perceptron trained with SGD.
SGD = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),
    ('clf', SGDClassifier(alpha=0.01,
                          loss='perceptron',
                          # The string 'none' is rejected by current
                          # scikit-learn parameter validation; None is the
                          # supported way to disable the penalty term.
                          penalty=None,
                          max_iter=1000,
                          tol=0.001,
                          # Seed the per-epoch shuffling so the fitted
                          # weights are the same on every run.
                          random_state=42)),
])
SGD.fit(X_train, y_train)
classifiers.append(('sgd', SGD))

In [63]:
# Unigram TF-IDF feeding a Gini decision tree; the positive class (1) is
# weighted twice as heavily as the negative class (0).
DecisionTree = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),
    ('clf', DecisionTreeClassifier(class_weight={0: 1, 1: 2},
                                   criterion='gini',
                                   min_samples_split=3,
                                   # Features are randomly permuted at each
                                   # split, so equally good splits can be
                                   # chosen differently between runs; seed it.
                                   random_state=42)),
])
DecisionTree.fit(X_train, y_train)
classifiers.append(('decisiontree', DecisionTree))

In [66]:
# Report mean and standard deviation of 5-fold cross-validated accuracy for
# every registered pipeline, evaluated on the full data set.
for classifier in classifiers:
    label, pipeline = classifier
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.79 (+/- 0.07) [randomforest]
Accuracy: 0.77 (+/- 0.05) [mlp]
Accuracy: 0.80 (+/- 0.03) [svc]
Accuracy: 0.78 (+/- 0.03) [multinb]
Accuracy: 0.76 (+/- 0.03) [sgd]
Accuracy: 0.71 (+/- 0.08) [decisiontree]


In [72]:
# Report mean and standard deviation of 5-fold cross-validated recall for
# every registered pipeline, evaluated on the full data set.
for classifier in classifiers:
    label, pipeline = classifier
    scores = cross_val_score(pipeline, X, y, scoring='recall', cv=5)
    print("Recall: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Recall: 0.23 (+/- 0.07) [randomforest]
Recall: 0.47 (+/- 0.11) [mlp]
Recall: 0.38 (+/- 0.08) [svc]
Recall: 0.47 (+/- 0.06) [multinb]
Recall: 0.40 (+/- 0.05) [sgd]
Recall: 0.47 (+/- 0.05) [decisiontree]


In [73]:
# Report mean and standard deviation of 5-fold cross-validated precision for
# every registered pipeline, evaluated on the full data set.
for classifier in classifiers:
    label, pipeline = classifier
    scores = cross_val_score(pipeline, X, y, scoring='precision', cv=5)
    print("Precision: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Precision: 0.59 (+/- 0.22) [randomforest]
Precision: 0.46 (+/- 0.11) [mlp]
Precision: 0.49 (+/- 0.09) [svc]
Precision: 0.46 (+/- 0.08) [multinb]
Precision: 0.36 (+/- 0.05) [sgd]
Precision: 0.34 (+/- 0.08) [decisiontree]
