In [36]:
import arff
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from operator import attrgetter, itemgetter
from io import StringIO
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, confusion_matrix,classification_report,precision_recall_fscore_support as score, average_precision_score
from sklearn import metrics
from sklearn.model_selection import cross_validate
import pickle

## training model
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [37]:
data = arff.load(open('../text-blob-pt/OffComBR3.arff'))
df = pd.DataFrame(data['data'])
df.columns = ['hate', 'sentence']

# transforming 'yes' into 1 and 'no' into 0
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

pt_stop_words = nltk.corpus.stopwords.words('portuguese')

classifiers = []
max_df = 0.6

In [38]:
RandomForest =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,4),
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=max_df
                                )),
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0,)),
        ])
RandomForest.fit(X_train, y_train)
pred = RandomForest.predict(X_test)
classifiers.append(('randomforest', RandomForest))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       267
           1       0.57      0.27      0.37        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.70      0.61      0.62       341
weighted avg       0.77      0.80      0.77       341



In [39]:
MLP = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),                                  
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=max_df)),
        ('clf', MLPClassifier(activation='logistic', alpha=0, solver='lbfgs')),
        ])
MLP.fit(X_train, y_train)
pred = MLP.predict(X_test)
classifiers.append(('mlp', MLP))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87       267
           1       0.52      0.46      0.49        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.68      0.67      0.68       341
weighted avg       0.78      0.79      0.78       341



In [40]:
SVCl = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),                        
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=max_df)),
        ('clf', SVC(C=4, kernel='linear', probability=True, shrinking=True, tol=1)),
        ])
SVCl.fit(X_train, y_train)
pred = SVCl.predict(X_test)
classifiers.append(('svc', SVCl))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       267
           1       0.62      0.34      0.44        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.73      0.64      0.66       341
weighted avg       0.79      0.81      0.79       341



In [17]:
multiNB =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1))),
        ('clf', MultinomialNB(alpha=0.1, fit_prior=False)),
        ])
multiNB.fit(X_train,print(classification_report(y_test, pred)) y_train)
pred = multiNB.predict(X_test)
classifiers.append(('multinb', multiNB))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       267
           1       0.57      0.42      0.48        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.71      0.67      0.68       341
weighted avg       0.79      0.81      0.79       341



In [71]:
voting = VotingClassifier(estimators=classifiers, 
                          voting='hard',
                          weights=[1, 1, 3], 
                          n_jobs=15)

In [65]:
voting.fit(X_train, y_train)
pred = voting.predict(X_test)
report = classification_report(y_test, pred, output_dict=True)

In [57]:
df_report = pd.DataFrame(report).transpose()

# HARD OR SOFT

In [42]:
# HARD, 0.6 max_df, peso 1 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       267
           1       0.62      0.41      0.49        74

   micro avg       0.82      0.82      0.82       341
   macro avg       0.74      0.67      0.69       341
weighted avg       0.80      0.82      0.80       341



In [46]:
# SOFT, 0.6 max_df, peso 1 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       267
           1       0.61      0.45      0.52        74

   micro avg       0.82      0.82      0.82       341
   macro avg       0.73      0.68      0.70       341
weighted avg       0.80      0.82      0.81       341



## WEIGHTS

In [50]:
# SOFT, 0.6 max_df, peso 1 2 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       267
           1       0.52      0.43      0.47        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.69      0.66      0.67       341
weighted avg       0.78      0.79      0.78       341



In [54]:
# SOFT, 0.6 max_df, peso 2 1 2
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       267
           1       0.62      0.35      0.45        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.73      0.65      0.67       341
weighted avg       0.79      0.81      0.79       341



In [58]:
# SOFT, 0.6 max_df, peso 1 3 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       267
           1       0.53      0.47      0.50        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.69      0.68      0.69       341
weighted avg       0.79      0.79      0.79       341



## Best result so far

In [59]:
# SOFT, 0.6 max_df, peso 1 3 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       267
           1       0.53      0.47      0.50        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.69      0.68      0.69       341
weighted avg       0.79      0.79      0.79       341



## Accuracy, recall, precision test
##### SOFT, 0.6 max_df, peso 1 3 1

In [66]:
scores = cross_val_score(voting, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='recall')
print("Recall: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='precision')
print("Precision: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))

Accuracy: 0.78 (+/- 0.05) [voting]
Recall: 0.47 (+/- 0.10) [voting]
Precision: 0.44 (+/- 0.09) [voting]


##### HARD, 0.6 max_df, peso 1 3 1

In [68]:
scores = cross_val_score(voting, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='recall')
print("Recall: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='precision')
print("Precision: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))

Accuracy: 0.75 (+/- 0.04) [voting]
Recall: 0.47 (+/- 0.09) [voting]
Precision: 0.44 (+/- 0.09) [voting]


##### SOFT, 0.6 max_df, peso 1 1 3

In [70]:
scores = cross_val_score(voting, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='recall')
print("Recall: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='precision')
print("Precision: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))

Accuracy: 0.81 (+/- 0.04) [voting]
Recall: 0.35 (+/- 0.08) [voting]
Precision: 0.54 (+/- 0.15) [voting]


##### HARD, 0.6 max_df, peso 1 1 3

In [72]:
scores = cross_val_score(voting, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='recall')
print("Recall: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))
scores = cross_val_score(voting, X, y, cv=5, scoring='precision')
print("Precision: %0.2f (+/- %0.2f) [%s]" 
        % (scores.mean(), scores.std(), 'voting'))

Accuracy: 0.80 (+/- 0.03) [voting]
Recall: 0.38 (+/- 0.08) [voting]
Precision: 0.49 (+/- 0.09) [voting]


# Replacing RF with MultiNB


In [34]:
voting = VotingClassifier(estimators=classifiers, 
                          voting='soft',
                          weights=[1, 1, 3], 
                          n_jobs=15)
voting.fit(X_train, y_train)
pred = voting.predict(X_test)
report = classification_report(y_test, pred, output_dict=True)

# HARD OR SOFT

In [23]:
# HARD, 0.6 max_df, peso 1 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       267
           1       0.62      0.39      0.48        74

   micro avg       0.82      0.82      0.82       341
   macro avg       0.73      0.66      0.68       341
weighted avg       0.80      0.82      0.80       341



In [26]:
# SOFT, 0.6 max_df, peso 1 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88       267
           1       0.55      0.43      0.48        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.70      0.67      0.68       341
weighted avg       0.79      0.80      0.79       341



## WEIGHTS

In [29]:
# SOFT, 0.6 max_df, peso 2 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88       267
           1       0.57      0.45      0.50        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.71      0.68      0.69       341
weighted avg       0.79      0.81      0.80       341



In [31]:
# SOFT, 0.6 max_df, peso 1 2 2
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       267
           1       0.60      0.39      0.48        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.73      0.66      0.68       341
weighted avg       0.79      0.81      0.80       341



In [33]:
# SOFT, 0.6 max_df, peso 3 1 1
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       267
           1       0.53      0.46      0.49        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.69      0.67      0.68       341
weighted avg       0.79      0.79      0.79       341



In [35]:
# SOFT, 0.6 max_df, peso 1 1 3
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       267
           1       0.59      0.43      0.50        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.72      0.68      0.69       341
weighted avg       0.80      0.81      0.80       341

