In [46]:
import arff
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from operator import attrgetter, itemgetter
from io import StringIO
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, confusion_matrix,classification_report,precision_recall_fscore_support as score, average_precision_score
from sklearn import metrics
from sklearn.model_selection import cross_validate
import pickle

## training model
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [165]:
data = arff.load(open('../text-blob-pt/OffComBR3.arff'))
df = pd.DataFrame(data['data'])
df.columns = ['hate', 'sentence']

# transforming 'yes' into 1 and 'no' into 0
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

pt_stop_words = nltk.corpus.stopwords.words('portuguese')

classifiers = []

In [166]:
RandomForest =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,4),
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=0.1
                                )),
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0,)),
        ])

In [167]:
RandomForest.fit(X_train, y_train)
pred = RandomForest.predict(X_test)
classifiers.append(('randomforest', RandomForest))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       267
           1       0.39      0.35      0.37        74

   micro avg       0.74      0.74      0.74       341
   macro avg       0.61      0.60      0.60       341
weighted avg       0.73      0.74      0.73       341



In [168]:
MLP = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),                                  
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=0.1)),
        ('clf', MLPClassifier(activation='logistic', alpha=0, solver='lbfgs')),
        ])

In [169]:
MLP.fit(X_train, y_train)
pred = MLP.predict(X_test)
classifiers.append(('mlp', MLP))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       267
           1       0.50      0.50      0.50        74

   micro avg       0.78      0.78      0.78       341
   macro avg       0.68      0.68      0.68       341
weighted avg       0.78      0.78      0.78       341



In [170]:
SVCl = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),                        
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=0.1)),
        ('clf', SVC(C=4, kernel='linear', probability=True, shrinking=True, tol=1)),
        ])

In [171]:
SVCl.fit(X_train, y_train)
pred = SVCl.predict(X_test)
classifiers.append(('svc', SVCl))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       267
           1       0.53      0.34      0.41        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.68      0.63      0.64       341
weighted avg       0.77      0.79      0.77       341



In [172]:
SGD = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=0.1)),
        ('clf', SGDClassifier(alpha=0.01, loss='perceptron', penalty='none')),
        ])

In [173]:
SGD.fit(X_train, y_train)
pred = SGD.predict(X_test)
classifiers.append(('sgd', SGD))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       267
           1       0.37      0.32      0.35        74

   micro avg       0.73      0.73      0.73       341
   macro avg       0.59      0.59      0.59       341
weighted avg       0.72      0.73      0.73       341





In [174]:
MultiNB = Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1, 1),                                  
                                 lowercase=True,
                                 strip_accents='ascii',
                                 max_df=0.1)),
        ('clf', MultinomialNB(alpha=0.1, fit_prior=False)),
        ])

In [175]:
MultiNB.fit(X_train, y_train)
pred = MultiNB.predict(X_test)
classifiers.append(('multiNB', MultiNB))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       267
           1       0.56      0.43      0.49        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.71      0.67      0.68       341
weighted avg       0.79      0.80      0.79       341



In [176]:
voting = VotingClassifier(estimators=classifiers[0:3], 
                          voting='soft',
                          weights=[1, 2, 1], 
                          n_jobs=15)

In [177]:
voting.fit(X_train, y_train)
pred = voting.predict(X_test)
report = classification_report(y_test, pred, output_dict=True)

In [178]:
df_report = pd.DataFrame(report).transpose()

In [164]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       267
           1       0.59      0.50      0.54        74

   micro avg       0.82      0.82      0.82       341
   macro avg       0.73      0.70      0.71       341
weighted avg       0.81      0.82      0.81       341



In [179]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       267
           1       0.49      0.47      0.48        74

   micro avg       0.78      0.78      0.78       341
   macro avg       0.67      0.67      0.67       341
weighted avg       0.78      0.78      0.78       341



In [63]:
# df_report.to_csv('./results/csv/hardsoft/1mnb-1mlp-1rf.csv')
# df_report.to_pickle('./results/sav/hardsoft/1mnb-1mlp-1rf.sav')

In [63]:
classifiers[4]

('multiNB', Pipeline(memory=None,
      steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.5, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ue,
         vocabulary=None)), ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=False))]))

In [101]:
classifiers[6]

('svc', Pipeline(memory=None,
      steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.5, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...linear', max_iter=-1, probability=True, random_state=None,
   shrinking=True, tol=1, verbose=False))]))