In [1]:
import arff
import numpy as np
from operator import attrgetter, itemgetter
from io import StringIO
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support as score, average_precision_score
from sklearn import metrics
## training model
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# Importing dataset

In [2]:
data = arff.load(open('../OffComBR3.arff'))
df = pd.DataFrame(data['data'])
df.columns = ['hate', 'sentence']

# transforming 'yes' into 1 and 'no' into 0
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

In [3]:
cl =  Pipeline([
        ('tfidf',TfidfVectorizer()),
        ('clf', SGDClassifier()),
        ])

In [4]:
cl.fit(X_train, y_train)
pred = cl.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88       267
           1       0.55      0.23      0.32        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.68      0.59      0.60       341
weighted avg       0.76      0.79      0.76       341





In [5]:
nltk.download('stopwords')
nltk.download('rslp')
stopwords = nltk.corpus.stopwords.words('portuguese')
stemmer = nltk.stem.RSLPStemmer()
nltk.download('punkt')


def clean_text(txt):
    text = ''
    for w in nltk.word_tokenize(txt):
        if w not in stopwords:
            text = text + stemmer.stem(w) + ' '
    return text.strip()

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/marco/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/marco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
## Cleaning text before

for i in range(len(X_train)):
    n_txt = clean_text(X_train[i])
    X_train[i] = n_txt
    
for i in range(len(X_test)):
    n_txt = clean_text(X_test[i])
    X_test[i] = n_txt

In [7]:
cl.fit(X_train, y_train)
pred = cl.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88       267
           1       0.58      0.35      0.44        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.71      0.64      0.66       341
weighted avg       0.78      0.80      0.78       341



## Grid tests with dynamic parameters

### Tests with accuracy as scoring paramether

In [16]:
## Teste com accuracy

parameters = {'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
              'clf__alpha': (1, 1e-2),
              'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
              'clf__penalty': ('none', 'l2', 'l1','elasticnet')
             }

gs_clf = GridSearchCV(cl, parameters, cv=5, iid=False, n_jobs=-1, scoring='accuracy')
# iidd: "If True, return the average score across folds, weighted by the number of samples"
# cv: Number of folds
# n_jobs: -1 means using all processors
# scoring: 'balanced accuracy is the average of recall obtained on each class.'

gs_clf.fit(X_train, y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.8468221043603075
{'clf__alpha': 0.01, 'clf__loss': 'squared_hinge', 'clf__penalty': 'none', 'tfidf__ngram_range': (1, 4)}




In [21]:
cl_enhanced =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,4))),
        ('clf', SGDClassifier(alpha=0.01, loss='squared_hinge', penalty='none')),
        ])
cl_enhanced.fit(X_train, y_train)
pred = cl_enhanced.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.89       267
           1       0.67      0.19      0.29        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.74      0.58      0.59       341
weighted avg       0.78      0.80      0.76       341



### Tests with f1_macro as scoring paramether

In [14]:
parameters = {'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
              'clf__alpha': (1, 1e-2),
              'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
              'clf__penalty': ('none', 'l2', 'l1','elasticnet')
             }

gs_clf = GridSearchCV(cl, parameters, cv=5, iid=False, n_jobs=-1, scoring='f1_macro')

gs_clf.fit(X_train, y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.6849917499152081
{'clf__alpha': 0.01, 'clf__loss': 'modified_huber', 'clf__penalty': 'none', 'tfidf__ngram_range': (1, 1)}




In [22]:
cl_enhanced =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,1))),
        ('clf', SGDClassifier(alpha=0.01, loss='modified_huber', penalty='none')),
        ])
cl_enhanced.fit(X_train, y_train)
pred = cl_enhanced.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.86       267
           1       0.48      0.31      0.38        74

   micro avg       0.78      0.78      0.78       341
   macro avg       0.65      0.61      0.62       341
weighted avg       0.75      0.78      0.76       341



### Tests with precision as scoring paramether

In [17]:
parameters = {'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
              'clf__alpha': (1, 1e-2),
              'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
              'clf__penalty': ('none', 'l2', 'l1','elasticnet')
             }

gs_clf = GridSearchCV(cl, parameters, cv=5, iid=False, n_jobs=-1, scoring='precision')

gs_clf.fit(X_train, y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

1.0
{'clf__alpha': 0.01, 'clf__loss': 'modified_huber', 'clf__penalty': 'l2', 'tfidf__ngram_range': (1, 1)}




In [23]:
cl_enhanced =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,1))),
        ('clf', SGDClassifier(alpha=0.01, loss='modified_huber', penalty='l2')),
        ])
cl_enhanced.fit(X_train, y_train)
pred = cl_enhanced.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       267
           1       0.75      0.04      0.08        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.77      0.52      0.48       341
weighted avg       0.78      0.79      0.71       341



### Tests with recall as scoring paramether

In [18]:
parameters = {'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
              'clf__alpha': (1, 1e-2),
              'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
              'clf__penalty': ('none', 'l2', 'l1','elasticnet')
             }

gs_clf = GridSearchCV(cl, parameters, cv=5, iid=False, n_jobs=-1, scoring='recall')

gs_clf.fit(X_train, y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.5393846153846155
{'clf__alpha': 0.01, 'clf__loss': 'perceptron', 'clf__penalty': 'none', 'tfidf__ngram_range': (1, 3)}




In [24]:
cl_enhanced =  Pipeline([
        ('tfidf',TfidfVectorizer(ngram_range=(1,3))),
        ('clf', SGDClassifier(alpha=0.01, loss='perceptron', penalty='none')),
        ])
cl_enhanced.fit(X_train, y_train)
pred = cl_enhanced.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.82      0.84       267
           1       0.43      0.50      0.46        74

   micro avg       0.75      0.75      0.75       341
   macro avg       0.64      0.66      0.65       341
weighted avg       0.76      0.75      0.75       341

