In [9]:
import arff
import numpy as np
from operator import attrgetter, itemgetter
from io import StringIO
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support as score, average_precision_score

## training model
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# Importing dataset

In [3]:
data = arff.load(open('../OffComBR3.arff'))
df = pd.DataFrame(data['data'])
df.columns = ['hate', 'sentence']

# transforming 'yes' into 1 and 'no' into 0
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

In [26]:
cl =  Pipeline([
        ('tfidf',TfidfVectorizer()),
        ('clf', SGDClassifier()),
        ])

In [27]:
cl.fit(X_train, y_train)
pred = cl.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89       267
           1       0.62      0.28      0.39        74

   micro avg       0.81      0.81      0.81       341
   macro avg       0.72      0.62      0.64       341
weighted avg       0.78      0.81      0.78       341



In [36]:
nltk.download('stopwords')
nltk.download('rslp')
stopwords = nltk.corpus.stopwords.words('portuguese')
stemmer = nltk.stem.RSLPStemmer()
nltk.download('punkt')


def clean_text(txt):
    text = ''
    for w in nltk.word_tokenize(txt):
        if w not in stopwords:
            text = text + stemmer.stem(w) + ' '
    return text.strip()

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/marco/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/marco/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [37]:
## Cleaning text before

for i in range(len(X_train)):
    n_txt = clean_text(X_train[i])
    X_train[i] = n_txt
    
for i in range(len(X_test)):
    n_txt = clean_text(X_test[i])
    X_test[i] = n_txt

In [42]:
cl.fit(X_train, y_train)
pred = cl.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89       267
           1       0.68      0.34      0.45        74

   micro avg       0.82      0.82      0.82       341
   macro avg       0.76      0.65      0.67       341
weighted avg       0.80      0.82      0.80       341



## Grid tests with dynamic parameters

In [80]:
parameters = {'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
              'clf__alpha': (1, 1e-2),
              'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron')
             }

gs_clf = GridSearchCV(cl, parameters, cv=5, iid=False, n_jobs=-1)
# iidd: "If True, return the average score across folds, weighted by the number of samples"
# cv: Number of folds
# n_jobs: -1 means using all processors

In [81]:
gs_clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 'clf__alpha': (1, 0.01), 'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [82]:
gs_clf.best_params_

{'clf__alpha': 1, 'clf__loss': 'perceptron', 'tfidf__ngram_range': (1, 3)}

In [83]:
gs_clf.best_score_

0.8353110085717527