In [17]:
from textblob import TextBlob
import arff
from io import StringIO
from sklearn.model_selection import train_test_split
from textblob.classifiers import NaiveBayesClassifier, DecisionTreeClassifier, MaxEntClassifier
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [18]:
data = arff.load(open('OffComBR3.arff'))
df = pd.DataFrame(data['data'])
df.columns = ['hate', 'sentence']

In [19]:
df.head()

Unnamed: 0,hate,sentence
0,yes,Votaram no PEZAO Agora tomem no CZAO
1,no,cuidado com a poupanca pessoal Lembram o que a...
2,no,Sabe o que eu acho engracado os nossos governa...
3,no,Podiam retirar dos lucros dos bancos
4,no,CADE O GALVAO PRA NARRAR AGORA FALIIIIUUUUUU...


In [20]:
# transforming 'yes' into 1 and 'no' into 0
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

In [21]:
# X = df.drop(['hate'], axis=1)
# y = df['hate']
# X_train, X_test, y_train, y_test = train_test_split(
#                                     X, y, test_size=0.33, random_state=42)

In [22]:
nltk.download('stopwords')
nltk.download('rslp')
stopwords = nltk.corpus.stopwords.words('portuguese')
stemmer = nltk.stem.RSLPStemmer()


def clean_text(txt):
    text = ''
    for w in nltk.word_tokenize(txt):
        if w not in stopwords:
            text = text + stemmer.stem(w) + ' '
    return text.strip()

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/marco/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


 - Reference:  http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

## Separating randomly Traing and Test subsets

In [23]:
X = df['sentence'].tolist()
y = df['hate'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

## Creating the classifiers

In [24]:
## Classifier with MultinominalNaiveBayers

txt_cl = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])
txt_cl.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [25]:
## Classifier with GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier

txt_cl_gb = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',GradientBoostingClassifier()),
])
txt_cl_gb.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

### testing created classifiers with pure dataset

In [26]:
pred = txt_cl.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       267
           1       0.67      0.03      0.05        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.73      0.51      0.47       341
weighted avg       0.76      0.79      0.70       341



In [27]:
pred2 = txt_cl_gb.predict(X_test)
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       0.82      0.96      0.88       267
           1       0.62      0.22      0.32        74

   micro avg       0.80      0.80      0.80       341
   macro avg       0.72      0.59      0.60       341
weighted avg       0.77      0.80      0.76       341



### testing created classifiers with clean dataset

In [33]:
## Pre-processing dataset (cleaning X_train and X_test) 

for i in range(len(X_train)):
    n_txt = clean_text(X_train[i])
    X_train[i] = n_txt
    
for i in range(len(X_test)):
    n_txt = clean_text(X_test[i])
    X_test[i] = n_txt

In [34]:
pred3 = txt_cl.predict(X_test)
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       267
           1       1.00      0.01      0.03        74

   micro avg       0.79      0.79      0.79       341
   macro avg       0.89      0.51      0.45       341
weighted avg       0.83      0.79      0.69       341



In [35]:
pred4 = txt_cl_gb.predict(X_test)
print(classification_report(y_test, pred4))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85       267
           1       0.23      0.07      0.10        74

   micro avg       0.75      0.75      0.75       341
   macro avg       0.51      0.50      0.48       341
weighted avg       0.66      0.75      0.69       341

