#### Importación de librerias

In [3]:

import xml.etree.ElementTree as ET
import re, os, datasets, string
import pandas as pd
import numpy as np
from gensim.parsing.porter import PorterStemmer 
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  confusion_matrix, classification_report
from sklearn import preprocessing as sk_preprocess


In [4]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\camilo\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
100%|██████████| 1/1 [00:00<00:00, 50.12it/s]


Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


In [5]:
def split_hashtag(token: str) -> str:
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [6]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [7]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

In [8]:
columns = ['sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score']
data = df[columns]
min_max_scaler = sk_preprocess.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(data.values)
dataset = pd.DataFrame(data_scaled, columns=columns)
dataset['text'] = df.text
dataset['processed'] = df.text.apply(preprocessing)

In [9]:
dataset.sample(10)[['hatespeech','hate_speech_score','text','processed']]

Unnamed: 0,hatespeech,hate_speech_score,text,processed
52366,0.0,0.278005,I love lesbian weddings.,i love lesbian weddings
67560,0.0,0.580601,Bernie is going to give every American free n-...,bernie is going to give every american free n ...
87262,0.5,0.604508,@benshapiro go rape yourself you cause a lot o...,user go rape yourself you cause a lot of violence
85895,1.0,0.685792,Cough up your cash and beg. On your knees. I ...,cough up your cash and beg on your knees i wan...
124972,0.0,0.409153,oh that's cute. you didn't even read the thing...,oh that is cute you did not even read the thin...
85638,0.0,0.422131,@DrStevePerry Laws changing will do very littl...,user laws changing will do very little getting...
97516,1.0,0.836749,"yeah, heres a clue for the nigger author of th...",yeah heres a clue for the nigger author of thi...
80956,0.0,0.55123,@Mitumba10 He needs to be in a mental institut...,user he needs to be in a mental institution an...
111278,1.0,0.670082,My God look at these BARBARIC PPL. And these p...,my god look at these barbaric ppl and these pi...
78002,0.0,0.202186,In the Apostolic Churches there's monasteries ...,in the apostolic churches there is monasteries...


In [10]:
#dataset[dataset['hatespeech']>0] = 1
index_condition = dataset[ (dataset['hatespeech'] != 0) & (dataset['hatespeech']!= 1)].index
dataset.drop(index_condition,inplace=True)
dataset.shape

(126504, 13)

In [11]:
from sklearn.model_selection import train_test_split

y_sample = dataset['hatespeech'].astype(float)
X_sample = dataset['processed'].astype(str)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 126504
Number of rows in the training set: 88552
Number of rows in the validation set: 12651
Number of rows in the test set: 25301


In [12]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [13]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [14]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [15]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Naive Bayes

In [16]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tf
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.06083679, 0.05485702, 0.04886174, 0.06283188, 0.04488111,
        0.04986572, 0.0458703 , 0.04787278, 0.04488063, 0.04787111]),
 'score_time': array([0.02692819, 0.02193975, 0.01894903, 0.01894808, 0.01894927,
        0.01994586, 0.01995325, 0.02097607, 0.01795125, 0.01795292]),
 'test_accuracy': array([0.81918783, 0.8164213 , 0.8118763 , 0.81897233, 0.80988142,
        0.81363636, 0.8201581 , 0.81462451, 0.81778656, 0.81630435]),
 'test_precision_macro': array([0.81851758, 0.81745259, 0.80966809, 0.81695564, 0.80885672,
        0.8133521 , 0.82054684, 0.81099185, 0.81635947, 0.81598778]),
 'test_recall_macro': array([0.78241843, 0.77738894, 0.77445981, 0.78342231, 0.77080236,
        0.77497606, 0.78268007, 0.77938349, 0.78128392, 0.77836985]),
 'test_f1_macro': array([0.79422695, 0.78991522, 0.78587252, 0.79466359, 0.78272094,
        0.78708028, 0.79489006, 0.79008699, 0.79283718, 0.79041189])}

In [17]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8200266701260188
best parameter :  {'alpha': 0.01}


In [24]:
# Evaluación del mejor modelo
naive_bayes_best_1 = MultinomialNB(alpha=0.01)
naive_bayes_best_1.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes_best_1, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores



{'fit_time': array([0.08377743, 0.125664  , 0.07878304, 0.08278489, 0.1007297 ,
        0.09973335, 0.09574342, 0.07081056, 0.06383252, 0.06582594]),
 'score_time': array([0.03889608, 0.03590512, 0.03291106, 0.0428791 , 0.04986668,
        0.05884123, 0.02892256, 0.03590727, 0.03191113, 0.02892303]),
 'test_accuracy': array([0.8208675 , 0.82373283, 0.81839739, 0.81798419, 0.81551383,
        0.82262846, 0.8284585 , 0.81996047, 0.8208498 , 0.82282609]),
 'test_precision_macro': array([0.81142681, 0.81471724, 0.80858858, 0.80693554, 0.80550247,
        0.8131825 , 0.81928692, 0.80892164, 0.81070735, 0.81349899]),
 'test_recall_macro': array([0.79475486, 0.79782179, 0.79211498, 0.79371901, 0.78868954,
        0.79707611, 0.8041317 , 0.79623187, 0.79582218, 0.79708286]),
 'test_f1_macro': array([0.80146473, 0.80462704, 0.79874085, 0.79924341, 0.79540052,
        0.80361347, 0.81038999, 0.80158144, 0.80193978, 0.80372524])}

In [19]:
pred = naive_bayes_best_1.predict(testing_data_tf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.82      0.92      0.87     16129
         1.0       0.82      0.64      0.72      9172

    accuracy                           0.82     25301
   macro avg       0.82      0.78      0.79     25301
weighted avg       0.82      0.82      0.81     25301



(14824, 1305, 3321, 5851)

In [20]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.05086827, 0.08876204, 0.10372305, 0.08477235, 0.05086303,
        0.04288507, 0.0478723 , 0.04886937, 0.04488015, 0.04388213]),
 'score_time': array([0.02692199, 0.03789854, 0.03690124, 0.03490925, 0.01795149,
        0.01795173, 0.03091788, 0.01894879, 0.01994658, 0.0189476 ]),
 'test_accuracy': array([0.81918783, 0.8164213 , 0.8118763 , 0.81897233, 0.80988142,
        0.81363636, 0.8201581 , 0.81462451, 0.81778656, 0.81630435]),
 'test_precision_macro': array([0.81851758, 0.81745259, 0.80966809, 0.81695564, 0.80885672,
        0.8133521 , 0.82054684, 0.81099185, 0.81635947, 0.81598778]),
 'test_recall_macro': array([0.78241843, 0.77738894, 0.77445981, 0.78342231, 0.77080236,
        0.77497606, 0.78268007, 0.77938349, 0.78128392, 0.77836985]),
 'test_f1_macro': array([0.79422695, 0.78991522, 0.78587252, 0.79466359, 0.78272094,
        0.78708028, 0.79489006, 0.79008699, 0.79283718, 0.79041189])}

In [21]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tfidf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8194168553704386
best parameter :  {'alpha': 0.1}


In [25]:
# Evaluación del mejor modelo
naive_bayes_best_2 = MultinomialNB(alpha=0.01)
naive_bayes_best_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes_best_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores



{'fit_time': array([0.07081127, 0.05984163, 0.08178067, 0.09674406, 0.07679486,
        0.04587746, 0.0459094 , 0.04291797, 0.04188895, 0.04192305]),
 'score_time': array([0.02692842, 0.01894832, 0.04188824, 0.02991867, 0.02393556,
        0.0209434 , 0.01795149, 0.01795244, 0.02293992, 0.01794887]),
 'test_accuracy': array([0.8208675 , 0.82373283, 0.81839739, 0.81798419, 0.81551383,
        0.82262846, 0.8284585 , 0.81996047, 0.8208498 , 0.82282609]),
 'test_precision_macro': array([0.81142681, 0.81471724, 0.80858858, 0.80693554, 0.80550247,
        0.8131825 , 0.81928692, 0.80892164, 0.81070735, 0.81349899]),
 'test_recall_macro': array([0.79475486, 0.79782179, 0.79211498, 0.79371901, 0.78868954,
        0.79707611, 0.8041317 , 0.79623187, 0.79582218, 0.79708286]),
 'test_f1_macro': array([0.80146473, 0.80462704, 0.79874085, 0.79924341, 0.79540052,
        0.80361347, 0.81038999, 0.80158144, 0.80193978, 0.80372524])}

In [26]:
# Evaluación del modelo

predictions = naive_bayes_best_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.8220228449468401
Macro-Precision score:  0.810810352340644
Macro-Recall score:  0.798612781700051
Macro-F1 score:  0.8038038755119001
Micro-Precision score:  0.8220228449468401
Micro-Recall score:  0.8220228449468401
Micro-F1 score:  0.8220228449468401


In [27]:
pred = naive_bayes_best_2.predict(testing_data_tfidf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.84      0.88      0.86     16129
         1.0       0.78      0.71      0.74      9172

    accuracy                           0.82     25301
   macro avg       0.81      0.80      0.80     25301
weighted avg       0.82      0.82      0.82     25301



(14254, 1875, 2628, 6544)