#### Importación de librerias

In [1]:
import re, datasets, math
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#### Importación del dataset

In [2]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\camilo\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
100%|██████████| 1/1 [00:00<00:00, 52.78it/s]


Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


#### funciones de procesamiento

In [3]:
def split_hashtag(token: str) -> str:
    """
    Split hashtags by uppercases
    """
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [4]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [5]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

#### Aplicar preprocesamiento

In [6]:
data = pd.DataFrame()
data['text'] = df.text
data['processed'] = df.text.apply(preprocessing)
data['hatespeech'] = df.hatespeech.apply(lambda x: math.ceil(x/2))
data.shape

(135388, 3)

In [7]:
data.sample(10)[['hatespeech','text','processed']]

Unnamed: 0,hatespeech,text,processed
24553,0,@TabindaSulahri1 I'm very sorry we are ashamed...,user i am very sorry we are ashamed muslim we ...
111766,1,There are not any jew signatures on our Declar...,there are not any jew signatures on our declar...
54786,0,swedish cock is my favourite kind of cock ! hehe,swedish cock is my favourite kind of cock hehe
76341,0,David Rook what about all those visa dodgers? ...,david rook what about all those visa dodgers w...
24671,0,"If white genocide means this guy won't exist, ...",if white genocide means this guy will not exis...
89623,0,Every one worships their god on their festival...,every one worships their god on their festival...
104981,1,Fuck you fuck you you stupid cheating bitch go...,fuck you fuck you you stupid cheating bitch go...
55823,0,Good! Deport all of them.,good deport all of them
120442,0,illegal immigrants =/= legal immigrants. in ev...,illegal immigrants legal immigrants in every o...
90141,0,"America is the trashiest country in the world,...",america is the trashiest country in the world ...


In [8]:
y_sample = data['hatespeech'].astype(float)
X_sample = data['processed'].astype(str)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 135388
Number of rows in the training set: 94771
Number of rows in the validation set: 13539
Number of rows in the test set: 27078


In [9]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [10]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [11]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [12]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Naive Bayes

In [35]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tf
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([0.13962603, 0.12067723, 0.17154098, 0.18650246, 0.17054296,
        0.19148898, 0.11468911, 0.09873581, 0.13164902, 0.10870838]),
 'score_time': array([0.09873724, 0.05984712, 0.09674096, 0.03689909, 0.0678196 ,
        0.05385423, 0.04189706, 0.04587746, 0.05485439, 0.04188633]),
 'test_accuracy': array([0.7850614 , 0.78810821, 0.78903148, 0.77979873, 0.78746191,
        0.79300157, 0.79106269, 0.78561536, 0.780722  , 0.79614071]),
 'test_precision': array([0.78760815, 0.79284316, 0.7865505 , 0.77747025, 0.78649835,
        0.7993311 , 0.79257221, 0.78579326, 0.78532993, 0.8068054 ]),
 'test_recall': array([0.64297106, 0.64601367, 0.65808656, 0.63986333, 0.65284738,
        0.65330296, 0.65626424, 0.6476082 , 0.63166287, 0.65353075]),
 'test_f1': array([0.70797792, 0.71193674, 0.71660672, 0.70198675, 0.71346776,
        0.71897719, 0.71800623, 0.71003996, 0.70016412, 0.72212434])}

In [14]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.7898407626266004
best parameter :  {'alpha': 0.1}


In [36]:
# Evaluación del mejor modelo
naive_bayes_best_1 = MultinomialNB(alpha=0.1)
naive_bayes_best_1.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes_best_1, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([0.09374809, 0.07380414, 0.08377647, 0.2014606 , 0.13364649,
        0.10571742, 0.08776617, 0.07879496, 0.09175491, 0.07579541]),
 'score_time': array([0.03391027, 0.02193999, 0.06383061, 0.07879257, 0.04088688,
        0.04388213, 0.04089832, 0.02592492, 0.04188848, 0.03889656]),
 'test_accuracy': array([0.78949312, 0.79503278, 0.7924476 , 0.78570769, 0.78838519,
        0.79549441, 0.79494045, 0.79004709, 0.78487674, 0.79549441]),
 'test_precision': array([0.77059276, 0.77749361, 0.77224199, 0.76478116, 0.76570415,
        0.78210117, 0.77574371, 0.76934827, 0.76963351, 0.78283485]),
 'test_recall': array([0.68421053, 0.69248292, 0.69202733, 0.68063781, 0.68861048,
        0.68678815, 0.69498861, 0.68838269, 0.66970387, 0.6856492 ]),
 'test_f1': array([0.72483707, 0.73253012, 0.72993753, 0.72026034, 0.72511394,
        0.73135233, 0.7331491 , 0.72661698, 0.71619976, 0.73102611])}

In [27]:

predictions = naive_bayes_best_1.predict(testing_data_tf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.7887214713051185
Precision score:  0.7732886749005652
Recall score:  0.6751347893630631
F1 score:  0.7208859833146314


In [16]:
pred = naive_bayes_best_1.predict(testing_data_tf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83     16135
         1.0       0.77      0.68      0.72     10943

    accuracy                           0.79     27078
   macro avg       0.79      0.77      0.78     27078
weighted avg       0.79      0.79      0.79     27078



(13969, 2166, 3555, 7388)

In [37]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([0.0817678 , 0.06582189, 0.08477306, 0.12765622, 0.10172582,
        0.08676553, 0.08676696, 0.08377504, 0.08777213, 0.1057179 ]),
 'score_time': array([0.03191566, 0.03390908, 0.03191519, 0.04886889, 0.03490543,
        0.02891922, 0.04488683, 0.03391671, 0.04287767, 0.03490782]),
 'test_accuracy': array([0.7850614 , 0.78810821, 0.78903148, 0.77979873, 0.78746191,
        0.79300157, 0.79106269, 0.78561536, 0.780722  , 0.79614071]),
 'test_precision': array([0.78760815, 0.79284316, 0.7865505 , 0.77747025, 0.78649835,
        0.7993311 , 0.79257221, 0.78579326, 0.78532993, 0.8068054 ]),
 'test_recall': array([0.64297106, 0.64601367, 0.65808656, 0.63986333, 0.65284738,
        0.65330296, 0.65626424, 0.6476082 , 0.63166287, 0.65353075]),
 'test_f1': array([0.70797792, 0.71193674, 0.71660672, 0.70198675, 0.71346776,
        0.71897719, 0.71800623, 0.71003996, 0.70016412, 0.72212434])}

In [18]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tfidf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.7903894500080397
best parameter :  {'alpha': 0.5}


In [38]:
# Evaluación del mejor modelo
naive_bayes_best_2 = MultinomialNB(alpha=0.5)
naive_bayes_best_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes_best_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([0.10172892, 0.08975339, 0.08776498, 0.11170077, 0.10072923,
        0.17256117, 0.18251252, 0.09674168, 0.16954732, 0.08876181]),
 'score_time': array([0.0369029 , 0.04587817, 0.03590345, 0.03789806, 0.03989291,
        0.09472346, 0.05285811, 0.03789902, 0.05485177, 0.03789806]),
 'test_accuracy': array([0.7877389 , 0.79180131, 0.79152433, 0.78303019, 0.78635398,
        0.79586372, 0.79383252, 0.78727726, 0.78210692, 0.79567907]),
 'test_precision': array([0.78046162, 0.78626978, 0.78096995, 0.77142097, 0.77258403,
        0.79311273, 0.78344284, 0.77519789, 0.77701965, 0.79506641]),
 'test_recall': array([0.6625655 , 0.66788155, 0.67494305, 0.66036446, 0.67015945,
        0.6715262 , 0.67904328, 0.66924829, 0.64851936, 0.66810934]),
 'test_f1': array([0.71669747, 0.72225644, 0.7240958 , 0.71158567, 0.71773603,
        0.72727273, 0.72751678, 0.71833741, 0.7069779 , 0.72607996])}

In [26]:
# Evaluación del modelo

predictions = naive_bayes_best_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.7881305857153409
Precision score:  0.7732521520050388
Recall score:  0.6731243717444942
F1 score:  0.719722507206019


In [21]:
pred = naive_bayes_best_2.predict(testing_data_tfidf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83     16135
         1.0       0.77      0.67      0.72     10943

    accuracy                           0.79     27078
   macro avg       0.78      0.77      0.77     27078
weighted avg       0.79      0.79      0.79     27078



(13975, 2160, 3577, 7366)

In [29]:
pickle.dump(naive_bayes_best_1, open('salida/naive_bayes.model', 'wb'))
pickle.dump(count_vector, open('salida/naive_bayes.vector', 'wb'))

In [30]:
def predictor(text):
    count_vector = pickle.load(open('salida/naive_bayes.vector', 'rb'))
    naive_bayes = pickle.load(open('salida/naive_bayes.model', 'rb'))
    data = count_vector.transform([text])
    norm = Normalizer().fit_transform(data)
    return 'Hate Speech' if naive_bayes.predict(norm)[0] == 1 else 'Not Hate Speech'

In [24]:
predictor('everybody will die sooner or later')

'Hate Speech'