#### Importación de librerias

In [47]:
import re, datasets, math
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#### Importación del dataset

In [2]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\Usuario\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


#### funciones de procesamiento

In [3]:
def split_hashtag(token: str) -> str:
    """
    Split hashtags by uppercases
    """
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [4]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [5]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

#### Aplicar preprocesamiento

In [6]:
data = pd.DataFrame()
data['text'] = df.text
data['processed'] = df.text.apply(preprocessing)
data['hatespeech'] = df.hatespeech.apply(lambda x: math.ceil(x/2))
data.shape

(135388, 3)

In [7]:
data.sample(10)[['hatespeech','text','processed']]

Unnamed: 0,hatespeech,text,processed
38636,0,RT rdunbar83: RT yorkshiremesmac: Our comms co...,rt rdunbar rt yorkshiremesmac our comms coordi...
129290,0,I've always been proud of the Igbo tribe and o...,i have always been proud of the igbo tribe and...
35781,0,Shut the f*%k up with the religious crap. Naus...,shut the f k up with the religious crap nausea...
94019,1,i hate jews so much i wish the holocaust actua...,i hate jews so much i wish the holocaust actua...
92571,1,i hate jews so much i wish the holocaust actua...,i hate jews so much i wish the holocaust actua...
13758,0,i thought that lady had a very hairy chest bec...,i thought that lady had a very hairy chest bec...
29532,0,@4MeSheWILL You can tell be the way she used E...,user you can tell be the way she used english ...
125252,0,There is plenty of evidence that women and non...,there is plenty of evidence that women and non...
101861,1,Every slant in #LosAngeles should be deported....,every slant in los angeles should be deported ...
54218,0,Little boys smh URL,little boys smh url


In [8]:
y_sample = data['hatespeech'].astype(float)
X_sample = data['processed'].astype(str)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 135388
Number of rows in the training set: 94771
Number of rows in the validation set: 13539
Number of rows in the test set: 27078


In [9]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [10]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [11]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [12]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Naive Bayes

In [13]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tf
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.05865812, 0.06015301, 0.05986738, 0.05987811, 0.06423068,
        0.06049871, 0.05900192, 0.06279445, 0.05665112, 0.0573132 ]),
 'score_time': array([0.02625418, 0.03969049, 0.0245204 , 0.04143858, 0.02700233,
        0.04408097, 0.03174829, 0.02500081, 0.02318835, 0.0410881 ]),
 'test_accuracy': array([0.7850614 , 0.78810821, 0.78903148, 0.77979873, 0.78746191,
        0.79300157, 0.79106269, 0.78561536, 0.780722  , 0.79614071]),
 'test_precision_macro': array([0.78570529, 0.78930827, 0.78842753, 0.77921725, 0.78722441,
        0.79459859, 0.79143619, 0.78565969, 0.78191154, 0.79886662]),
 'test_recall_macro': array([0.76242002, 0.76548471, 0.76818317, 0.75751899, 0.76602934,
        0.77075954, 0.76960083, 0.76364263, 0.75698964, 0.77343515]),
 'test_f1_macro': array([0.7689634 , 0.77217867, 0.77428983, 0.7636883 , 0.77227541,
        0.7775681 , 0.77603032, 0.76998995, 0.76366186, 0.78057223])}

In [14]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.7898407626266004
best parameter :  {'alpha': 0.1}


In [15]:
# Evaluación del mejor modelo
naive_bayes_best_1 = MultinomialNB(alpha=0.01)
naive_bayes_best_1.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes_best_1, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.05198264, 0.05609059, 0.05823493, 0.04116297, 0.06541467,
        0.07554078, 0.05039787, 0.05966139, 0.05334234, 0.05624938]),
 'score_time': array([0.02574801, 0.02860379, 0.03216743, 0.02490067, 0.04143429,
        0.0450387 , 0.03273392, 0.02899003, 0.02650571, 0.03732538]),
 'test_accuracy': array([0.78884683, 0.79604838, 0.79161666, 0.78561536, 0.78903148,
        0.79798726, 0.79447881, 0.79115502, 0.78413812, 0.79484812]),
 'test_precision_macro': array([0.78395036, 0.79163292, 0.78644888, 0.78031415, 0.78350643,
        0.79370959, 0.78946526, 0.78603009, 0.77983677, 0.79115298]),
 'test_recall_macro': array([0.7730812 , 0.78071983, 0.77677609, 0.77002572, 0.77438491,
        0.78274895, 0.77983537, 0.77617035, 0.76678896, 0.7783325 ]),
 'test_f1_macro': array([0.7770842 , 0.7847831 , 0.78045638, 0.77384429, 0.77788812,
        0.78683858, 0.78351925, 0.77990234, 0.77131245, 0.78289906])}

In [16]:
pred = naive_bayes_best_1.predict(testing_data_tf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.80      0.86      0.83     16135
         1.0       0.77      0.68      0.72     10943

    accuracy                           0.79     27078
   macro avg       0.78      0.77      0.78     27078
weighted avg       0.79      0.79      0.79     27078



(13879, 2256, 3493, 7450)

In [17]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.09742689, 0.06851053, 0.06752038, 0.05949926, 0.04767346,
        0.0578413 , 0.05810523, 0.05245495, 0.06115651, 0.05664873]),
 'score_time': array([0.04839563, 0.04326344, 0.03900027, 0.03308201, 0.02533412,
        0.02209115, 0.02501178, 0.02813196, 0.02402949, 0.0299685 ]),
 'test_accuracy': array([0.7850614 , 0.78810821, 0.78903148, 0.77979873, 0.78746191,
        0.79300157, 0.79106269, 0.78561536, 0.780722  , 0.79614071]),
 'test_precision_macro': array([0.78570529, 0.78930827, 0.78842753, 0.77921725, 0.78722441,
        0.79459859, 0.79143619, 0.78565969, 0.78191154, 0.79886662]),
 'test_recall_macro': array([0.76242002, 0.76548471, 0.76818317, 0.75751899, 0.76602934,
        0.77075954, 0.76960083, 0.76364263, 0.75698964, 0.77343515]),
 'test_f1_macro': array([0.7689634 , 0.77217867, 0.77428983, 0.7636883 , 0.77227541,
        0.7775681 , 0.77603032, 0.76998995, 0.76366186, 0.78057223])}

In [18]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tfidf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.7903894500080397
best parameter :  {'alpha': 0.5}


In [19]:
# Evaluación del mejor modelo
naive_bayes_best_2 = MultinomialNB(alpha=0.01)
naive_bayes_best_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes_best_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.04700851, 0.04215217, 0.04918575, 0.04835224, 0.05004168,
        0.05710936, 0.06244302, 0.07300353, 0.08160329, 0.04799962]),
 'score_time': array([0.02149653, 0.02297139, 0.01748347, 0.02461386, 0.02482271,
        0.0239594 , 0.02918005, 0.0509398 , 0.02957106, 0.02845502]),
 'test_accuracy': array([0.78884683, 0.79604838, 0.79161666, 0.78561536, 0.78903148,
        0.79798726, 0.79447881, 0.79115502, 0.78413812, 0.79484812]),
 'test_precision_macro': array([0.78395036, 0.79163292, 0.78644888, 0.78031415, 0.78350643,
        0.79370959, 0.78946526, 0.78603009, 0.77983677, 0.79115298]),
 'test_recall_macro': array([0.7730812 , 0.78071983, 0.77677609, 0.77002572, 0.77438491,
        0.78274895, 0.77983537, 0.77617035, 0.76678896, 0.7783325 ]),
 'test_f1_macro': array([0.7770842 , 0.7847831 , 0.78045638, 0.77384429, 0.77788812,
        0.78683858, 0.78351925, 0.77990234, 0.77131245, 0.78289906])}

In [20]:
# Evaluación del modelo

predictions = naive_bayes_best_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.7855454612600635
Macro-Precision score:  0.779093541086759
Macro-Recall score:  0.7713980982843713
Macro-F1 score:  0.774460716018839
Micro-Precision score:  0.7855454612600635
Micro-Recall score:  0.7855454612600635
Micro-F1 score:  0.7855454612600634


In [21]:
pred = naive_bayes_best_2.predict(testing_data_tfidf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.80      0.85      0.82     16135
         1.0       0.75      0.70      0.72     10943

    accuracy                           0.79     27078
   macro avg       0.78      0.77      0.77     27078
weighted avg       0.78      0.79      0.78     27078



(13637, 2498, 3309, 7634)

In [52]:
pickle.dump(naive_bayes_best_2, open('salida/naive_bayes.model', 'wb'))
pickle.dump(count_vector_tfidf, open('salida/naive_bayes.vector', 'wb'))

In [53]:
def predictor(text):
    count_vector = pickle.load(open('salida/naive_bayes.vector', 'rb'))
    naive_bayes = pickle.load(open('salida/naive_bayes.model', 'rb'))
    data = count_vector.transform([text])
    norm = Normalizer().fit_transform(data)
    return 'Hate Speech' if naive_bayes.predict(norm)[0] == 1 else 'Not Hate Speech'

In [54]:
predictor('everybody will die sooner or later')

'Hate Speech'