#### Importación de librerias

In [13]:
import re, datasets, math
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#### Importación del dataset

In [2]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\camilo\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
100%|██████████| 1/1 [00:00<00:00, 47.76it/s]


Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


#### funciones de procesamiento

In [3]:
def split_hashtag(token: str) -> str:
    """
    Split hashtags by uppercases
    """
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [4]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [5]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

#### Aplicar preprocesamiento

In [6]:
data = pd.DataFrame()
data['text'] = df.text
data['processed'] = df.text.apply(preprocessing)
data['hatespeech'] = df.hatespeech.apply(lambda x: math.ceil(x/2))
data.shape

(135388, 3)

In [7]:
data.sample(10)[['hatespeech','text','processed']]

Unnamed: 0,hatespeech,text,processed
70569,0,You're far more likely to be gunned down at Wa...,you are far more likely to be gunned down at w...
27166,0,@masango1 Thats the slave mentality school of ...,user thats the slave mentality school of thoug...
15286,1,lol thot acts all loving yet tries to go out w...,lol thot acts all loving yet tries to go out w...
36083,0,"How about our homeless veterans, they don't ge...",how about our homeless veterans they do not ge...
15132,0,Satan is gathering all of them together... May...,satan is gathering all of them together may al...
116701,0,"I'm sure there were several others, but Islam ...",i am sure there were several others but islam ...
127428,0,There is plenty of evidence that women and non...,there is plenty of evidence that women and non...
37427,0,for people talking about how awkward or uncomf...,for people talking about how awkward or uncomf...
62289,1,"While I will not really *defend* the church, I...",while i will not really defend the church i ca...
43136,1,actually a little more 547 but regardless that...,actually a little more but regardless thats a ...


In [8]:
y_sample = data['hatespeech'].astype(float)
X_sample = data['processed'].astype(str)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 135388
Number of rows in the training set: 94771
Number of rows in the validation set: 13539
Number of rows in the test set: 27078


In [9]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [10]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [11]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [12]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Random Forest

In [14]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tf
clf_rf = RandomForestClassifier(max_depth=10, random_state=0)
scores_rf = cross_validate(clf_rf, cross_validation_data_tf_norm, cross_validation_y_tf, cv=5, scoring=('f1_macro','accuracy','precision_macro','recall_macro'))
scores_rf

{'fit_time': array([34.16673875, 34.42043376, 37.03759408, 37.1931653 , 39.48011065]),
 'score_time': array([0.49567389, 0.50863814, 0.53556895, 0.59241486, 0.54354477]),
 'test_f1_macro': array([0.59675233, 0.5922228 , 0.57995417, 0.594867  , 0.60029776]),
 'test_accuracy': array([0.69111809, 0.68779429, 0.68359339, 0.69047179, 0.69287231]),
 'test_precision_macro': array([0.79752028, 0.78994021, 0.80403243, 0.79990302, 0.79750812]),
 'test_recall_macro': array([0.62178679, 0.61829061, 0.6114582 , 0.62074128, 0.62411963])}

In [16]:
# Busqueda de hiperaparametros variando el alpha

params = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [6,8,10,12],
}

random_forest_grid = GridSearchCV(RandomForestClassifier(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
random_forest_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {random_forest_grid.best_score_}')
print('best parameter : ', random_forest_grid.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Accuracy Through Grid Search : 0.7012061427259215
best parameter :  {'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 500}


In [34]:
# Evaluación del mejor modelo
random_forest_best_1 = RandomForestClassifier(max_depth=12,max_features='sqrt', n_estimators=500, random_state=0)
scores = cross_validate(random_forest_best_1, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores


{'fit_time': array([231.67593455, 170.14602423, 155.98456144, 158.26661444,
        173.52192259, 161.16397071, 154.10320163, 151.75842309,
        166.09123468, 100.31335902]),
 'score_time': array([1.09706497, 0.83080888, 0.8526876 , 0.82579017, 0.82781768,
        0.82881761, 0.82334733, 0.87165666, 0.91557622, 0.94450545]),
 'test_accuracy': array([0.69891977, 0.70187425, 0.70492106, 0.69153356, 0.70242822,
        0.70409011, 0.69688856, 0.70353615, 0.69891977, 0.70058166]),
 'test_precision': array([0.92792109, 0.91612903, 0.92280453, 0.910086  , 0.92560175,
        0.9424944 , 0.92155369, 0.92748368, 0.92158327, 0.92324723]),
 'test_recall': array([0.27865117, 0.29111617, 0.29681093, 0.26514806, 0.28906606,
        0.28747153, 0.27562642, 0.29134396, 0.28109339, 0.28496583]),
 'test_f1': array([0.42859646, 0.44183232, 0.44915546, 0.41065444, 0.44054852,
        0.44056554, 0.42433807, 0.4434044 , 0.43079071, 0.43550914])}

In [32]:
# Evaluación del modelo

predictions = random_forest_best_1.predict(testing_data_tf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.6960262944087451
Precision score:  0.922429906542056
Recall score:  0.27058393493557525
F1 score:  0.4184271885819261


In [18]:
pred = random_forest_best_1.predict(testing_data_tf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.67      0.98      0.79     16135
         1.0       0.92      0.27      0.42     10943

    accuracy                           0.70     27078
   macro avg       0.79      0.63      0.61     27078
weighted avg       0.77      0.70      0.64     27078



(15886, 249, 7982, 2961)

In [35]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

clf_rf_2 = RandomForestClassifier(max_depth=10, random_state=0)
scores = cross_validate(clf_rf_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([28.76288509, 24.29397607, 25.30530691, 46.4389472 , 37.47428966,
        23.02441645, 23.10136962, 23.41798401, 25.96352077, 14.02371335]),
 'score_time': array([0.16356373, 0.16356397, 0.17951298, 0.42502356, 0.1695478 ,
        0.15857744, 0.16160011, 0.1605978 , 0.16558886, 0.24135041]),
 'test_accuracy': array([0.68322408, 0.68451667, 0.69328779, 0.68534761, 0.68774813,
        0.68257779, 0.69024097, 0.70048934, 0.68996399, 0.68470132]),
 'test_precision': array([0.94434137, 0.93632287, 0.93064516, 0.91894198, 0.92567568,
        0.95333333, 0.93160967, 0.93343419, 0.92225859, 0.92800702]),
 'test_recall': array([0.2319435 , 0.23781321, 0.26287016, 0.2453303 , 0.24965831,
        0.22801822, 0.25444191, 0.28109339, 0.25671982, 0.24077449]),
 'test_f1': array([0.37241632, 0.37929155, 0.40994671, 0.38727077, 0.3932544 ,
        0.36801471, 0.39971372, 0.43207283, 0.40163934, 0.38234762])}

In [21]:
# Busqueda de hiperaparametros variando el alpha
params = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [6,8,10,12],
}
random_forest_grid = GridSearchCV(RandomForestClassifier(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
random_forest_grid.fit(training_data_tfidf_norm,y_train)



Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Accuracy Through Grid Search : 0.6997182789124718
best parameter :  {'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 200}


In [27]:
print(f'Best Accuracy Through Grid Search : {random_forest_grid.best_score_}')
print('best parameter : ', random_forest_grid.best_params_)

Best Accuracy Through Grid Search : 0.6997182789124718
best parameter :  {'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 200}


In [36]:
# Evaluación del mejor modelo
random_forest_best_2 = RandomForestClassifier(max_depth=12,max_features='sqrt', n_estimators=200, random_state=0)
random_forest_best_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(random_forest_best_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([103.18501425,  74.94973254,  68.93974233,  67.24169517,
         62.57850933,  62.71221471,  63.77370358,  62.23944807,
         53.56067204,  27.97713399]),
 'score_time': array([0.40990376, 0.37200284, 0.35899973, 0.35803986, 0.39494205,
        0.36598277, 0.35009503, 0.33909249, 0.32217002, 0.30720592]),
 'test_accuracy': array([0.69061029, 0.70058166, 0.70178192, 0.6936571 , 0.69624227,
        0.69864278, 0.69688856, 0.70048934, 0.6936571 , 0.69762718]),
 'test_precision': array([0.92892562, 0.91229331, 0.92647059, 0.91104294, 0.92177914,
        0.94400631, 0.9221968 , 0.93343419, 0.92072214, 0.92395437]),
 'test_recall': array([0.25609478, 0.28906606, 0.28701595, 0.27061503, 0.2738041 ,
        0.27266515, 0.27539863, 0.28109339, 0.26719818, 0.27676538]),
 'test_f1': array([0.40150027, 0.43902439, 0.43826087, 0.41728135, 0.42219881,
        0.42311771, 0.42413612, 0.43207283, 0.41419492, 0.42594216])}

In [31]:
# Evaluación del modelo

predictions = random_forest_best_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.7010488219218554
Precision score:  0.9225519287833828
Recall score:  0.2841085625514027
F1 score:  0.4344302382449521


In [30]:
pred = random_forest_best_2.predict(testing_data_tfidf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.67      0.98      0.80     16135
         1.0       0.92      0.28      0.43     10943

    accuracy                           0.70     27078
   macro avg       0.80      0.63      0.62     27078
weighted avg       0.77      0.70      0.65     27078



(15874, 261, 7834, 3109)

In [37]:
pickle.dump(random_forest_best_2, open('salida/random_forest.model', 'wb'))
pickle.dump(count_vector_tfidf, open('salida/random_forest.vector', 'wb'))