#### Importación de librerias

In [51]:

import xml.etree.ElementTree as ET
import re, os, datasets, string, math
import pandas as pd
import numpy as np
import pickle
from gensim.parsing.porter import PorterStemmer 
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  confusion_matrix, classification_report
from sklearn import preprocessing as sk_preprocess


#### Importación del dataset

In [10]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\camilo\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
100%|██████████| 1/1 [00:00<00:00, 15.43it/s]


Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


#### funciones de procesamiento

In [11]:
def split_hashtag(token: str) -> str:
    """
    Split hashtags by uppercases
    """
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [12]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [13]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

#### Aplicar preprocesamiento

In [14]:
data = pd.DataFrame()
data['text'] = df.text
data['processed'] = df.text.apply(preprocessing)
data['hatespeech'] = df.hatespeech.apply(lambda x: math.ceil(x/2))
data.shape

(135388, 3)

In [17]:
data.sample(10)[['hatespeech','text','processed']]

Unnamed: 0,hatespeech,text,processed
103215,1,DO YOU SEE HOW QUICK THE PROSECUTED THAT BLACK...,do you see how quick the prosecuted that black...
25636,0,@H Pn I'm sick because I don't support pedophi...,user pn i am sick because i do not support ped...
134079,0,"I, a Catholic and a Jesuit, am grateful for th...",i a catholic and a jesuit am grateful for this...
37265,0,I would not expect a bunch of 4chin autists to...,i would not expect a bunch of chin autists to ...
59472,0,. POLL: Should Illegal Immigrants Be Allowed ...,poll should illegal immigrants be allowed to ...
17817,0,girls who went to all girls schools are the fu...,girls who went to all girls schools are the fu...
67393,0,"@realDonaldTrump As a commander in the #IDF, I...",user as a commander in the i df i have been fi...
133760,0,I love that you enjoy being trans.,i love that you enjoy being trans
82196,1,"Begone, thot! Shariah is fucked, if you agree ...",begone thot shariah is fucked if you agree the...
1248,0,Don't get you panties in bunch. You stated you...,do not get you panties in bunch you stated you...


In [18]:
from sklearn.model_selection import train_test_split

y_sample = data['hatespeech'].astype(float)
X_sample = data['processed'].astype(str)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 135388
Number of rows in the training set: 94771
Number of rows in the validation set: 13539
Number of rows in the test set: 27078


In [19]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [20]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [21]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [22]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Logistic Regresion

In [45]:
# Entrenamiento del algoritmo de logisrtic regression y cross validation para la
# representación vectoria de tf

clf_log = LogisticRegression(max_iter=300,random_state=0,multi_class='ovr').fit(training_data_tf_norm, y_train)
scores = cross_validate(clf_log, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores

{'fit_time': array([4.29750252, 5.10612488, 5.07441926, 4.95067692, 5.40265417,
        4.23666358, 4.70640659, 6.0848043 , 5.24549866, 5.93212509]),
 'score_time': array([0.03291035, 0.03291082, 0.02992058, 0.03390861, 0.02493119,
        0.02792406, 0.03390884, 0.03490829, 0.02293897, 0.02493358]),
 'test_accuracy': array([0.8004801 , 0.80832795, 0.80998984, 0.80094174, 0.80675838,
        0.80934355, 0.80805096, 0.79891053, 0.79992614, 0.8111901 ]),
 'test_precision': array([0.79865952, 0.8057611 , 0.80895601, 0.79581568, 0.80327436,
        0.81057975, 0.80903985, 0.79699248, 0.80228447, 0.81887408]),
 'test_recall': array([0.67874231, 0.69453303, 0.69544419, 0.68451025, 0.6929385 ,
        0.69111617, 0.68906606, 0.676082  , 0.67198178, 0.68587699]),
 'test_f1': array([0.73383422, 0.74602398, 0.74791769, 0.73597845, 0.74403816,
        0.74609615, 0.74424899, 0.73157506, 0.73137474, 0.74649808])}

In [25]:
# Busqueda de hiperaparametros variando la regularización y el penalty
grid = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
}
logreg=LogisticRegression(max_iter=200,random_state=0,multi_class='ovr')
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in

tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
accuracy : 0.8063859730991412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
# Evaluación del mejor modelo
logistic_regresion_best_1 = LogisticRegression(max_iter=300,random_state=0,multi_class='ovr',C=10,penalty='l2')
logistic_regresion_best_1.fit(training_data_tf_norm,y_train)
scores = cross_validate(logistic_regresion_best_1, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([ 9.61408854,  9.32219434,  9.51023459,  9.19328904,  9.56540322,
         9.29934382,  9.8725853 , 10.01120162,  9.25423574, 10.16298342]),
 'score_time': array([0.03690124, 0.02194047, 0.02991891, 0.02992296, 0.02892351,
        0.02892399, 0.02891803, 0.02792048, 0.02592945, 0.03191519]),
 'test_accuracy': array([0.80232665, 0.80805096, 0.81349829, 0.80241898, 0.80795864,
        0.81257502, 0.81211338, 0.8064814 , 0.80528114, 0.81054381]),
 'test_precision': array([0.78142213, 0.78523821, 0.79346211, 0.77915633, 0.78476331,
        0.79618474, 0.79269202, 0.7887714 , 0.78954049, 0.79897698]),
 'test_recall': array([0.71109592, 0.72460137, 0.72984055, 0.71526196, 0.72505695,
        0.72255125, 0.72642369, 0.71366743, 0.70842825, 0.71161731]),
 'test_f1': array([0.74460217, 0.75370217, 0.76032273, 0.74584323, 0.75372958,
        0.75758299, 0.75811245, 0.74934226, 0.74678833, 0.75277108])}

In [47]:
# Evaluación del modelo

predictions = logistic_regresion_best_1.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.7943348844080065
Precision score:  0.7554182509505704
Recall score:  0.7262176733985196
F1 score:  0.7405302147882402


In [30]:
pred = logistic_regresion_best_1.predict(testing_data_tf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.81      0.87      0.84     16135
         1.0       0.78      0.71      0.75     10943

    accuracy                           0.80     27078
   macro avg       0.80      0.79      0.79     27078
weighted avg       0.80      0.80      0.80     27078



(14008, 2127, 3181, 7762)

In [48]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

clf_log_2 = LogisticRegression(max_iter=300,random_state=0,multi_class='ovr') \
                    .fit(training_data_tfidf_norm, y_train)
scores = cross_validate(clf_log_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores


{'fit_time': array([4.93479395, 5.65486503, 5.44425583, 6.14242268, 5.05447388,
        4.19976044, 4.34636807, 4.9677062 , 4.48400092, 5.64389825]),
 'score_time': array([0.04188967, 0.02792549, 0.02991962, 0.02692771, 0.03091717,
        0.02094507, 0.02792716, 0.03291106, 0.02293777, 0.03091812]),
 'test_accuracy': array([0.8004801 , 0.80832795, 0.80998984, 0.80094174, 0.80675838,
        0.80934355, 0.80805096, 0.79891053, 0.79992614, 0.8111901 ]),
 'test_precision': array([0.79865952, 0.8057611 , 0.80895601, 0.79581568, 0.80327436,
        0.81057975, 0.80903985, 0.79699248, 0.80228447, 0.81887408]),
 'test_recall': array([0.67874231, 0.69453303, 0.69544419, 0.68451025, 0.6929385 ,
        0.69111617, 0.68906606, 0.676082  , 0.67198178, 0.68587699]),
 'test_f1': array([0.73383422, 0.74602398, 0.74791769, 0.73597845, 0.74403816,
        0.74609615, 0.74424899, 0.73157506, 0.73137474, 0.74649808])}

In [35]:
# Busqueda de hiperaparametros variando la regularización y el penalty

grid = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7)
}
logreg=LogisticRegression(max_iter=300,random_state=0,multi_class='ovr')
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tfidf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
accuracy : 0.8060061105058096


In [49]:
# Evaluación del mejor modelo
logistic_regresion_best_2 = LogisticRegression(max_iter=300,random_state=0,multi_class='ovr',C=1,penalty='l2')
logistic_regresion_best_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(logistic_regresion_best_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision','recall','f1'))
scores



{'fit_time': array([4.0910542 , 5.67197466, 5.3793478 , 5.61598229, 5.59502792,
        5.58717704, 4.77009892, 5.42888474, 5.10434031, 5.57429504]),
 'score_time': array([0.034904  , 0.0289228 , 0.03390956, 0.03190565, 0.02692914,
        0.03390813, 0.03091693, 0.0289228 , 0.02693081, 0.02992105]),
 'test_accuracy': array([0.8004801 , 0.80832795, 0.80998984, 0.80094174, 0.80675838,
        0.80934355, 0.80805096, 0.79891053, 0.79992614, 0.8111901 ]),
 'test_precision': array([0.79865952, 0.8057611 , 0.80895601, 0.79581568, 0.80327436,
        0.81057975, 0.80903985, 0.79699248, 0.80228447, 0.81887408]),
 'test_recall': array([0.67874231, 0.69453303, 0.69544419, 0.68451025, 0.6929385 ,
        0.69111617, 0.68906606, 0.676082  , 0.67198178, 0.68587699]),
 'test_f1': array([0.73383422, 0.74602398, 0.74791769, 0.73597845, 0.74403816,
        0.74609615, 0.74424899, 0.73157506, 0.73137474, 0.74649808])}

In [42]:
# Evaluación del modelo

predictions = logistic_regresion_best_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8041214269886993
Precision score:  0.800875040017074
Recall score:  0.6858265557890889
F1 score:  0.7388992812838436


In [41]:
pred = logistic_regresion_best_2.predict(testing_data_tfidf_norm)  
report = classification_report(y_test ,pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

              precision    recall  f1-score   support

         0.0       0.81      0.88      0.84     16135
         1.0       0.80      0.69      0.74     10943

    accuracy                           0.80     27078
   macro avg       0.80      0.79      0.79     27078
weighted avg       0.80      0.80      0.80     27078



(14269, 1866, 3438, 7505)

In [53]:
pickle.dump(logistic_regresion_best_1, open('salida/logistic_regression.model', 'wb'))
pickle.dump(count_vector, open('salida/logistic_regression.vector', 'wb'))