# Lexicones

Se realizaran entrenamiento agregando caracteristicas extraidas de diferentes lexicones disponibles en la web junto con un lexicon creado a partir del set de entrenamiento

## SentiWordNet

In [1]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag, download

In [2]:
download('wordnet')
download('sentiwordnet')
download('omw-1.4')
download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to Wordnet tags
    """
    first_letter = tag[0]
    answers = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
    return answers.get(first_letter)

In [4]:
lemmatizer = WordNetLemmatizer()

def get_sentiment(word, tag):
    """ 
    returns list of pos neg and objective score. But returns empty list if not present in senti wordnet. 
    """
    wn_tag = penn_to_wn(tag)
    valid_wn_tags = (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB)
    if wn_tag not in valid_wn_tags: 
        return (0.0, 0.0, 1.0)

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma: 
        return (0.0, 0.0, 1.0)

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets: 
        return (0.0, 0.0, 1.0)

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return (swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score())

In [5]:
def lexicon_sentiwordnet(document: str) -> list:
    """
    return the lexicon features between others:
    - sum of positive, negative and objetive scores
    - mean of the sum of positive, negative and objetive scores
    - cant of words with positive, negative and objetive scores over 0.5
    """
    neg_scores, pos_scores, obj_scores= [], [], []
    words = word_tokenize(document)
    cant_words = len(words)
    pos_words = pos_tag(words)
    for word, tag in pos_words:
        scores = get_sentiment(word, tag)
        pos_scores.append(scores[0])
        neg_scores.append(scores[1])
        obj_scores.append(scores[2])
    pos_score, neg_score, obj_score = sum(pos_scores), sum(neg_scores), sum(obj_scores)
    pond_pos, pond_neg, pond_obj = pos_score/cant_words, neg_score/cant_words, obj_score/cant_words
    cant_pos, cant_neg, cant_obj = len([item for item in pos_scores if item >= 0.5]), len([item for item in neg_scores if item >= 0.5]), len([item for item in obj_scores if item >= 0.5])
    most_important = 1 if neg_score > pos_score else 0
    return [
        pos_score, neg_score, obj_score, 
        pond_pos, pond_neg, pond_obj, 
        cant_pos/cant_words, cant_neg/cant_words, cant_obj/cant_words, 
        cant_pos, cant_neg, cant_obj,
        most_important,
    ]

In [6]:
print(lexicon_sentiwordnet('this is good'))
print(lexicon_sentiwordnet('this is awful'))
print(lexicon_sentiwordnet('i love you so much'))
print(lexicon_sentiwordnet('i hate you so much'))
print(lexicon_sentiwordnet('Learn the fucking language you fucking useless immigrant.'))

[1.0, 0.125, 1.875, 0.3333333333333333, 0.041666666666666664, 0.625, 0.3333333333333333, 0.0, 0.6666666666666666, 1, 0, 2, 0]
[0.25, 1.0, 1.75, 0.08333333333333333, 0.3333333333333333, 0.5833333333333334, 0.0, 0.3333333333333333, 0.6666666666666666, 0, 1, 2, 1]
[0.5, 0.0, 4.5, 0.1, 0.0, 0.9, 0.2, 0.0, 1.0, 1, 0, 5, 0]
[0.0, 0.75, 4.25, 0.0, 0.15, 0.85, 0.0, 0.2, 0.8, 0, 1, 4, 1]
[0.625, 1.125, 7.25, 0.06944444444444445, 0.125, 0.8055555555555556, 0.0, 0.1111111111111111, 0.8888888888888888, 0, 1, 8, 1]


## Train the model

In [47]:
import datasets, re, math, pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing as sk_preprocess

### Import the dataset

In [8]:
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.describe()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-1d47093687320b66
Reusing dataset parquet (C:\Users\Usuario\.cache\huggingface\datasets\ucberkeley-dlab___parquet\ucberkeley-dlab--measuring-hate-speech-1d47093687320b66\0.0.0\7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,hatespeech,hate_speech_score,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,annotator_age
count,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,...,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0,135388.0
mean,23528.597283,5566.552523,1.281199,2.95428,2.828751,2.563152,2.278333,2.698784,1.845651,1.051666,...,0.744527,-0.567465,1.034367,1.001085,-0.018735,0.300572,1.007152,1.011709,0.014535,37.912799
std,12386.72463,3230.864231,1.023495,1.231662,1.309693,1.389984,1.370983,0.89843,1.402575,1.345656,...,0.932267,2.380312,0.496865,0.791993,0.487189,0.236407,0.269875,0.676072,0.613035,11.643449
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.34,0.1,0.07,-1.82,0.02,0.39,0.28,-1.578693,18.0
25%,18147.75,2720.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,0.0,-2.33,0.71,0.56,-0.38,0.03,0.81,0.67,-0.3411,29.0
50%,20052.0,5600.0,1.0,3.0,3.0,3.0,3.0,3.0,2.0,0.0,...,0.0,-0.34,0.96,0.83,-0.02,0.34,0.97,0.85,0.110405,35.0
75%,32035.0,8363.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,...,2.0,1.41,1.3,1.22,0.35,0.42,1.17,1.13,0.449555,45.0
max,50070.0,11142.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,2.0,6.3,5.9,9.0,1.36,1.9,2.01,9.0,0.987511,81.0


In [9]:
def split_hashtag(token: str) -> str:
    """
    split the hashtags by uppercase letters
    """
    i, end = len(token) - 2, len(token) + 1
    response = ''
    while i >= 1:
        word = token[i:end]
        if word[0].isupper():
            response = token[i:end] + ' ' + response
            end = i
        i -= 1
    else:
        if end-1 > 0:
            response = token[1:end] + ' ' + response
    return(response.strip())

In [10]:
def expand_contractions(document: str) -> str:
    """
    Replace all abbreviations with their corresponding expansion
    """
    document = re.sub(r"'cause", "because", document)
    document = re.sub(r"o'clock", "of the clock", document)
    document = re.sub(r"won\'t", "will not", document)
    document = re.sub(r"can\'t", "can not", document)
    document = re.sub(r"n\'t", " not", document)
    document = re.sub(r"\'re", " are", document)
    document = re.sub(r"\'s", " is", document)
    document = re.sub(r"\'d", " would", document)
    document = re.sub(r"\'ll", " will", document)
    document = re.sub(r"\'t", " not", document)
    document = re.sub(r"\'ve", " have", document)
    document = re.sub(r"\'m", " am", document)
    return document

In [11]:
def preprocessing(document: str) -> list:
    """
    expand contractions
    replace any @user type with 'user' token
    split hashtags into words by upper case
    text to lower case
    remove new line char
    remove non latin chars
    remove extra blank spaces
    """
    document = expand_contractions(document)
    document = re.sub('@[a-zA-Z0-9_-]{0,150}', 'user', document)
    hashtags = re.findall('#[a-zA-Z0-9_-]{0,150}', document)
    for hashtag in hashtags:
        split = split_hashtag(hashtag)
        document = re.sub(hashtag, split, document)
    document = document.lower()
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

In [12]:
def print_metrics(testing_label, predictions):
    """
    extrac the metrics from the testing labels and predictions
    """
    print('Precision score macro: ', format(precision_score(testing_label, predictions, average='macro')))
    print('Recall score macro: ', format(recall_score(testing_label, predictions, average='macro')))
    print('F1 score macro: ', format(f1_score(testing_label, predictions, average='macro')))
    print('Precision score macro: ', format(precision_score(testing_label, predictions, average='micro')))
    print('Recall score micro: ', format(recall_score(testing_label, predictions, average='micro')))
    print('F1 score micro: ', format(f1_score(testing_label, predictions, average='micro')))
    print('Accuracy: ', format(accuracy_score(testing_label, predictions)))
    print('--------------')

In [56]:
data = pd.DataFrame()
data['processed'] = df.text.apply(preprocessing)
data['hatespeech'] = df.hatespeech.apply(lambda x: math.ceil(x/2))
# data['hatespeech'] = df.hate_speech_score.apply(lambda x: 1 if x > -0.5 else 0)
data.shape

(135388, 2)

In [14]:
lexicon = pd.DataFrame.from_records(data.processed.apply(lexicon_sentiwordnet))

In [15]:
lexicon

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3.250,1.750,47.000,0.062500,0.033654,0.903846,0.019231,0.019231,0.942308,1,1,49,0
1,1.250,0.125,8.625,0.125000,0.012500,0.862500,0.100000,0.000000,0.900000,1,0,9,0
2,1.000,2.125,39.875,0.023256,0.049419,0.927326,0.000000,0.046512,0.953488,0,2,41,1
3,0.625,0.500,22.875,0.026042,0.020833,0.953125,0.000000,0.000000,1.000000,0,0,24,0
4,0.375,0.375,26.250,0.013889,0.013889,0.972222,0.000000,0.000000,1.000000,0,0,27,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
135383,0.750,0.375,24.875,0.028846,0.014423,0.956731,0.000000,0.000000,1.000000,0,0,26,0
135384,0.375,1.000,27.625,0.012931,0.034483,0.952586,0.000000,0.034483,1.000000,0,1,29,1
135385,0.500,0.250,18.250,0.026316,0.013158,0.960526,0.000000,0.000000,1.000000,0,0,19,0
135386,0.375,1.000,27.625,0.012931,0.034483,0.952586,0.000000,0.034483,1.000000,0,1,29,1


In [57]:
y_sample = data['hatespeech'].astype(float)
X_sample = lexicon.astype(float) #data['lexicon'].astype(list)

X_train_validation, X_test, y_train_validation, y_test = train_test_split(X_sample.values, 
                                                    y_sample.values, 
                                                    random_state=42,test_size=0.2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=42,test_size=0.125)

print('Number of rows in the total set: {}'.format(X_sample.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 135388
Number of rows in the training set: 94771
Number of rows in the validation set: 13539
Number of rows in the test set: 27078


In [58]:
params = {'alpha': [(i/10) for i in range(1,11)],}
multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(X_train, y_train)
alpha = multinomial_nb_grid.best_params_.get('alpha')

params = {'alpha': [(i/100) + alpha - 0.1 for i in range(1,11)] + [(i/100) + alpha for i in range(1,11)],}
multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(X_train, y_train)
alpha = multinomial_nb_grid.best_params_.get('alpha')

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Accuracy Through Grid Search : 0.5907714073865876
best parameter :  {'alpha': 0.36}


In [59]:
naive_bayes = MultinomialNB(alpha=alpha)
naive_bayes.fit(X_train,y_train)
pred = naive_bayes.predict(X_test)  
print_metrics(y_test, pred)

report = classification_report(y_test, pred)  
print(report)
#En terminos de TP, FP, TN, FN
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

Precision score macro:  0.5687441837225425
Recall score macro:  0.5645196659672183
F1 score macro:  0.5644593721439964
Precision score macro:  0.591845778861068
Recall score micro:  0.591845778861068
F1 score micro:  0.591845778861068
Accuracy:  0.591845778861068
--------------
              precision    recall  f1-score   support

         0.0       0.64      0.71      0.67     16135
         1.0       0.49      0.42      0.46     10943

    accuracy                           0.59     27078
   macro avg       0.57      0.56      0.56     27078
weighted avg       0.58      0.59      0.59     27078



(11408, 4727, 6325, 4618)

In [60]:
pickle.dump(naive_bayes, open('salida/swn_lexicon_naive_bayes.model', 'wb'))

In [54]:
def predictor(text):
    naive_bayes = pickle.load(open('salida/swn_lexicon_naive_bayes.model', 'rb'))
    data = lexicon_sentiwordnet(text)
    return 'Hate Speech' if naive_bayes.predict([data])[0] == 1 else 'Not Hate Speech'

In [55]:
predictor('everybody will die sooner or later')

'Hate Speech'

In [44]:
grid = {
    "C": np.logspace(-3,3,7), 
    "penalty": ["none", "l2"]
}
logreg = LogisticRegression(max_iter=1000, random_state=0, multi_class='ovr')
logreg_cv = GridSearchCV(logreg, grid, n_jobs=-1, cv=5, verbose=5)
logreg_cv.fit(X_train, y_train)
C = logreg_cv.best_params_.get('C')
penalty = logreg_cv.best_params_.get('penalty')

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
accuracy : 0.6265945855613392


In [45]:
logistic_regresion = LogisticRegression(max_iter=1000, random_state=0, multi_class='ovr', C=C, penalty=penalty)
logistic_regresion.fit(X_train, y_train)
pred = logistic_regresion.predict(X_test)  
print_metrics(y_test, pred)

report = classification_report(y_test, pred)  
print(report)
#En terminos de TP, FP, TN, FN
report = classification_report(y_test, pred)  
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
(tn,fp,fn,tp)

Precision score macro:  0.6245116196904977
Recall score macro:  0.6162536450895261
F1 score macro:  0.6125701595482325
Precision score macro:  0.6213531280005908
Recall score micro:  0.6213531280005908
F1 score micro:  0.6213531280005908
Accuracy:  0.6213531280005908
--------------
              precision    recall  f1-score   support

         0.0       0.64      0.49      0.55     12989
         1.0       0.61      0.74      0.67     14089

    accuracy                           0.62     27078
   macro avg       0.62      0.62      0.61     27078
weighted avg       0.62      0.62      0.61     27078



(6374, 6615, 3638, 10451)

# Dataset Metrics

In [21]:
def naive_bayes_prediction(training_data, testing_data, training_label):
    """
    get the naive bayes prediction from training and testing data
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(training_data, training_label)
    return naive_bayes.predict(testing_data)

In [22]:
def logistic_regression_prediction(training_data, testing_data, training_label):
    """
    get the logistic regression prediction from training and testing data
    """
    logistic = LogisticRegression(random_state=0, multi_class='multinomial')
    logistic.fit(training_data, training_label)
    return logistic.predict(testing_data)

In [23]:
y = ['hatespeech', 'hate_speech_score']
columns = ['sentiment', 'respect', 'insult', 
           'humiliate', 'status', 'dehumanize', 
           'violence', 'genocide', 'attack_defend', 
           'hatespeech']

In [24]:
data = df[columns]
min_max_scaler = sk_preprocess.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(data.values)
dataset = pd.DataFrame(data_scaled, columns=columns)
y_sample = dataset.hatespeech.apply(lambda x: int(x))
X_sample = dataset[columns]
X_sample = X_sample.drop(['hatespeech'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=0)
nb_features = naive_bayes_prediction(x_train, x_test, y_train.values)
print_metrics(y_test, nb_features)

Precision score macro:  0.7376187904647078
Recall score macro:  0.5792161831189642
F1 score macro:  0.5573721148778248
Precision score macro:  0.7069699879360859
Recall score micro:  0.7069699879360859
F1 score micro:  0.7069699879360859
Accuracy:  0.7069699879360859
--------------


In [25]:
print(X_sample.shape)
print(y_sample.shape)

(135388, 9)
(135388,)


In [26]:
data = df[columns]
min_max_scaler = sk_preprocess.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(data.values)
dataset = pd.DataFrame(data_scaled, columns=columns)

In [27]:
y_sample = df.hate_speech_score.apply(lambda x: 1 if (x > 0) else 0)
X_sample = dataset[columns]
x_train, x_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=0)
nb_features = naive_bayes_prediction(x_train, x_test, y_train.values)
print_metrics(y_test, nb_features)

Precision score macro:  0.8046798113080479
Recall score macro:  0.7958342039904578
F1 score macro:  0.7984444642829169
Precision score macro:  0.803308959302755
Recall score micro:  0.803308959302755
F1 score micro:  0.803308959302755
Accuracy:  0.803308959302755
--------------
