# Sentence transformers

In [35]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch, pandas as pd, numpy as np

In [16]:
# Read the training CSV into a DataFrame. The path is relative ('../...'), so
# it resolves against the directory the notebook kernel was started in.
data_path = '../Hate speech/Competitions/HOMO-Mex/Datasets/Clean datasets/homomex_training.csv'
data = pd.read_csv(data_path)
# NOTE(review): the preview shows leftover index-like columns ('Unnamed: 0.1',
# 'Unnamed: 0', 'index'); only 'tweets' and 'label' are used further down.
data.head()

Unnamed: 0.1,Unnamed: 0,index,tweets,label
0,0,0,me quise ligar a una chava ayer y no me pelo l...,0
1,1,1,papaya rockera eres un punal papayita,0
2,2,2,magnate ofrece mdd al hombre que conquiste a s...,0
3,3,3,los trolebuses del desgobierno de epn son idio...,0
4,4,4,en epoca de hitler no se decia eres gay y si e...,0


In [17]:
# The same checkpoint is loaded three ways:
#   model_1   - SentenceTransformer wrapper (exposes .encode on raw strings)
#   tokenizer - Hugging Face tokenizer for the manual pipeline
#   model     - bare Hugging Face model for the manual pipeline
MODEL_NAME = 'hiiamsid/sentence_similarity_spanish_es'
model_1 = SentenceTransformer(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [36]:
def encode_sentence(sentence, model):
    """Embed a single sentence and return its vector as a NumPy array.

    The sentence is wrapped in a one-element list, handed to ``model.encode``,
    and the first (only) row of the result is copied into a new array.

    Args:
        sentence: The text to embed.
        model: Any object exposing ``encode(list_of_sentences)``; in this
            notebook it is called with the SentenceTransformer ``model_1``.

    Returns:
        An ``np.ndarray`` built from the first row returned by ``model.encode``.
    """
    batch = model.encode([sentence])
    first_row = batch[0]
    return np.array(first_row)

In [18]:
# Sanity check: embed two sample sentences with the SentenceTransformer
# wrapper and print the resulting array (one row per sentence).
oraciones_ejemplo = ['Hola como estas', 'Me gusta jugar']
embeddings = model_1.encode(oraciones_ejemplo)
print(embeddings)

[[-0.00309469 -0.46023375 -0.4813428  ... -0.27846947  0.15598269
  -0.8559532 ]
 [-0.0842302  -1.6434591  -0.5594735  ... -0.14854005 -0.645719
   0.6444329 ]]


In [20]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose element ``0`` holds the
            token embeddings tensor.
        attention_mask: Tensor that, after a trailing dimension is added, is
            expanded to the shape of the token embeddings; positions with 0
            contribute nothing to the average.

    Returns:
        Tensor of masked sums over dimension 1 divided by the mask totals
        over dimension 1.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

In [21]:
oraciones_ejemplo = ['Hola como estas', 'Me gusta jugar']
# Tokenize sentences: pad the batch to a common length, truncate over-long
# inputs, and return PyTorch tensors ('pt') ready to be fed to the model.
encoded_input = tokenizer(oraciones_ejemplo, 
                          padding=True, 
                          truncation=True, 
                          return_tensors='pt')

In [22]:
# Compute token embeddings. Gradient tracking is disabled because this is
# inference only.
# NOTE: `model` must still be the AutoModel loaded earlier; the classification
# section rebinds `model` to a LogisticRegression, so this cell fails if it is
# re-run after that one.
with torch.no_grad():
    model_output = model(**encoded_input)

In [23]:
# Perform pooling. In this case, mean pooling: token embeddings are averaged
# per sentence, weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [37]:
# Embed every tweet with one batched encode() call instead of invoking the
# model once per row through .apply (a batch of size 1 each time), which is
# much slower on the full dataset.
tweet_vectors = model_1.encode(data['tweets'].tolist())
# Store one 1-D vector per row, matching what the row-wise version produced.
data['vectors'] = list(tweet_vectors)

In [38]:
# Preview the frame again to confirm the 'vectors' column was added.
data.head()

Unnamed: 0.1,Unnamed: 0,index,tweets,label,vectors
0,0,0,me quise ligar a una chava ayer y no me pelo l...,0,"[0.3846999, -0.12859814, -0.2732565, 0.0314256..."
1,1,1,papaya rockera eres un punal papayita,0,"[-0.3636112, 0.45495006, 0.8360649, 0.18059659..."
2,2,2,magnate ofrece mdd al hombre que conquiste a s...,0,"[0.27819762, 0.2836302, -0.7354787, 0.20708331..."
3,3,3,los trolebuses del desgobierno de epn son idio...,0,"[0.24197638, 0.23567112, 0.043693393, 0.337896..."
4,4,4,en epoca de hitler no se decia eres gay y si e...,0,"[0.64865184, -0.24659686, -0.29377174, 0.23340..."


In [39]:
# Targets are the 'label' column and features are the per-tweet embedding
# vectors, both as plain Python lists.
y = data['label'].tolist()
X = data['vectors'].tolist()

## Classification

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [41]:
# Single seed shared by the classifier and the train/validation split.
seed = 42
# NOTE(review): this rebinds `model`, which earlier held the AutoModel used by
# the token-embedding cell; re-running that cell after this one would call the
# classifier instead. The name is kept because the following cells call
# `model.fit` / `model.predict`.
model = LogisticRegression(random_state = seed, 
                            penalty = 'l2', 
                            solver = 'liblinear', 
                            max_iter = 1000)
# Hold out 10% of the rows for validation; the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, 
                                                  y, 
                                                  test_size=0.1, 
                                                  random_state=seed)
# Display names passed to classification_report.
# NOTE(review): assumes labels 0/1/2 correspond to P/NP/NR in this order —
# TODO confirm against the dataset's label encoding.
target_names = ['LGBT+phobic (P)', 'Not LGBT+phobic (NP)', 'Not LGBT+related (NR)']

In [42]:
# Fit the logistic-regression classifier on the training embeddings.
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [43]:
# Predict labels for the held-out validation embeddings.
y_pred = model.predict(X_val)

In [44]:
# Per-class precision / recall / F1 on the validation split; printed because
# classification_report returns a pre-formatted string.
print(metrics.classification_report(y_val, y_pred, target_names=target_names))

                       precision    recall  f1-score   support

      LGBT+phobic (P)       0.51      0.38      0.44        94
 Not LGBT+phobic (NP)       0.82      0.87      0.84       432
Not LGBT+related (NR)       0.69      0.69      0.69       174

             accuracy                           0.76       700
            macro avg       0.67      0.65      0.66       700
         weighted avg       0.75      0.76      0.75       700

