In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from scipy.special import softmax
import torch

from datasets import Dataset

from transformers import AutoConfig,Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score





In [2]:
import os
import sys

sep = os.path.sep
dir_actual = os.path.abspath('')
PATH = sep.join(dir_actual.split(sep)[:-1])
DIR_DATA = PATH + '{0}Reto 3er{0}data{0}'.format(os.sep)
sys.path.append(PATH) if PATH not in list(sys.path) else None
DIR_DATA

'c:\\Users\\monto\\Documents\\Universidad\\IA\\Reto 3er\\data\\'

In [3]:
filename = DIR_DATA + 'train.csv'
df_train = pd.read_csv(filename, sep = ',')



In [4]:
filename = DIR_DATA + 'test.csv'
df_test = pd.read_csv(filename, sep = ',')


In [5]:
filename = DIR_DATA + 'submission.csv'
df_submi = pd.read_csv(filename, sep = ',')

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                351 non-null    int64  
 1   Sitio             351 non-null    object 
 2   Índice del lugar  351 non-null    int64  
 3   Nombre del lugar  351 non-null    object 
 4   Enlace del lugar  351 non-null    object 
 5   Municipio         351 non-null    object 
 6   Valoración        351 non-null    object 
 7   Valoraciones      351 non-null    object 
 8   Precio            351 non-null    object 
 9   Comentario        351 non-null    object 
 10  Fecha             351 non-null    object 
 11  Votos a favor     351 non-null    int64  
 12  Votos en contra   351 non-null    int64  
 13  Valoración_num    351 non-null    float64
 14  Sentimiento       351 non-null    object 
dtypes: float64(1), int64(4), object(10)
memory usage: 41.3+ KB


In [7]:
df_train = df_train[['Comentario','Sentimiento']]

In [8]:
df_train

Unnamed: 0,Comentario,Sentimiento
0,La isla es muy hermosa y tiene unas aguas muy ...,neutral
1,"Lahamburguesa de polloy elpatacónDoki, son del...",positivo
2,"La playa es preciosa, de aguas mansas y playas...",neutral
3,es un paraíso de playa Para llegar a la isla l...,neutral
4,Es muy buena escuela de buceo; excelente los l...,neutral
...,...,...
813,Lomitoa lasfinas hierbas... buenisimo!,positivo
814,Hicimos un excelente fun dive con Stefania y G...,neutral
815,El parque a su alrededor estaba en remodelació...,neutral
816,"La mejor experiencia en Coveñas, totalmente re...",neutral


In [9]:
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

In [10]:
x_train= df_train['Comentario'].values
y_train= df_train['Sentimiento'].values

In [11]:
x_test= df_test['Comentario'].values
y_test= df_test['Sentimiento'].values


In [12]:
x_submi= df_submi['Comentario'].values
y_submi= df_submi['Sentimiento'].values

In [13]:
token_lens= []
for txt in x_train:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)
max_length

np.int64(512)

In [14]:
max_len = 600

In [15]:
def encode_sentences(sentences, max_len):
    
    encoding = tokenizer(sentences, truncation=True, padding=True, max_length=max_len, return_tensors='tf')
    return encoding

In [16]:
train_encodings = tokenizer(x_train.tolist(), truncation=True, padding=True, return_tensors="pt", max_length=512,)
eval_encodings = tokenizer(x_test.tolist(), truncation=True, padding=True, return_tensors="pt", max_length=512,)



In [17]:
roberta_model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment',)


In [18]:
label_map = {'negativo': 0, 'neutral': 1, 'positivo': 2}
y_train = [label_map[label] for label in y_train]
y_test = [label_map[label] for label in y_test]

In [19]:
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "label": y_train
}).with_format("torch")


In [20]:
eval_dataset = Dataset.from_dict({
    "input_ids": eval_encodings["input_ids"],
    "attention_mask": eval_encodings["attention_mask"],
    "label": y_test
}).with_format("torch")

In [21]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [34]:
training_args = TrainingArguments(
    output_dir='./Roberta-lamonda2',
    per_device_train_batch_size=18,
    num_train_epochs=5,
    weight_decay=0.01,
    seed=1,
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=500,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    warmup_steps=200,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    lr_scheduler_type='cosine'
)

In [24]:
print("CUDA disponible:", torch.cuda.is_available())
print("Número de GPUs:", torch.cuda.device_count())

CUDA disponible: True
Número de GPUs: 1


In [35]:
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [None]:
trainer.train()

In [28]:
roberta_model = RobertaForSequenceClassification.from_pretrained("./Roberta-lamonda/checkpoint-312")

In [30]:
def prediction_function(df) -> np.ndarray:
    with torch.no_grad():
        encoded_input = tokenizer(df.tolist(), truncation=True, padding=True, return_tensors="pt", max_length=512,)
        output = roberta_model(**encoded_input)
    return softmax(output['logits'].detach().numpy(), axis=1)

In [None]:
out = prediction_function(x_train)

In [None]:
predicted_labels = np.argmax(out, axis=1)

In [None]:

precision = precision_score(y_train, predicted_labels, average='weighted')
recall = recall_score(y_train, predicted_labels, average='weighted')
f1 = f1_score(y_train, predicted_labels, average='weighted')

print(f"Precisión (weighted): {precision:.6f}")
print(f"Exhaustividad (recall, weighted): {recall:.6f}")
print(f"F1 Score (weighted): {f1:.6f}")


Precisión (weighted): 0.979010
Exhaustividad (recall, weighted): 0.979218
F1 Score (weighted): 0.978843


In [31]:
out = prediction_function(x_test)

In [32]:
predicted_labels = np.argmax(out, axis=1)


In [33]:
precision = precision_score(y_test, predicted_labels,average='weighted' )
recall = recall_score(y_test, predicted_labels,average='weighted' )
f1 = f1_score(y_test, predicted_labels,average='weighted' )

print("Precisión:", precision)
print("recall:", recall)
print("F1 Score:", f1)

Precisión: 0.9440309862136488
recall: 0.9401709401709402
F1 Score: 0.9360421155292953


In [None]:
out = prediction_function(x_submi)


In [None]:
label_map = {0: 'negativo', 1: 'neutral', 2: 'positivo'}
predicted_labels = np.argmax(out, axis=1)
predicted_text_labels = [label_map[label] for label in predicted_labels]

In [None]:
predicted_text_labels

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negativo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negativo',
 'positivo',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'positivo',
 'positivo',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'neutral',
 'neutral',
 'positivo',
 'neutral',
 'positivo',
 'positivo',
 'neutral',
 'neutral

In [None]:
df_submi['Sentimiento'] = predicted_text_labels
df = df_submi['ID','Sentimiento'] 

ValueError: Length of values (818) does not match length of index (129)

In [None]:
df.to_csv(filename, index=False)