In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from scipy.special import softmax
import torch

from datasets import Dataset

from transformers import AutoConfig,Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
import os
import sys

# Locate the project root (the parent of the notebook's working directory),
# derive the data directory from it and make the root importable.
sep = os.path.sep
dir_actual = os.path.abspath('')
# Dropping the last path component yields the parent directory.
parent_parts = dir_actual.split(sep)[:-1]
PATH = sep.join(parent_parts)
DIR_DATA = f'{PATH}{os.sep}Reto 3er{os.sep}data{os.sep}'
if PATH not in sys.path:
    sys.path.append(PATH)
DIR_DATA

In [3]:
# Read the training split from the data directory.
filename = f'{DIR_DATA}train.csv'
df_train = pd.read_csv(filename, sep=',')



In [4]:
# Read the test split from the data directory.
filename = f'{DIR_DATA}test.csv'
df_test = pd.read_csv(filename, sep=',')


In [5]:
# Read the submission file; `filename` keeps pointing at it afterwards and is
# reused as the output path by the final export cell.
filename = f'{DIR_DATA}submission.csv'
df_submi = pd.read_csv(filename, sep=',')

In [None]:
# Print a summary of the test frame (columns, dtypes, non-null counts).
df_test.info()

In [7]:
# Keep only the text column and its sentiment label.
df_train = df_train.loc[:, ['Comentario', 'Sentimiento']]

In [None]:
# Display the training frame after the column selection.
df_train

In [9]:
# Load the tokenizer that belongs to the checkpoint fine-tuned below.
# NOTE(review): the column and label names in this notebook are Spanish —
# confirm this checkpoint is appropriate for the language of the comments.
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

In [10]:
# Split the training frame into raw texts and their (string) labels.
x_train, y_train = (
    df_train[column].values for column in ('Comentario', 'Sentimiento')
)

In [11]:
# Split the test frame into raw texts and their (string) labels.
x_test, y_test = (
    df_test[column].values for column in ('Comentario', 'Sentimiento')
)


In [None]:
# Extract the texts to score and the existing Sentimiento column from the
# submission frame.
x_submi, y_submi = (
    df_submi[column].values for column in ('Comentario', 'Sentimiento')
)

In [None]:
# Token count of every training comment (encoding is truncated at 512 tokens),
# followed by the longest length observed.
token_lens = [
    len(tokenizer.encode(comment, max_length=512, truncation=True))
    for comment in x_train
]
max_length = np.max(token_lens)
max_length

In [None]:
# Maximum sequence length handed to the tokenizer. RoBERTa-base checkpoints
# accept at most 512 tokens, and every other tokenizer call in this notebook
# uses 512, so the previous value of 600 could produce inputs the model cannot
# embed.
max_len = 512

In [None]:
def encode_sentences(sentences, max_len, return_tensors='tf'):
    """Tokenise ``sentences`` with the notebook-level ``tokenizer``.

    Inputs are truncated to ``max_len`` tokens and padded to the longest
    sequence in the batch.

    Args:
        sentences: Text(s) to encode, in any form the tokenizer accepts.
        max_len: Upper bound on the number of tokens per sequence.
        return_tensors: Tensor framework of the returned encoding. Defaults to
            ``'tf'`` for backward compatibility; pass ``'pt'`` to get PyTorch
            tensors like the rest of this notebook uses.

    Returns:
        The encoding object returned by the tokenizer.
    """
    return tokenizer(
        sentences,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors=return_tensors,
    )

In [None]:
# Tokenise both splits with identical settings: truncate to 512 tokens, pad to
# the longest sequence of the split, return PyTorch tensors.
tokenize_kwargs = dict(truncation=True, padding=True, return_tensors="pt", max_length=512)
train_encodings = tokenizer(x_train.tolist(), **tokenize_kwargs)
eval_encodings = tokenizer(x_test.tolist(), **tokenize_kwargs)



In [None]:
# Load the pretrained sequence-classification model that will be fine-tuned.
roberta_model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment',)


In [None]:
# Map the string sentiment labels to integer class ids.
# NOTE(review): presumably this matches the checkpoint's class order
# (0 = negative, 1 = neutral, 2 = positive) — confirm against the model card.
label_map = {'negativo': 0, 'neutral': 1, 'positivo': 2}
# Read the labels from the DataFrames instead of from y_train / y_test
# themselves: the cell then gives the same result on every run, whereas
# re-mapping the already-mapped integer lists raised KeyError on a re-run.
y_train = [label_map[label] for label in df_train['Sentimiento']]
y_test = [label_map[label] for label in df_test['Sentimiento']]

In [None]:
# Bundle the tokenised training inputs with their integer labels into a
# Dataset that yields PyTorch tensors.
train_features = {
    key: train_encodings[key] for key in ("input_ids", "attention_mask")
}
train_features["label"] = y_train
train_dataset = Dataset.from_dict(train_features).with_format("torch")


In [None]:
# Bundle the tokenised evaluation inputs with their integer labels into a
# Dataset that yields PyTorch tensors.
eval_features = {
    key: eval_encodings[key] for key in ("input_ids", "attention_mask")
}
eval_features["label"] = y_test
eval_dataset = Dataset.from_dict(eval_features).with_format("torch")

In [None]:
# Make both datasets return torch tensors for the three model columns.
for split_dataset in (train_dataset, eval_dataset):
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Hyper-parameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./Roberta-Trained',
    per_device_train_batch_size=18,
    num_train_epochs=5,
    weight_decay=0.01,
    seed=1,
    # Evaluate and checkpoint on a step schedule.
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=500,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    warmup_steps=200,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # this cell fail on CPU-only machines, so switch it on only when a GPU is
    # actually available.
    fp16=torch.cuda.is_available(),
    # The best checkpoint is selected by the 'accuracy' evaluation metric,
    # which the Trainer must be given a compute_metrics function to produce.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

In [None]:
# Report whether CUDA is usable and how many GPUs PyTorch can see.
cuda_available = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print("CUDA disponible:", cuda_available)
print("Número de GPUs:", gpu_count)

In [None]:
def compute_metrics(eval_pred):
    """Compute the accuracy of the argmax predictions for one evaluation pass.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes after
            evaluating on ``eval_dataset``.

    Returns:
        ``{"accuracy": value}``; the Trainer reports it as ``eval_accuracy``,
        which is the metric ``metric_for_best_model='accuracy'`` looks up.
    """
    logits, labels = eval_pred
    return {"accuracy": accuracy_score(labels, np.argmax(logits, axis=-1))}


# Without compute_metrics the evaluation results contain no accuracy entry, so
# the 'accuracy' metric requested in training_args could never be found.
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

In [None]:
# Reload the fine-tuned weights from disk for inference.
checkpoint_path = "./Roberta-Trained/checkpoint-312"
if not os.path.isdir(checkpoint_path):
    # The step number in the directory name depends on dataset size, batch size
    # and epoch count, so a different run will not produce "checkpoint-312".
    # Fall back to the most recent checkpoint in the output directory.
    from transformers.trainer_utils import get_last_checkpoint

    checkpoint_path = (
        get_last_checkpoint("./Roberta-Trained")
        if os.path.isdir("./Roberta-Trained")
        else None
    )
    if checkpoint_path is None:
        raise FileNotFoundError(
            "No checkpoint found under ./Roberta-Trained; run trainer.train() first."
        )
roberta_model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)

In [None]:
def prediction_function(df, batch_size: int = 32) -> np.ndarray:
    """Return class probabilities for a collection of texts.

    The texts are tokenised and scored in mini-batches so that large inputs do
    not have to fit through the model in a single forward pass.

    Args:
        df: Texts to score. Anything exposing ``tolist()`` (e.g. a NumPy array)
            or any other iterable of strings.
        batch_size: Number of texts per forward pass.

    Returns:
        Array of shape ``(len(df), num_labels)`` holding the softmax of the
        model logits for each text, in input order.
    """
    texts = df.tolist() if hasattr(df, "tolist") else list(df)
    if not texts:
        # Nothing to tokenise; return an empty, correctly shaped result.
        return np.empty((0, roberta_model.config.num_labels), dtype=np.float32)

    # Inputs must live on the same device as the model weights.
    model_device = next(roberta_model.parameters()).device
    # Disable dropout so predictions are deterministic.
    roberta_model.eval()

    logit_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded_input = tokenizer(
                texts[start:start + batch_size],
                truncation=True,
                padding=True,
                return_tensors="pt",
                max_length=512,
            )
            encoded_input = {
                key: value.to(model_device) for key, value in encoded_input.items()
            }
            output = roberta_model(**encoded_input)
            # .cpu() is required before .numpy() when the model runs on a GPU.
            logit_chunks.append(output['logits'].detach().cpu().numpy())
    return softmax(np.concatenate(logit_chunks, axis=0), axis=1)

In [None]:
# Class probabilities for the training texts.
out = prediction_function(x_train)

In [None]:
# The predicted class is the column with the highest probability in each row.
predicted_labels = out.argmax(axis=1)

In [None]:

# Weighted precision / recall / F1 of the current predictions against the
# training labels, printed with six decimals.
precision, recall, f1 = (
    metric(y_train, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f"Precisión (weighted): {precision:.6f}",
    f"Exhaustividad (recall, weighted): {recall:.6f}",
    f"F1 Score (weighted): {f1:.6f}",
    sep="\n",
)


In [None]:
# Class probabilities for the test texts (rebinds `out`).
out = prediction_function(x_test)

In [None]:
# The predicted class is the column with the highest probability in each row.
predicted_labels = out.argmax(axis=1)


In [None]:
# Weighted precision / recall / F1 of the current predictions against the
# test labels.
precision, recall, f1 = (
    metric(y_test, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for caption, value in (("Precisión:", precision), ("recall:", recall), ("F1 Score:", f1)):
    print(caption, value)

In [None]:
# Class probabilities for the submission texts (rebinds `out`).
out = prediction_function(x_submi)


In [None]:
# The predicted class is the column with the highest probability in each row.
predicted_labels = out.argmax(axis=1)

In [None]:
# Display the predicted class ids for the submission texts.
predicted_labels

In [None]:
# Store the predicted class ids in the Sentimiento column.
# NOTE(review): these are the integer ids (0/1/2), not the string labels used
# in the training data — confirm which form the submission expects.
df_submi = df_submi.assign(Sentimiento=predicted_labels)


In [None]:
# Display the submission frame with the predictions filled in.
df_submi

In [None]:
# Keep only the identifier and the predicted sentiment for export.
df_submi = df_submi.loc[:, ['ID', 'Sentimiento']]

In [None]:
# Write the predictions to the submission file. The path is spelled out here
# instead of reusing the `filename` global, which the loading cells rebind:
# re-running one of them before this cell would otherwise overwrite train.csv
# or test.csv with the predictions.
# NOTE(review): this replaces the input submission.csv with a file holding only
# the ID and Sentimiento columns, so the earlier cell that reads
# df_submi['Comentario'] will fail on a re-run — consider exporting to a
# separate output file.
df_submi.to_csv(DIR_DATA + 'submission.csv', index=False)