In [1]:
import os
import pandas as pd
import kagglehub

# 1) Download the dataset; the returned value is used below as the local
#    directory that contains the CSV.
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")

# 2) Read the downloaded CSV file
df = pd.read_csv(os.path.join(path, "Tweets.csv"))

# 3) Show the first rows of the DataFrame.
#    The label and the frame are printed with separate calls: passing both to
#    one print() puts the default separator (a space) right after the newline,
#    which shifts the first header row one column to the right.
print("Primeras 5 filas del DataFrame:")
print(df.head())

Primeras 5 filas del DataFrame:
              tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   

In [2]:
import os
import pandas as pd
import torch
import kagglehub
import re
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# --- Global configuration ---
# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "bert-base-multilingual-cased"

# Kaggle dataset handle and the CSV file read from the downloaded directory.
DATASET_ID = "crowdflower/twitter-airline-sentiment"
CSV_FILENAME = "Tweets.csv"

# Sentiment string -> integer class id; its length is also the number of
# output labels requested from the model.
LABEL_MAPPING = {
    sentiment: class_id
    for class_id, sentiment in enumerate(("negative", "neutral", "positive"))
}

# Seed handed to the train/test split.
RANDOM_SEED = 42

def normalize_columns(df):
    """Return a copy of ``df`` with whitespace-stripped, lower-cased column names.

    The input frame is left untouched, so re-running a notebook cell that
    calls this function never changes a frame that an earlier cell displayed.

    Args:
        df: pandas DataFrame whose column labels should be normalized.

    Returns:
        A new DataFrame with the same data and normalized column labels.
        Labels that are not strings (e.g. integers) are kept as they are
        instead of raising ``AttributeError``.
    """
    def _normalize(label):
        # Only strings have .strip()/.lower(); pass other label types through.
        return label.strip().lower() if isinstance(label, str) else label

    return df.rename(columns=_normalize)

def encode_labels(df, colname, label_map):
    """Return a copy of ``df`` with ``colname`` translated through ``label_map``.

    Args:
        df: pandas DataFrame containing the label column.
        colname: name of the column holding the raw labels.
        label_map: dict mapping each raw label to its encoded value.

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        ValueError: if the column contains a non-null value that is not a key
            of ``label_map``. ``Series.map`` would turn such values into NaN
            without any warning (this also happens when the function is run a
            second time on already-encoded data), so the problem is reported
            here instead of surfacing later as corrupted labels.
    """
    raw_labels = df[colname]

    # Missing values are allowed through (they stay NaN, as before); only
    # present-but-unknown labels are treated as an error.
    is_unknown = raw_labels.notna() & ~raw_labels.isin(list(label_map))
    if is_unknown.any():
        unknown_values = sorted({repr(value) for value in raw_labels[is_unknown]})
        raise ValueError(
            f"Column {colname!r} contains labels missing from label_map: "
            + ", ".join(unknown_values)
        )

    encoded = df.copy()
    encoded[colname] = raw_labels.map(label_map)
    return encoded

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores; the arg-max over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

def clean_text(text):
    """Strip mentions, URLs and punctuation/symbols from a tweet.

    Steps, in order:
      1. remove ``@mention`` tokens,
      2. remove anything starting with ``http`` up to the next whitespace,
      3. remove every character that is not a letter, digit or whitespace,
      4. trim leading/trailing whitespace.

    Step 3 is Unicode-aware: accented and non-Latin letters (e.g. ``é``,
    ``ñ``) are kept. An ASCII-only character class would delete them and
    mangle any non-English text fed to the multilingual model. The underscore
    is still removed, matching the previous behaviour for ASCII input.

    Args:
        text: the raw tweet. ``None`` and float NaN (how pandas represents a
            missing cell) yield an empty string instead of raising
            ``TypeError``; any other non-string value is converted with
            ``str()``.

    Returns:
        The cleaned string.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    # \w matches Unicode letters/digits for str patterns; it also matches "_",
    # which is removed explicitly by the second alternative.
    text = re.sub(r'[^\w\s]|_', '', text)
    return text.strip()

class AirlineSentimentDataset(torch.utils.data.Dataset):
    """Dataset for BERT sentiment training.

    Pairs a mapping of tokenizer outputs with a parallel sequence of labels.
    Indexing returns one sample as a dict holding every encoding field at that
    position plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable collection of samples
        # labels: indexable collection aligned with the encodings
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# --- Load data ---
print("Descargando dataset de Kaggle...")
dataset_path = kagglehub.dataset_download(DATASET_ID)

df = pd.read_csv(os.path.join(dataset_path, CSV_FILENAME))
print("Primeras 5 filas del DataFrame:")
print(df.head())

# --- Prepare text and labels ---
df = normalize_columns(df)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the full table (avoids SettingWithCopyWarning).
df = df[['text', 'airline_sentiment']].copy()
df['text'] = df['text'].apply(clean_text)
df = encode_labels(df, 'airline_sentiment', LABEL_MAPPING)
# Printed separately from the label so the first output row is not shifted by
# print()'s default space separator.
print("Distribución de etiquetas:")
print(df['airline_sentiment'].value_counts())

# --- Stratified train/test split ---
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['airline_sentiment'].tolist(),
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['airline_sentiment']
)

# --- Tokenize ---
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

train_dataset = AirlineSentimentDataset(train_encodings, train_labels)
test_dataset = AirlineSentimentDataset(test_encodings, test_labels)

# --- Model ---
# The classification head is not part of the pretrained checkpoint and is
# randomly initialized, so seed torch first to make that step repeatable.
torch.manual_seed(RANDOM_SEED)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABEL_MAPPING))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_safetensors=False,
    save_strategy="epoch",
    seed=RANDOM_SEED,  # same seed as the data split, stated explicitly
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# --- Train ---
print("Entrenando modelo...")
train_output = trainer.train()
print(train_output)

# --- Evaluate ---
# eval_dataset and compute_metrics are configured on the trainer, but nothing
# triggers an evaluation during training; run one explicitly so accuracy and
# F1 on the held-out split are actually reported.
print("Evaluando modelo...")
eval_metrics = trainer.evaluate()
print(eval_metrics)

Descargando dataset de Kaggle...
Primeras 5 filas del DataFrame:
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin      

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entrenando modelo...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
100,0.9294
200,0.7533
300,0.7361
400,0.7093
500,0.6717
600,0.6768
700,0.6501
800,0.6261
900,0.6674
1000,0.6619


TrainOutput(global_step=4392, training_loss=0.5099039894201283, metrics={'train_runtime': 669.6476, 'train_samples_per_second': 52.469, 'train_steps_per_second': 6.559, 'total_flos': 1065313339203456.0, 'train_loss': 0.5099039894201283, 'epoch': 3.0})