In [0]:
import boto3

# S3 client reused by the cells below. No credentials are passed explicitly here;
# presumably boto3 picks them up from the runtime environment — TODO confirm.
s3 = boto3.client("s3")

In [0]:
bucket_name = "s3-cyberbullying-classification-data"

# list_objects_v2 returns at most 1000 keys per call, so iterate over every page
# instead of reading a single response; otherwise large buckets are silently truncated.
paginator = s3.get_paginator("list_objects_v2")

# Print every object key; remember whether anything was listed at all.
found_any = False
for page in paginator.paginate(Bucket=bucket_name):
    # A page without "Contents" simply has no objects in it.
    for obj in page.get("Contents", []):
        found_any = True
        print(obj["Key"])

if not found_any:
    print("No se encontraron archivos o no tienes permisos")


In [0]:
%pip uninstall -y spacy
!pip install spacy==3.7.2
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import spacy

# ==============================
# 1. Load data
# ==============================
# Key of the dataset object inside the bucket.
key = "files/md5/c8/5e2d40bf87b27619f2a4c49fcb9cda"
# Fetch the object; `obj["Body"]` is handed straight to pandas below.
obj = s3.get_object(Bucket=bucket_name, Key=key)

# Parse the object body as CSV into the working DataFrame.
data = pd.read_csv(obj["Body"])


# ==============================
# 2. Preprocesamiento simple
# ==============================
def clean_tweet(text):
    """Normalise a tweet for vectorisation.

    The text is lower-cased, then URLs starting with "http", @mentions and "#"
    signs are removed, followed by every character that is not a lowercase
    letter (including á é í ó ú ü ñ) or whitespace. Leading and trailing
    whitespace is stripped; inner whitespace is left as-is.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string.
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must go before the catch-all character
    # filter, which would otherwise strip their punctuation and leave fragments.
    removal_patterns = (
        r"http\S+",
        r"@\w+",
        r"#",
        r"[^a-záéíóúüñ\s]",
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Replace missing tweets with "" before the string cast: a bare astype(str) would
# turn NaN into the literal text "nan", which would then be vectorised as a word.
data["clean_text"] = data["tweet_text"].fillna("").astype(str).apply(clean_tweet)
print("[LOG] Preprocesamiento completado. Ejemplo:", data["clean_text"].iloc[0])

# Both vectorizers are fitted on the same cleaned-text column.
corpus = data["clean_text"]

# ==============================
# 3. Word-level TF-IDF (unigrams and bigrams)
# ==============================
# English stop words are dropped and the vocabulary is capped at 7500 features.
word_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=7500)
X_tfidf_words = word_vectorizer.fit_transform(corpus)
print("[LOG] TF-IDF de palabras listo. Dimensión:", X_tfidf_words.shape)

# ==============================
# 4. Character-level TF-IDF (3- to 5-grams)
# ==============================
# Character n-grams, capped at 500 features.
char_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(3, 5), analyzer="char")
X_tfidf_chars = char_vectorizer.fit_transform(corpus)
print("[LOG] TF-IDF de caracteres listo. Dimensión:", X_tfidf_chars.shape)

# ==============================
# 5. Embeddings densos con spaCy
# ==============================
!python -m spacy download en_core_web_smimport spacy

nlp = spacy.load("en_core_web_sm")


def get_glove_embedding(text, embedding_dim=96):
    """Return a dense vector for `text` using the module-level `nlp` pipeline.

    NOTE(review): despite "glove" in the name, the vector is whatever the loaded
    spaCy pipeline exposes as `doc.vector` — confirm this is the intended embedding.

    Args:
        text: The string to embed.
        embedding_dim: Length of the fallback zero vector. The default of 96 is
            the size the original author noted for en_core_web_sm.

    Returns:
        `doc.vector` when the document reports having a vector, otherwise a
        zero array of length `embedding_dim`.
    """
    parsed = nlp(text)
    if parsed.has_vector:
        return parsed.vector
    # No vector available: fall back to zeros of the requested length.
    return np.zeros(embedding_dim)


# One embedding per cleaned tweet, collected first and then stacked with np.array.
embedding_rows = [get_glove_embedding(tweet) for tweet in data["clean_text"]]
X_glove = np.array(embedding_rows)
print("[LOG] Embeddings GloVe generados. Dimensión:", X_glove.shape)

# Standardise the dense embeddings; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_glove_scaled = scaler.fit_transform(X_glove)
print("[LOG] Escalado de embeddings completado.")


# ==============================
# 6. Combine everything
# ==============================
# Column-wise stack: word TF-IDF, character TF-IDF, then the scaled embeddings
# multiplied by 2.
feature_blocks = [X_tfidf_words, X_tfidf_chars, 2 * X_glove_scaled]
X_combined = hstack(feature_blocks)
# Target column for the classifier.
y = data["cyberbullying_type"]
print("[LOG] Matriz final lista. Dimensión:", X_combined.shape)


In [0]:
import joblib
# Persist the feature matrix, the labels and the fitted transformers.
# Mapping is target file name -> object; files are written in this order.
artifacts = {
    "X_features.pkl": X_combined,
    "y_labels.pkl": y,
    "tfidf_vectorizer.pkl": word_vectorizer,
    "char_vectorizer.pkl": char_vectorizer,
    "scaler.pkl": scaler,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

print("[LOG] Features y etiquetas guardadas exitosamente.")


In [0]:
!pip install xgboost
import mlflow
import mlflow.xgboost
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from xgboost import XGBClassifier
import itertools
import time

# ==============================
# 1. Load features and labels
# ==============================
# NOTE: joblib.load unpickles the files, so only load artifacts you produced yourself.
X = joblib.load("X_features.pkl")
y = joblib.load("y_labels.pkl")
print("[LOG] Features y etiquetas cargadas:", X.shape, len(y))

# ==============================
# 2. Encode labels
# ==============================
# Fit the encoder on the labels and encode them in one step.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("[LOG] Etiquetas codificadas:", np.unique(y_encoded))

# ==============================
# 3. Train/test split
# ==============================
# 80/20 split with a fixed seed.
# NOTE(review): no `stratify` argument is passed — confirm an unstratified split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=55
)
print("[LOG] División train/test:", X_train.shape, X_test.shape)

# ==============================
# 4. Define the hyper-parameter grid
# ==============================
param_grid = {
    "max_depth": [5, 7],
    "learning_rate": [0.01, 0.005],
    "n_estimators": [100, 200],
    "subsample": [0.8, 0.9],
}

# Expand the grid into one dict per combination (Cartesian product of the value
# lists, with the last parameter varying fastest).
keys = tuple(param_grid)
values = tuple(param_grid.values())
param_combinations = []
for combo in itertools.product(*values):
    param_combinations.append(dict(zip(keys, combo)))
print(f"[LOG] Total de combinaciones: {len(param_combinations)}")

# ==============================
# 5. Configure the MLflow experiment
# ==============================
# NOTE(review): the experiment path is hard-coded to a single user's workspace
# folder; presumably it needs adjusting when someone else runs this notebook.
mlflow.set_experiment("/Workspace/Users/c.palma@uniandes.edu.co/xgboost_gridsearch")


# Collects one result dict per run so the best combination can be picked afterwards
results = []

# ==============================
# 6. Grid search with MLflow
# ==============================
# The label encoder is identical for every run, so serialise it once up front;
# each run only attaches the already-written file as an artifact.
joblib.dump(label_encoder, "label_encoder.pkl")

for i, params in enumerate(param_combinations):
    # One MLflow run per hyper-parameter combination.
    with mlflow.start_run(run_name=f"run_{i}") as run:
        print(f"\n[LOG] Entrenando combinación {i+1}: {params}")

        # Build the XGBoost model: fixed settings plus the grid values for this run.
        # NOTE(review): device="cuda" is hard-coded — confirm a GPU is always available.
        clf = XGBClassifier(
            objective="multi:softmax",
            n_jobs=-1,
            device="cuda",
            min_child_weight=4,
            gamma=0.01,
            reg_alpha=0,
            reg_lambda=1,
            **params
        )

        # Log the grid parameters to MLflow
        mlflow.log_params(params)

        # Training; wall-clock time of fit() is recorded as a metric.
        # No early stopping is configured here, so eval_set is presumably
        # only monitored during fitting — TODO confirm.
        start_time = time.time()
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        elapsed_time = time.time() - start_time
        mlflow.log_metric("training_time_sec", elapsed_time)

        # Evaluation with macro-averaged recall.
        # NOTE(review): the same X_test is later used to rank combinations, so the
        # winning score is an optimistic estimate; consider a separate validation set.
        y_pred = clf.predict(X_test)
        recall = recall_score(y_test, y_pred, average="macro")
        mlflow.log_metric("recall_macro", recall)

        # Store the model and the (pre-serialised) label encoder with this run
        mlflow.xgboost.log_model(clf, "xgboost_model")
        mlflow.log_artifact("label_encoder.pkl")

        print(f"[LOG] Combinación {i+1} terminada. Recall_macro={recall:.4f}, tiempo={elapsed_time:.2f}s")

        # Keep the outcome for the post-hoc selection step
        results.append({"params": params, "recall": recall, "training_time_sec": elapsed_time})

# ==============================
# 7. Pick the best combination
# ==============================
# Highest macro recall wins; on ties max() keeps the earliest entry in `results`.
best_run = max(results, key=lambda entry: entry["recall"])
print("\n[LOG] Mejor combinación encontrada:")
# (caption, key in the result dict) pairs, printed in this order.
summary_fields = (
    ("Parámetros:", "params"),
    ("Recall_macro:", "recall"),
    ("Tiempo de entrenamiento (s):", "training_time_sec"),
)
for caption, field in summary_fields:
    print(caption, best_run[field])
