In [1]:
import pandas as pd


# Load the four competition CSV files: train/test summaries and their prompts.
summaries_train, prompts_train, summaries_test, prompts_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './commonlit-evaluate-student-summaries/summaries_train.csv',
        './commonlit-evaluate-student-summaries/prompts_train.csv',
        './commonlit-evaluate-student-summaries/summaries_test.csv',
        './commonlit-evaluate-student-summaries/prompts_test.csv',
    )
)

In [2]:
# Join every summary with its prompt row; rows are matched on `prompt_id`
# (inner join, which is the pandas default).
train_data = summaries_train.merge(prompts_train, how="inner", on="prompt_id")
test_data = summaries_test.merge(prompts_test, how="inner", on="prompt_id")

In [3]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Only affects Python processes started after this point; the hash
    # randomisation of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick different convolution algorithms from run to
    # run, so it has to be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [4]:
import re

def clean_text(text):
    """Normalise ``text`` for vectorisation.

    The string is lower-cased, then every character that is not a lowercase
    ASCII letter, a digit or whitespace is removed (not replaced by a space).

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

# For both frames: concatenate the student summary with its prompt text
# (single space in between), then store a normalised copy for vectorisation.
for frame in (train_data, test_data):
    frame['combined_text'] = frame['text'] + ' ' + frame['prompt_text']
    frame['cleaned_text'] = frame['combined_text'].apply(clean_text)

In [26]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets: the two scores to predict, taken from the full training frame.
y_train_content = train_data['content']
y_train_wording = train_data['wording']

# Build the feature matrices used by this cell and by the test-prediction cell.
# NOTE(review): no visible cell ever defined `X_train` / `X_test`, so on a
# fresh kernel this cell failed with NameError. The TF-IDF setup below mirrors
# the one used by the validation pipeline later in this notebook
# (max_features=5000 on `cleaned_text`) — confirm this matches the features
# that were originally intended.
full_vectorizer = TfidfVectorizer(max_features=5000)
X_train = full_vectorizer.fit_transform(train_data['cleaned_text'])
# Test rows are encoded with the vocabulary learned from the training rows.
X_test = full_vectorizer.transform(test_data['cleaned_text'])

# One default-configured XGBoost regressor per target.
model_content = xgb.XGBRegressor()
model_wording = xgb.XGBRegressor()

model_content.fit(X_train, y_train_content)
model_wording.fit(X_train, y_train_wording)

In [27]:
# Score the test set with both fitted regressors.
# NOTE(review): relies on `X_test`, `model_content` and `model_wording` from
# earlier cells; `X_test` must be produced by the same feature pipeline the
# models were fitted on — confirm.
predictions_test_content, predictions_test_wording = (
    regressor.predict(X_test) for regressor in (model_content, model_wording)
)

# Attach the predictions to the test frame as new columns.
test_data['predicted_content'] = predictions_test_content
test_data['predicted_wording'] = predictions_test_wording

In [5]:
from sklearn.model_selection import train_test_split

# Features are the cleaned summary+prompt text; targets are the two scores.
X = train_data['cleaned_text']
y_content = train_data['content']
y_wording = train_data['wording']

# Split the features and BOTH targets in a single call. Every array passed to
# train_test_split is partitioned with the same row indices, so the content
# and wording targets are guaranteed to stay aligned with one feature split
# (previously this held only because two separate calls shared random_state).
(
    X_train_content, X_val_content,
    y_train_content, y_val_content,
    y_train_wording, y_val_wording,
) = train_test_split(X, y_content, y_wording, test_size=0.2, random_state=42)

# Both targets are predicted from the same rows; keep the per-target names
# that downstream cells use.
X_train_wording, X_val_wording = X_train_content, X_val_content

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn ONE TF-IDF vocabulary (at most 5000 terms) from the training rows and
# encode every split with it.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_content_vec = vectorizer.fit_transform(X_train_content)
X_val_content_vec = vectorizer.transform(X_val_content)

# Reuse the fitted vocabulary instead of calling fit_transform a second time:
# re-fitting would replace the vocabulary the content matrices above were
# encoded with, leaving `vectorizer` out of sync with the content model.
# The content and wording splits are created with the same data and
# random_state, so the resulting matrices are unchanged.
# NOTE(review): if the two splits are ever made to differ, fit a separate
# vectorizer per target to avoid leaking validation rows into the vocabulary.
X_train_wording_vec = vectorizer.transform(X_train_wording)
X_val_wording_vec = vectorizer.transform(X_val_wording)

In [9]:
import xgboost as xgb

# Baseline: one default-configured XGBoost regressor per target, each fitted
# on its TF-IDF training matrix (`fit` returns the fitted estimator).
model_content = xgb.XGBRegressor().fit(X_train_content_vec, y_train_content)
model_wording = xgb.XGBRegressor().fit(X_train_wording_vec, y_train_wording)

# Predict on the validation set.
predictions_val_content, predictions_val_wording = (
    model_content.predict(X_val_content_vec),
    model_wording.predict(X_val_wording_vec),
)

In [31]:
import numpy as np

def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error (MCRMSE).

    Parameters
    ----------
    y_true, y_pred : sequence of array-like
        One entry per target column (e.g. ``[content, wording]``). Entry ``i``
        of ``y_true`` is compared with entry ``i`` of ``y_pred``; the two must
        have the same shape. Any number of columns is supported.

    Returns
    -------
    numpy.float64
        The RMSE of each column, averaged over all columns.

    Raises
    ------
    ValueError
        If no columns are given, the number of columns differs, or a pair of
        columns does not have matching shapes.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true has {len(y_true)} columns but y_pred has {len(y_pred)}"
        )
    if len(y_true) == 0:
        raise ValueError("at least one target column is required")

    rmse_per_column = []
    for true_col, pred_col in zip(y_true, y_pred):
        # Convert to plain arrays so values are compared by position; pandas
        # Series would otherwise be aligned on their index labels, which turns
        # differently-indexed inputs into NaN.
        true_arr = np.asarray(true_col, dtype=float)
        pred_arr = np.asarray(pred_col, dtype=float)
        if true_arr.shape != pred_arr.shape:
            raise ValueError(
                f"column shapes differ: {true_arr.shape} vs {pred_arr.shape}"
            )
        # Mean squared error of this column, then its square root.
        rmse_per_column.append(np.sqrt(np.mean((true_arr - pred_arr) ** 2)))

    # Average the per-column RMSE values.
    return np.mean(rmse_per_column)

# Compute MCRMSE on the validation set (columns ordered content, wording).
val_targets = [y_val_content, y_val_wording]
val_predictions = [predictions_val_content, predictions_val_wording]
error = mcrmse(val_targets, val_predictions)
print("MCRMSE en el conjunto de validación:", error)

MCRMSE en el conjunto de validación: 0.5712857836199593


In [11]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space, shared by the searches for both targets.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive 3-fold grid search for the content model, scored by negative MSE.
grid_search_content = GridSearchCV(
    estimator=model_content,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search_content.fit(X_train_content_vec, y_train_content)

# Keep the best hyper-parameters found for the content model.
best_params_content = grid_search_content.best_params_
print("Mejores parámetros para model_content:", best_params_content)

In [12]:
# Exhaustive 3-fold grid search for the wording model over the same
# `param_grid`, scored by negative MSE.
grid_search_wording = GridSearchCV(
    estimator=model_wording,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search_wording.fit(X_train_wording_vec, y_train_wording)

# Keep the best hyper-parameters found for the wording model.
best_params_wording = grid_search_wording.best_params_
print("Mejores parámetros para model_wording:", best_params_wording)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [None]:
# Hard-coded hyper-parameters for the content model. This assignment replaces
# whatever `best_params_content` held before this cell ran.
# NOTE(review): presumably copied from an earlier grid-search run — confirm.
best_params_content = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 2,
    'n_estimators': 150,
    'subsample': 0.8,
}

In [None]:
# Rebuild both regressors with their selected hyper-parameters and fit them
# on the training matrices (`fit` returns the fitted estimator).
model_content = xgb.XGBRegressor(**best_params_content).fit(
    X_train_content_vec, y_train_content
)
model_wording = xgb.XGBRegressor(**best_params_wording).fit(
    X_train_wording_vec, y_train_wording
)

# Predict on the validation set with the tuned models.
predictions_val_content, predictions_val_wording = (
    model_content.predict(X_val_content_vec),
    model_wording.predict(X_val_wording_vec),
)

In [None]:
# Compute MCRMSE on the validation set (columns ordered content, wording).
val_targets = [y_val_content, y_val_wording]
val_predictions = [predictions_val_content, predictions_val_wording]
error = mcrmse(val_targets, val_predictions)
print("MCRMSE en el conjunto de validación:", error)