# ML 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Cargar los datos de entrenamiento y prueba
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Preprocesamiento de los datos de entrenamiento
X_train = train[['user', 'item']]
y_train = train['rating']

# Convertir IDs categóricos a números únicos
X_train['user'] = X_train['user'].astype('category').cat.codes
X_train['item'] = X_train['item'].astype('category').cat.codes

# Aplicar el mismo preprocesamiento a los datos de prueba
X_test = test[['user', 'item']]
X_test['user'] = X_test['user'].astype('category').cat.codes
X_test['item'] = X_test['item'].astype('category').cat.codes

# Definir el modelo XGBoost
model = XGBRegressor(objective='reg:squarederror', seed=42)

# Definir el grid de parámetros para la búsqueda, ampliado para mayor búsqueda
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 8],
    'learning_rate': [0.01, 0.05],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1],  # Regularización L1
    'reg_lambda': [1, 2],   # Regularización L2
}

# GridSearchCV con validación cruzada y early stopping
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Entrenamiento del modelo
grid_search.fit(X_train, y_train)

# Mejor modelo encontrado
best_model = grid_search.best_estimator_

# Realizar predicciones sobre el conjunto de prueba
predictions = best_model.predict(X_test)

# Asegurarse que las predicciones estén en el rango correcto (1 a 10)
predictions = np.clip(np.round(predictions), 1, 10)

# Crear el archivo de salida
output = pd.DataFrame({'ID': test['ID'], 'rating': predictions})
output.to_csv('submission-ML3.csv.csv', index=False)

print('Proceso completado. Archivo submission-ML3.csv generado con éxito.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['user'] = X_train['user'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['item'] = X_train['item'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['user'] = X_test['user'].astype('category').cat.codes
A value is trying 

Fitting 3 folds for each of 128 candidates, totalling 384 fits


KeyboardInterrupt: 

In [None]:
model = RandomForestRegressor(random_state=42)

# Definir el grid de parámetros para la búsqueda
param_grid = {
    'n_estimators': [100, 200,],  # Número de árboles
    'max_depth': [6, 8],      # Profundidad máxima de los árboles
    'min_samples_split': [2, 5],   # Mínimo número de muestras para dividir un nodo
    'max_features': ['auto', 'sqrt'], # Número de características a considerar en cada división
    'bootstrap': [True, False]         # Usar o no muestreo con reemplazo
}

# GridSearchCV con validación cruzada
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Entrenamiento del modelo
grid_search.fit(X_train, y_train)

# Mejor modelo encontrado
best_model = grid_search.best_estimator_

# Realizar predicciones sobre el conjunto de prueba
predictions = best_model.predict(X_test)

# Asegurarse que las predicciones estén en el rango correcto (1 a 10)
predictions = np.clip(np.round(predictions), 1, 10)

# Crear el archivo de salida
output = pd.DataFrame({'ID': test['ID'], 'rating': predictions})
output.to_csv('submissi_rf.csv', index=False)

print('Proceso completado. Archivo predictions_rf.csv generado con éxito.')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['user'] = X_train['user'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['item'] = X_train['item'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['user'] = X_test['user'].astype('category').cat.codes
A value is trying 

Fitting 3 folds for each of 32 candidates, totalling 96 fits


48 fits failed out of a total of 96.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\entornoVS\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\anaconda3\envs\entornoVS\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\User\anaconda3\envs\entornoVS\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\anaconda3\envs\entornoVS\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in va

Proceso completado. Archivo predictions_rf.csv generado con éxito.


# CONCLUSION

Aunque en general se espera que los algoritmos de Machine Learning tradicionales tengan un mejor rendimiento en datos tabulares, en este caso el enfoque adecuado depende completamente del objetivo específico de recomendar ítems a usuarios