In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Preprocesamiento de las columnas numéricas
def preprocess_numeric(df):
    for col in ['Search Engine Bid', 'Avg. Pos.', 'Impressions']:
        # Asegurar la correcta conversión de tipos de datos
        df[col] = df[col].astype(str).str.replace('$', '').str.replace(',', '').str.strip().replace('', np.nan)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

df_train = preprocess_numeric(df_train)
df_test = preprocess_numeric(df_test)

# Imputar los valores faltantes después de la conversión
imputer = SimpleImputer(strategy='median')
cols_to_impute = ['Impressions', 'Search Engine Bid', 'Avg. Pos.']

df_train[cols_to_impute] = imputer.fit_transform(df_train[cols_to_impute])
df_test[cols_to_impute] = imputer.transform(df_test[cols_to_impute])

# Creación de características polinómicas
poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly_features.fit_transform(df_train[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])
X_test_poly = poly_features.transform(df_test[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])

# Separación de la variable objetivo
y = df_train['Clicks'].str.replace(',', '').astype(float)  # Limpiar la columna 'Clicks' y convertir a float

X_train, X_valid, y_train, y_valid = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Definición y entrenamiento de modelos
models = {
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'{name} CV RMSE:', np.sqrt(-cv_score.mean()))
    
    # Evaluación en el conjunto de validación
    valid_preds = model.predict(X_valid)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
    print(f'{name} Validation RMSE:', valid_rmse)





  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso CV RMSE: 1003.6780755668896
Lasso Validation RMSE: 2363.35520979625
Ridge CV RMSE: 1003.705268703985
Ridge Validation RMSE: 2363.284479968986
Random Forest CV RMSE: 744.6274038632181
Random Forest Validation RMSE: 993.5575819525551
Gradient Boosting CV RMSE: 938.9119927630569
Gradient Boosting Validation RMSE: 997.5700009870976


In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import numpy as np

# Aplicar PCA
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train)  # Asume que X_train ya está definido
X_valid_pca = pca.transform(X_valid)  # Asume que X_valid ya está definido

# Ajuste de Hiperparámetros de Random Forest
param_distributions_rf = {
    'n_estimators': [100, 200, 300],  
    'max_depth': [None, 10, 20],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4]  
}

random_search_rf = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_distributions_rf,
    n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42
)
random_search_rf.fit(X_train_pca, y_train)

# Hiperparámetros y RMSE para Random Forest
rf_best_model = random_search_rf.best_estimator_
rf_train_preds = rf_best_model.predict(X_train_pca)
rf_valid_preds = rf_best_model.predict(X_valid_pca)

rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_train_preds))
rf_valid_rmse = np.sqrt(mean_squared_error(y_valid, rf_valid_preds))

print("Random Forest - Train RMSE:", rf_train_rmse)
print("Random Forest - Validation RMSE:", rf_valid_rmse)
print("Random Forest - Best CV RMSE:", np.sqrt(-random_search_rf.best_score_))

# Ajuste de Hiperparámetros de Gradient Boosting
param_distributions_gb = {
    'n_estimators': [100, 200, 300],  
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],  
}

random_search_gb = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_distributions=param_distributions_gb,
    n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42
)
random_search_gb.fit(X_train_pca, y_train)

# Hiperparámetros y RMSE para Gradient Boosting
gb_best_model = random_search_gb.best_estimator_
gb_train_preds = gb_best_model.predict(X_train_pca)
gb_valid_preds = gb_best_model.predict(X_valid_pca)

gb_train_rmse = np.sqrt(mean_squared_error(y_train, gb_train_preds))
gb_valid_rmse = np.sqrt(mean_squared_error(y_valid, gb_valid_preds))

print("Gradient Boosting - Train RMSE:", gb_train_rmse)
print("Gradient Boosting - Validation RMSE:", gb_valid_rmse)
print("Gradient Boosting - Best CV RMSE:", np.sqrt(-random_search_gb.best_score_))

# Ajuste de Hiperparámetros de XGBoost
param_distributions_xgb = {
    'n_estimators': [100, 200, 300],  
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],  
}

random_search_xgb = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=param_distributions_xgb,
    n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42
)
random_search_xgb.fit(X_train_pca, y_train)

# Hiperparámetros y RMSE para XGBoost
xgb_best_model = random_search_xgb.best_estimator_
xgb_train_preds = xgb_best_model.predict(X_train_pca)
xgb_valid_preds = xgb_best_model.predict(X_valid_pca)

xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_preds))
xgb_valid_rmse = np.sqrt(mean_squared_error(y_valid, xgb_valid_preds))

print("XGBoost - Train RMSE:", xgb_train_rmse)
print("XGBoost - Validation RMSE:", xgb_valid_rmse)
print("XGBoost - Best CV RMSE:", np.sqrt(-random_search_xgb.best_score_))

# Early Stopping para LightGBM
lgbm_model = LGBMRegressor(n_estimators=10000, random_state=42)
lgbm_model.fit(X_train_pca, y_train, eval_set=[(X_valid_pca, y_valid)], eval_metric='rmse')

# RMSE para LightGBM
lgbm_train_preds = lgbm_model.predict(X_train_pca)
lgbm_valid_preds = lgbm_model.predict(X_valid_pca)

lgbm_train_rmse = np.sqrt(mean_squared_error(y_train, lgbm_train_preds))
lgbm_valid_rmse = np.sqrt(mean_squared_error(y_valid, lgbm_valid_preds))

print("LightGBM - Train RMSE:", lgbm_train_rmse)
print("LightGBM - Validation RMSE:", lgbm_valid_rmse)







Random Forest - Train RMSE: 710.8267113116418
Random Forest - Validation RMSE: 889.828742298982
Random Forest - Best CV RMSE: 815.931420189223
Gradient Boosting - Train RMSE: 614.7444996140088
Gradient Boosting - Validation RMSE: 991.1386551971354
Gradient Boosting - Best CV RMSE: 976.508363744101
XGBoost - Train RMSE: 694.7252892883128
XGBoost - Validation RMSE: 1255.4107302307755
XGBoost - Best CV RMSE: 823.2820442772479
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 3528, number of used features: 1
[LightGBM] [Info] Start training from score 104.496882
LightGBM - Train RMSE: 884.3843779464726
LightGBM - Validation RMSE: 732.0301506102073


In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor

# Ajuste fino de hiperparámetros para Gradient Boosting (XGBoost)
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1]
}
grid_search_xgb = GridSearchCV(XGBRegressor(n_estimators=100, random_state=42), param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_xgb.fit(X_train_pca, y_train)

# Ajuste fino de hiperparámetros para LightGBM
param_grid_lgbm = {
    'num_leaves': [31, 50, 100],
    'min_data_in_leaf': [20, 50, 100],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1]
}
grid_search_lgbm = GridSearchCV(LGBMRegressor(n_estimators=100, random_state=42), param_grid=param_grid_lgbm, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_lgbm.fit(X_train_pca, y_train)

# Control del sobreajuste: Ensemble de modelos
best_xgb_model = grid_search_xgb.best_estimator_
best_lgbm_model = grid_search_lgbm.best_estimator_

estimators = [('xgb', best_xgb_model), ('lgbm', best_lgbm_model)]
stacked_model = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))
stacked_model.fit(X_train_pca, y_train)

# Evaluación con Validación Cruzada
cv_score_stacked = cross_val_score(stacked_model, X_train_pca, y_train, cv=5, scoring='neg_mean_squared_error')
print("Stacked Model CV RMSE:", np.sqrt(-cv_score_stacked.mean()))

# Predicción en el conjunto de validación
stacked_valid_preds = stacked_model.predict(X_valid_pca)
stacked_valid_rmse = np.sqrt(mean_squared_error(y_valid, stacked_valid_preds))
print("Stacked Model Validation RMSE:", stacked_valid_rmse)

# Diferencia entre RMSE de entrenamiento y validación
xgb_train_preds = best_xgb_model.predict(X_train_pca)
xgb_valid_preds = best_xgb_model.predict(X_valid_pca)
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_preds))
xgb_valid_rmse = np.sqrt(mean_squared_error(y_valid, xgb_valid_preds))
print("XGBoost Train RMSE:", xgb_train_rmse)
print("XGBoost Validation RMSE:", xgb_valid_rmse)
print("Difference between XGBoost Train and Validation RMSE:", abs(xgb_train_rmse - xgb_valid_rmse))

lgbm_train_preds = best_lgbm_model.predict(X_train_pca)
lgbm_valid_preds = best_lgbm_model.predict(X_valid_pca)
lgbm_train_rmse = np.sqrt(mean_squared_error(y_train, lgbm_train_preds))
lgbm_valid_rmse = np.sqrt(mean_squared_error(y_valid, lgbm_valid_preds))
print("LightGBM Train RMSE:", lgbm_train_rmse)
print("LightGBM Validation RMSE:", lgbm_valid_rmse)
print("Difference between LightGBM Train and Validation RMSE:", abs(lgbm_train_rmse - lgbm_valid_rmse))

stacked_train_rmse = np.sqrt(-cv_score_stacked.mean())
print("Difference between Stacked Model CV RMSE and Validation RMSE:", abs(stacked_train_rmse - stacked_valid_rmse))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 3528, number of used features: 1
[LightGBM] [Info] Start training from score 104.496882
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 3528, number of used features: 1
[LightGBM] [Info] Start training from score 104.496882
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2822, number of used features: 1
[LightGBM] [Info] Start training

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2822, number of used features: 1
[LightGBM] [Info] Start training from score 103.427356
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2257, number of used features: 1
[LightGBM] [Info] Start training from score 104.317678
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2257, number of used features: 1
[LightGBM] [Info] Start training

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2823, number of used features: 1
[LightGBM] [Info] Start training from score 93.506908
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2258, number of used features: 1
[LightGBM] [Info] Start training from score 91.914526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2258, number of used features: 1
[LightGBM] [Info] Start training f

Stacked Model CV RMSE: 915.3231540985412
Stacked Model Validation RMSE: 951.3687433799953
XGBoost Train RMSE: 697.1398548497748
XGBoost Validation RMSE: 1324.412866893008
Difference between XGBoost Train and Validation RMSE: 627.2730120432332
LightGBM Train RMSE: 884.6430228825426
LightGBM Validation RMSE: 731.5518630462251
Difference between LightGBM Train and Validation RMSE: 153.0911598363175
Difference between Stacked Model CV RMSE and Validation RMSE: 36.04558928145411


In [20]:
# Cargar los datos de prueba desde un archivo CSV
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_test = pd.read_csv(ruta_test)

# Preprocesamiento de las columnas numéricas en los datos de prueba
df_test = preprocess_numeric(df_test)

# Aplicar las mismas transformaciones al conjunto de datos de prueba que se aplicaron al conjunto de entrenamiento
X_test_poly = poly_features.transform(df_test[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])

# Aplicar PCA al conjunto de datos de prueba
X_test_pca = pca.transform(X_test_poly)  # Asumiendo que 'pca' ya está definido

# Realizar predicciones sobre el conjunto de datos de prueba utilizando el modelo entrenado
y_pred_test_lgbm = best_lgbm_model.predict(X_test_pca)

# Crear el DataFrame para el envío
submission_lgbm = pd.DataFrame({
    'entry_id': df_test['entry_id'],  # Asegúrate de que 'entry_id' está en el conjunto de prueba
    'Clicks': y_pred_test_lgbm
})

# Exportar el DataFrame a un archivo CSV para el envío
submission_filename_lgbm = 'lgbm_submission.csv'
submission_lgbm.to_csv(submission_filename_lgbm, index=False)

print(f"Archivo de submission creado: {submission_filename_lgbm}")



Archivo de submission creado: lgbm_submission.csv
