In [54]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler

# Optional: widen pandas' display limits so large frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Helper to clean numeric columns that arrive as formatted text
def clean_numeric_column(column):
    """Convert a column of formatted numbers (e.g. '$1,234.50') to numeric values.

    Parameters
    ----------
    column : pandas.Series
        Column whose values may contain thousands separators (','),
        dollar signs ('$') and surrounding whitespace.

    Returns
    -------
    pandas.Series
        Numeric series; entries that still cannot be parsed after the
        cleanup become NaN (``errors='coerce'``).
    """
    # regex=False is explicit on purpose: when the pattern is treated as a
    # regular expression, '$' is the end-of-string anchor and the literal
    # dollar sign is never removed.
    column_as_str = (
        column.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.strip()
    )
    return pd.to_numeric(column_as_str, errors='coerce')


# Load the training and Kaggle test datasets.
# NOTE(review): these are absolute, machine-specific paths; they must be
# adjusted before the notebook can run on another machine.
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Convert the text-formatted numeric columns to real numbers in both frames.
columns_to_clean = ['Search Engine Bid', 'Impressions', 'Avg. Cost per Click', 'Avg. Pos.', 'Clicks']
for column in columns_to_clean:
    df_train[column] = clean_numeric_column(df_train[column])
    if column != 'Clicks':  # per the original note, 'Clicks' is not present in df_test
        df_test[column] = clean_numeric_column(df_test[column])

# Box-Cox needs strictly positive data, so zero impressions are replaced by
# the smallest positive value observed in the training set.
min_positive_value = df_train.loc[df_train['Impressions'] > 0, 'Impressions'].min()
df_train['Impressions'] = df_train['Impressions'].replace(0, min_positive_value)
df_train['Impressions'], fitted_lambda = stats.boxcox(df_train['Impressions'])

# Apply the SAME transformation to the test set, reusing the zero replacement
# and the lambda fitted on train. Without this, df_full would mix transformed
# and raw 'Impressions' values in a single column, and a model trained on the
# transformed feature would see a different scale at prediction time.
# NOTE(review): df_test['Impressions'] is now already Box-Cox transformed;
# do not transform it again in later cells.
df_test['Impressions'] = stats.boxcox(
    df_test['Impressions'].replace(0, min_positive_value),
    lmbda=fitted_lambda,
)

# Tag each row with its origin so the two sets can be separated again later.
df_train['set'] = 'Not Kaggle'
df_test['set'] = 'Kaggle'

# Stack df_train and df_test into a single frame with a fresh index.
df_full = pd.concat([df_train, df_test], ignore_index=True)

In [56]:
from sklearn.pipeline import make_pipeline


# Feature matrix and target for the linear baselines.
# These columns are expected to be already cleaned and free of NaNs.
X = df_train.loc[:, ['Impressions', 'Search Engine Bid']]
y = df_train['Clicks'].astype(float)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both regularised linear models with their default settings.
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# Mean squared error of each model, then its square root (RMSE).
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_ridge, rmse_lasso = sqrt(mse_ridge), sqrt(mse_lasso)

print(f"RMSE Ridge: {rmse_ridge}, RMSE Lasso: {rmse_lasso}")

RMSE Ridge: 902.5467118401502, RMSE Lasso: 902.4850906916591


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Base model and hyper-parameter search space for RandomizedSearchCV.
# The forest gets its own random_state: seeding only the search fixes which
# parameter combinations are sampled, but not the randomness inside each forest.
rf = RandomForestRegressor(random_state=42)
param_dist_rf = {'n_estimators': [100, 200, 300, 400, 500],
                 'max_depth': [5, 10, 15, 20, None],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

# Select the best candidate by (negated) MSE so that model selection uses the
# same error metric that is reported below, instead of the regressor's default
# score.
random_search_rf = RandomizedSearchCV(
    rf,
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# Cross-validated RMSE of the best estimator: the scorer returns negated MSE,
# so flip the sign before taking the square root.
rf_scores = cross_val_score(random_search_rf.best_estimator_, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
rf_rmse_scores = np.sqrt(-rf_scores)
