In [15]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Preprocesamiento de las columnas numéricas
def preprocess_numeric(df):
    for col in ['Search Engine Bid', 'Avg. Pos.', 'Impressions']:
        # Asegurar la correcta conversión de tipos de datos
        df[col] = df[col].astype(str).str.replace('$', '').str.replace(',', '').str.strip().replace('', np.nan)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

df_train = preprocess_numeric(df_train)
df_test = preprocess_numeric(df_test)

# Imputar los valores faltantes después de la conversión
imputer = SimpleImputer(strategy='median')
cols_to_impute = ['Impressions', 'Search Engine Bid', 'Avg. Pos.']

df_train[cols_to_impute] = imputer.fit_transform(df_train[cols_to_impute])
df_test[cols_to_impute] = imputer.transform(df_test[cols_to_impute])

# Creación de características polinómicas
poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly_features.fit_transform(df_train[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])
X_test_poly = poly_features.transform(df_test[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])

# Separación de la variable objetivo
y = df_train['Clicks'].str.replace(',', '').astype(float)  # Limpiar la columna 'Clicks' y convertir a float

X_train, X_valid, y_train, y_valid = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Definición y entrenamiento de modelos
models = {
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'{name} CV RMSE:', np.sqrt(-cv_score.mean()))
    
    # Evaluación en el conjunto de validación
    valid_preds = model.predict(X_valid)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
    print(f'{name} Validation RMSE:', valid_rmse)





  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso CV RMSE: 1003.6780755668896
Lasso Validation RMSE: 2363.35520979625
Ridge CV RMSE: 1003.705268703985
Ridge Validation RMSE: 2363.284479968986
Random Forest CV RMSE: 744.6274038632181
Random Forest Validation RMSE: 993.5575819525551
Gradient Boosting CV RMSE: 938.9119927630569
Gradient Boosting Validation RMSE: 997.5700009870976


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA

# Ajuste de Hiperparámetros con RandomizedSearchCV
# Random Forest
param_distributions_rf = {
    'n_estimators': [100, 200],  # Reducido para demostración
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
random_search_rf = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions=param_distributions_rf, 
                                      n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search_rf.fit(X_train_pca, y_train)  # Asegúrate de usar X_train_pca

# Lasso y Ridge
param_distributions_lr = {'alpha': [0.01, 0.1, 1, 10]}
random_search_lr = RandomizedSearchCV(Lasso(random_state=42), param_distributions=param_distributions_lr, 
                                       n_iter=4, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search_lr.fit(X_train_pca, y_train)

random_search_rr = RandomizedSearchCV(Ridge(random_state=42), param_distributions=param_distributions_lr, 
                                       n_iter=4, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search_rr.fit(X_train_pca, y_train)

# Reducción de Dimensionalidad con PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)

# Usando el mejor modelo de Random Forest para evaluar la importancia de las características
best_rf_model = random_search_rf.best_estimator_
feature_importances = best_rf_model.feature_importances_