In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Raise pandas' display limits for rows, columns and line width (optional).
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Helper that converts text-formatted numeric columns to numbers
def clean_numeric_column(column):
    """Convert a Series of formatted numbers (e.g. ``'$1,234.50'``) to numeric values.

    Every element is cast to ``str``; thousands separators (``,``), dollar
    signs (``$``) and surrounding whitespace are removed before parsing.

    Parameters
    ----------
    column : pandas.Series
        Values to convert.

    Returns
    -------
    pandas.Series
        Numeric Series; entries that cannot be parsed become ``NaN``.
    """
    # regex=False makes ',' and '$' literal characters. '$' is a regex
    # metacharacter, so relying on the default of `regex` (which differs
    # between pandas versions) is fragile.
    column_as_str = (
        column.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.strip()
    )
    return pd.to_numeric(column_as_str, errors='coerce')


# Load the datasets.
# The directory can be overridden with the TRAINCASE_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location, so existing behaviour is unchanged.
import os

DATA_DIR = os.environ.get(
    'TRAINCASE_DATA_DIR',
    'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets',
)
ruta_train = f'{DATA_DIR}/traincase.csv'
ruta_test = f'{DATA_DIR}/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Parse the text-formatted numeric columns in both frames.
columns_to_clean = ['Search Engine Bid', 'Impressions', 'Avg. Cost per Click', 'Avg. Pos.', 'Clicks']
for column in columns_to_clean:
    df_train[column] = clean_numeric_column(df_train[column])
    if column != 'Clicks':  # 'Clicks' is not present in df_test
        df_test[column] = clean_numeric_column(df_test[column])

# Create the 'Match Type_Exact' indicator (1 when 'Match Type' is 'Exact', else 0)
df_train['Match Type_Exact'] = (df_train['Match Type'] == 'Exact').astype(int)
df_test['Match Type_Exact'] = (df_test['Match Type'] == 'Exact').astype(int)

# Tag each frame so the rows can be told apart after concatenation
df_train['set'] = 'Not Kaggle'
df_test['set'] = 'Kaggle'

# Concatenate df_train and df_test into df_full.
# NOTE(review): later cells add engineered columns to df_train/df_test only,
# so df_full built here will not contain them — confirm this is intended.
df_full = pd.concat([df_train, df_test], ignore_index=True)

In [2]:
# 'Keyword Length 6-10': 1 when the keyword has between 6 and 10 characters
# (inclusive), else 0.
# .str.len() replaces .apply(len): it is vectorised and yields NaN for a
# missing keyword (which then gets flag 0) instead of raising TypeError.
for frame in (df_train, df_test):
    frame['Keyword Length'] = frame['Keyword'].str.len()
    frame['Keyword Length 6-10'] = frame['Keyword Length'].between(6, 10).astype(int)

In [3]:
# Normalise the raw 'Bid Strategy' labels and derive 'Bid Strategy Position 1-4'.
# Keys are the raw spellings found in the data (typos included); values are the
# canonical label each one maps to.
bid_strategy_mapping = {
    "Pos 3-6": "Position 3-6",
    "Position 1 -2 Target": "Position 1-2",
    "Position 1-2 Target": "Position 1-2",
    "Position 1- 3": "Position 1-3",
    "Position 1-4 Bid Strategy": "Position 1-4",
    "Position 2-5 Bid Strategy": "Position 2-5",
    "Position 5-10 Bid Strategy": "Position 5-10",
    "Postiion 1-4 Bid Strategy": "Position 1-4"
}

for frame in (df_train, df_test):
    # Labels that are not keys of the mapping become NaN in the grouped column.
    grouped = frame['Bid Strategy'].map(bid_strategy_mapping)
    frame['Bid Strategy Grouped'] = grouped
    # Indicator: 1 when the canonical label is 'Position 1-4', else 0.
    frame['Bid Strategy Position 1-4'] = grouped.eq('Position 1-4').astype(int)

In [4]:
# Build the modelling inputs: feature matrix X and target y (the 'Clicks' column).
feature_columns = [
    'Match Type_Exact',
    'Keyword Length 6-10',
    'Bid Strategy Position 1-4',
    'Impressions',
    'Search Engine Bid',
]
X = df_train.loc[:, feature_columns]
y = df_train.loc[:, 'Clicks']

In [5]:
# Hold out 20% of the rows for evaluation, then fit a decision tree on the rest.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [14]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt

# Fit three regressors on the training split and print each one's RMSE on the
# hold-out split. Keys are the exact messages printed next to each score.
# NOTE(review): no feature scaling is visible before the SVR fit — confirm
# whether a scaler was intended. The kernel can be swapped for 'linear',
# 'poly', etc.
regressors = {
    "Random Forest Regressor RMSE:": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor RMSE:": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor RMSE:": SVR(kernel='rbf'),
}

fits = []
for message, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    held_out = regressor.predict(X_test)
    score = sqrt(mean_squared_error(y_test, held_out))
    print(message, score)
    fits.append((regressor, held_out, score))

# Expose the per-model results under their individual names.
(
    (model_rf, predictions_rf, rmse_rf),
    (model_gb, predictions_gb, rmse_gb),
    (model_svr, predictions_svr, rmse_svr),
) = fits

Random Forest Regressor RMSE: 1124.466046656955
Gradient Boosting Regressor RMSE: 1290.7646634825085
Support Vector Regressor RMSE: 923.0677848151953


In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split


# Settings shared by both searches: 5-fold CV scored by negative MSE, run in parallel.
search_settings = dict(cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# RandomForestRegressor through GridSearchCV (the grid holds a single candidate)
rf_grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid={'n_estimators': [300], 'max_depth': [10]},
    **search_settings,
)
rf_grid_search.fit(X_train, y_train)
# With the default refit=True, predicting through the search object uses best_estimator_.
y_pred_rf_grid = rf_grid_search.predict(X_test)
rmse_rf_grid = sqrt(mean_squared_error(y_test, y_pred_rf_grid))
print(f"RandomForestRegressor GridSearchCV RMSE: {rmse_rf_grid}")

# GradientBoostingRegressor through GridSearchCV (again a single candidate)
gb_grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [300], 'learning_rate': [0.05], 'max_depth': [3]},
    **search_settings,
)
gb_grid_search.fit(X_train, y_train)
y_pred_gb_grid = gb_grid_search.predict(X_test)
rmse_gb_grid = sqrt(mean_squared_error(y_test, y_pred_gb_grid))
print(f"GradientBoostingRegressor GridSearchCV RMSE: {rmse_gb_grid}")

RandomForestRegressor GridSearchCV RMSE: 1102.6584577520625
GradientBoostingRegressor GridSearchCV RMSE: 1288.3725661662384


In [16]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Hyperparameter sets to compare, labelled 'grid' and 'random'.
# NOTE(review): presumably these came from GridSearchCV / RandomizedSearchCV
# runs; the randomized search that produced the 'random' sets is not visible
# above — confirm their origin.
rf_params_grid = {'max_depth': 10, 'n_estimators': 300}
rf_params_random = {'n_estimators': 300, 'max_depth': 10}

gb_params_grid = {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}
gb_params_random = {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1}


def _fit_and_score(estimator):
    """Fit ``estimator`` on the training split; return (hold-out predictions, hold-out RMSE)."""
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    return predicted, sqrt(mean_squared_error(y_test, predicted))


# RandomForestRegressor with the 'grid' parameters
rf_grid = RandomForestRegressor(**rf_params_grid, random_state=42)
y_pred_rf_grid, rmse_rf_grid = _fit_and_score(rf_grid)
print(f"RandomForestRegressor GridSearchCV RMSE: {rmse_rf_grid}")

# RandomForestRegressor with the 'random' parameters
rf_random = RandomForestRegressor(**rf_params_random, random_state=42)
y_pred_rf_random, rmse_rf_random = _fit_and_score(rf_random)
print(f"RandomForestRegressor RandomizedSearchCV RMSE: {rmse_rf_random}")

# GradientBoostingRegressor with the 'grid' parameters
gb_grid = GradientBoostingRegressor(**gb_params_grid, random_state=42)
y_pred_gb_grid, rmse_gb_grid = _fit_and_score(gb_grid)
print(f"GradientBoostingRegressor GridSearchCV RMSE: {rmse_gb_grid}")

# GradientBoostingRegressor with the 'random' parameters
gb_random = GradientBoostingRegressor(**gb_params_random, random_state=42)
y_pred_gb_random, rmse_gb_random = _fit_and_score(gb_random)
print(f"GradientBoostingRegressor RandomizedSearchCV RMSE: {rmse_gb_random}")


RandomForestRegressor GridSearchCV RMSE: 1102.6584577520625
RandomForestRegressor RandomizedSearchCV RMSE: 1102.6584577520625
GradientBoostingRegressor GridSearchCV RMSE: 1288.3725661662384
GradientBoostingRegressor RandomizedSearchCV RMSE: 1294.2105540899393


In [10]:
# Hyperparameter grids to explore for each model
param_grid_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

param_grid_gb = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 0.9, 1.0]
}

# Settings shared by both searches
grid_search_options = dict(cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Hyperparameter search for RandomForestRegressor, scored on the hold-out split
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    **grid_search_options,
)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
rmse_rf = sqrt(mean_squared_error(y_test, y_pred_rf))
print(f'RandomForestRegressor Best Params: {grid_search_rf.best_params_}, RMSE: {rmse_rf}')

# Hyperparameter search for GradientBoostingRegressor, scored on the hold-out split
grid_search_gb = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid_gb,
    **grid_search_options,
)
grid_search_gb.fit(X_train, y_train)
best_gb = grid_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)
rmse_gb = sqrt(mean_squared_error(y_test, y_pred_gb))
print(f'GradientBoostingRegressor Best Params: {grid_search_gb.best_params_}, RMSE: {rmse_gb}')


Fitting 5 folds for each of 72 candidates, totalling 360 fits
RandomForestRegressor Best Params: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}, RMSE: 953.252980295108
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
GradientBoostingRegressor Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 400, 'subsample': 1.0}, RMSE: 1142.4477120337842


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Refined hyperparameter ranges for RandomForestRegressor
param_grid_rf_expanded = {
    'n_estimators': [400],  # value taken from the earlier search results
    'max_depth': [30, 35, 40, None],  # widened range
    'min_samples_split': [4, 5, 6],  # fine-tuning around the earlier best value
    'min_samples_leaf': [1],  # value taken from the earlier search results
    'max_features': ['sqrt']  # value taken from the earlier search results
}

# Requires X_train and y_train from the train/test split cell.
# n_jobs=-1 runs the cross-validation fits in parallel, consistent with the
# other searches in this notebook; it does not change which parameters win.
grid_search_rf_expanded = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf_expanded,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search_rf_expanded.fit(X_train, y_train)

print(f"Mejores parámetros (GridSearchCV) para RandomForestRegressor: {grid_search_rf_expanded.best_params_}")

# RandomizedSearchCV over the same space: samples 10 of its 12 combinations.
random_search_rf = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid_rf_expanded,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
random_search_rf.fit(X_train, y_train)

print(f"Mejores parámetros (RandomizedSearchCV) para RandomForestRegressor: {random_search_rf.best_params_}")




In [18]:
import matplotlib.pyplot as plt

# Retrieve the best estimator from the expanded grid search (no new training
# happens here). The cell defining `grid_search_rf_expanded` must have been
# executed first, otherwise this line raises NameError.
best_rf = grid_search_rf_expanded.best_estimator_

# Feature importances paired with the training feature names
importancias = best_rf.feature_importances_
caracteristicas = X_train.columns

# Horizontal bar chart of the importances, using the explicit Axes API
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(caracteristicas, importancias)
ax.set_xlabel('Importancia')
ax.set_ylabel('Característica')
ax.set_title('Importancia de las Características (RandomForest)')
plt.show()


NameError: name 'grid_search_rf_expanded' is not defined