In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Función de limpieza
def clean_numeric_column(column):
    column_as_str = column.astype(str).str.replace(',', '').str.replace('$', '').str.strip()
    return pd.to_numeric(column_as_str, errors='coerce')

# Limpieza de los datos
for column in ['Search Engine Bid', 'Impressions', 'Avg. Cost per Click', 'Avg. Pos.']:
    df_train[column] = clean_numeric_column(df_train[column])
    df_test[column] = clean_numeric_column(df_test[column])

# Preparación de los datos
X = df_train[['Search Engine Bid', 'Impressions', 'Avg. Pos.']] # Ajusta según tus columnas
y = df_train['Clicks'] # Ajusta según tu columna objetivo





In [2]:
# Escalado de los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test = df_test[['Search Engine Bid', 'Impressions', 'Avg. Pos.']] # Asegúrate de que las columnas coincidan
X_test_scaled = scaler.transform(X_test)

In [3]:
# Escalado de las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])
X_test_scaled = scaler.transform(X_test[['Search Engine Bid', 'Impressions', 'Avg. Pos.']])

# Reducir el modelo para prevenir sobreajuste
param_dist_gb = {
    'n_estimators': [50, 100],  # menos estimadores
    'max_depth': [3, 4],  # profundidades menores
    'min_samples_split': [4, 6],  # aumentar para regularización
    'min_samples_leaf': [3, 4],  # aumentar para regularización
    'learning_rate': [0.05, 0.1]  # tasa de aprendizaje ligeramente más baja
}

# Buscar los mejores hiperparámetros con RandomizedSearchCV
gb_model = GradientBoostingRegressor(random_state=42)
random_search_gb = RandomizedSearchCV(
    gb_model,
    param_distributions=param_dist_gb,
    n_iter=20,  # menos iteraciones
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42
)
random_search_gb.fit(X_train_scaled, y_train)

# Evaluar el modelo ajustado en el conjunto de entrenamiento y prueba
best_gb_model = random_search_gb.best_estimator_
y_train_pred = best_gb_model.predict(X_train_scaled)
y_test_pred = best_gb_model.predict(X_test_scaled)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Analizar los residuos
residuos = y_test - y_test_pred
plt.scatter(y_test_pred, residuos)
plt.axhline(0, linestyle='--', color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Residuals Analysis')
plt.show()

# Curvas de aprendizaje
train_sizes, train_scores, test_scores = learning_curve(
    best_gb_model,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error'
)

# Curvas de aprendizaje de plot
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, test_scores_mean, label='Validation error')
plt.xlabel('Training set size')
plt.ylabel('MSE')
plt.title('Learning Curves')
plt.legend()
plt.show()

# Mostrar RMSE
print(f'Training set RMSE: {train_rmse}')
print(f'Test set RMSE: {test_rmse}')

# Importancia de las características
features = ['Search Engine Bid', 'Impressions', 'Avg. Pos.']
feature_importances = best_gb_model.feature_importances_
plt.barh(features, feature_importances)
plt.xlabel('Feature importance')
plt.title('Feature importance for GradientBoostingRegressor')
plt.show()

NameError: name 'X_train' is not defined

In [None]:
# Generación de predicciones para el conjunto de prueba
predictions = gb_model.predict(X_test_scaled)

# Creación del archivo CSV para la competencia de Kaggle
submission = pd.DataFrame({'entry_id': df_test['entry_id'], 'Clicks': predictions})
submission.to_csv('submission.csv', index=False)