In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import f_oneway
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Descargas necesarias para NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Definir funciones de preprocesamiento de texto
def preprocess_text(text):
    # Tokenización
    tokens = word_tokenize(text.lower())
    # Eliminación de stopwords y puntuación
    tokens = [token for token in tokens if token not in stopwords.words('english') and token not in string.punctuation]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Preprocesamiento de la columna 'Keyword'
df_train['Preprocessed Keyword'] = df_train['Keyword'].apply(preprocess_text)

# Vectorización TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=600)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train['Preprocessed Keyword'])

# Modelado LDA
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(tfidf_matrix)

# Asignar a cada muestra el tópico más probable de LDA
df_train['Topic'] = lda_model.transform(tfidf_matrix).argmax(axis=1)

# Convertir la columna 'Topic' en variables dummy
df_train = pd.get_dummies(df_train, columns=['Topic'], drop_first=True)

# Definir kmeans
kmeans = KMeans(n_clusters=10, random_state=42)
# Ajustar el modelo KMeans
kmeans.fit(tfidf_matrix)

def preprocess_df_general(df, kmeans):
    # Limpiar columnas numéricas, excepto 'Clicks'
    numeric_cols = ['Search Engine Bid', 'Avg. Pos.', 'Avg. Cost per Click', 'Impressions']
    for col in numeric_cols:
        if df[col].dtype == object:
            df[col] = pd.to_numeric(df[col].str.replace('$', '').str.replace(',', ''), errors='coerce')
    df['Impressions'].fillna(df['Impressions'].median(), inplace=True)

    # Procesamiento que aplica tanto al conjunto de entrenamiento como al de prueba
    keywords_tfidf = tfidf_vectorizer.transform(df['Keyword'].str.lower())
    keyword_clusters = kmeans.predict(keywords_tfidf)
    df['Keyword Cluster'] = keyword_clusters
    df['Interaction'] = df['Keyword'].astype(str) + '_' + df['Match Type'].astype(str)
    bin_edges = [0, 100, 1000, 10000, np.inf]
    bin_labels = [1, 2, 3, 4]
    df['Impressions Category'] = pd.cut(df['Impressions'], bins=bin_edges, labels=bin_labels, right=False).cat.add_categories([0]).fillna(0).astype(int)
    
    return df

def preprocess_df_train(df, kmeans):
    df = preprocess_df_general(df, kmeans)
    # Limpiar y convertir 'Clicks' a numérico solo para el conjunto de entrenamiento
    df['Clicks'] = pd.to_numeric(df['Clicks'].str.replace(',', ''), errors='coerce')
    return df

# Preprocesamiento de datos
df_train_cleaned = preprocess_df_train(df_train.copy(), kmeans)
df_test_cleaned = preprocess_df_general(df_test.copy(), kmeans)

selected_features = ['Search Engine Bid', 'Impressions Category', 'Avg. Pos.', 'Keyword Cluster']
X_train_cleaned = df_train_cleaned[selected_features]
y_train_cleaned = df_train_cleaned['Clicks'].astype(float)
X_train_cleaned.fillna(0, inplace=True)

# Ajustar el modelo de regresión lineal
model_cleaned = RandomForestRegressor(n_estimators=100, random_state=42)
model_cleaned.fit(X_train_cleaned, y_train_cleaned)

# Realizar cross-validation con el modelo de regresión lineal
cv_scores_cleaned = cross_val_score(model_cleaned, X_train_cleaned, y_train_cleaned, cv=5, scoring='neg_mean_squared_error')
cv_rmse_cleaned = np.sqrt(-cv_scores_cleaned)
cv_rmse_cleaned_mean = cv_rmse_cleaned.mean()

print("RMSE promedio del modelo limpio:", cv_rmse_cleaned_mean)

# Análisis de Valores Atípicos
# Investigar los casos de valores atípicos para determinar su naturaleza
outliers = df_train_cleaned[(np.abs(df_train_cleaned['Clicks'] - df_train_cleaned['Clicks'].mean()) > (3 * df_train_cleaned['Clicks'].std()))]
print("Casos de valores atípicos:")
print(outliers)

# Evaluación Estadística
# Prueba de ANOVA para determinar si las diferencias en los 'Clicks' entre los tópicos son significativas
anova_result = f_oneway(
    df_train_cleaned[df_train_cleaned['Topic_1'] == 1]['Clicks'],
    df_train_cleaned[df_train_cleaned['Topic_2'] == 1]['Clicks'],
    df_train_cleaned[df_train_cleaned['Topic_3'] == 1]['Clicks']
)

print("Resultados de ANOVA:", anova_result)




[nltk_data] Downloading package stopwords to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cleaned.fillna(0, inplace=True)


RMSE promedio del modelo limpio: 746.8661522231882
Casos de valores atípicos:
      entry_id     Publisher Name          Keyword Match Type  \
5      mkt_007        Google - US       air france      Broad   
184   mkt_1082  Overture - Global    france travel   Advanced   
209   mkt_1105      Overture - US   airline ticket   Standard   
475   mkt_1349    Google - Global     [air france]      Exact   
592   mkt_1459        Google - US   air france com      Broad   
676   mkt_1536         Yahoo - US        airfrance   Advanced   
685   mkt_1544        Google - US       air france      Broad   
982   mkt_1818      Overture - US        airfrance   Standard   
983   mkt_1819      Overture - US    europe travel   Standard   
1314  mkt_2127        Google - US  flight to paris      Broad   
1453  mkt_2255         Yahoo - US        airfrance   Advanced   
1685  mkt_2468       MSN - Global       air france      Broad   
2084  mkt_2839        Google - US        airfrance      Broad   
2348  mkt_30

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Modelos RandomForestRegressor y GradientBoostingRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Variables predictoras y variable objetivo
X_train = df_train_cleaned[selected_features]
y_train = df_train_cleaned['Clicks']

# Validación cruzada con RandomForestRegressor
cv_scores_rf = cross_val_score(model_rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_rf = np.sqrt(-cv_scores_rf)
cv_rmse_rf_mean = cv_rmse_rf.mean()

print("RMSE promedio del modelo RandomForestRegressor:", cv_rmse_rf_mean)

# Validación cruzada con GradientBoostingRegressor
cv_scores_gb = cross_val_score(model_gb, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_gb = np.sqrt(-cv_scores_gb)
cv_rmse_gb_mean = cv_rmse_gb.mean()

print("RMSE promedio del modelo GradientBoostingRegressor:", cv_rmse_gb_mean)


RMSE promedio del modelo RandomForestRegressor: 746.8661522231882
RMSE promedio del modelo GradientBoostingRegressor: 856.1206204153044


In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Definir el espacio de búsqueda de hiperparámetros para RandomForestRegressor
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Inicializar el modelo RandomForestRegressor
model_rf = RandomForestRegressor(random_state=42)

# Realizar la búsqueda de cuadrícula con validación cruzada para RandomForestRegressor
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Mejores hiperparámetros encontrados para RandomForestRegressor
best_params_rf = grid_search_rf.best_params_


