In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Corrección: Separar la limpieza de 'Clicks' solo para el conjunto de entrenamiento
def preprocess_df_general(df):
    # Limpiar columnas numéricas, excepto 'Clicks'
    numeric_cols = ['Search Engine Bid', 'Avg. Pos.', 'Avg. Cost per Click', 'Impressions']
    for col in numeric_cols:
        if df[col].dtype == object:
            df[col] = pd.to_numeric(df[col].str.replace('$', '').str.replace(',', ''), errors='coerce')
    df['Impressions'].fillna(df['Impressions'].median(), inplace=True)

    # Procesamiento que aplica tanto al conjunto de entrenamiento como al de prueba
    tfidf_vectorizer = TfidfVectorizer(max_features=600)
    keywords_tfidf = tfidf_vectorizer.fit_transform(df['Keyword'].str.lower())
    kmeans = KMeans(n_clusters=10, random_state=42)
    keyword_clusters = kmeans.fit_predict(keywords_tfidf.toarray())
    df['Keyword Cluster'] = keyword_clusters
    df['Interaction'] = df['Keyword'].astype(str) + '_' + df['Match Type'].astype(str)
    bin_edges = [0, 100, 1000, 10000, np.inf]
    bin_labels = [1, 2, 3, 4]
    df['Impressions Category'] = pd.cut(df['Impressions'], bins=bin_edges, labels=bin_labels, right=False).cat.add_categories([0]).fillna(0).astype(int)
    return df

def preprocess_df_train(df):
    df = preprocess_df_general(df)
    # Limpiar y convertir 'Clicks' a numérico solo para el conjunto de entrenamiento
    df['Clicks'] = pd.to_numeric(df['Clicks'].str.replace(',', ''), errors='coerce')
    return df

df_train_cleaned = preprocess_df_train(df_train.copy())
df_test_cleaned = preprocess_df_general(df_test.copy())

selected_features = ['Search Engine Bid', 'Impressions Category', 'Avg. Pos.', 'Keyword Cluster']
X_train_cleaned = df_train_cleaned[selected_features]
y_train_cleaned = df_train_cleaned['Clicks'].astype(float)
X_train_cleaned.fillna(0, inplace=True)
X_train_with_const_cleaned = sm.add_constant(X_train_cleaned)
model_cleaned = sm.OLS(y_train_cleaned, X_train_with_const_cleaned)
results_cleaned = model_cleaned.fit()

print(results_cleaned.summary())

# Preparar las variables para el modelo RandomForestRegressor
X_train_with_topics = df_train_cleaned[selected_features]
y_train_with_topics = df_train_cleaned['Clicks']

# Ajustar el modelo sin los tópicos
model_rf_without_topics = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_without_topics = cross_val_score(model_rf_without_topics, X_train_cleaned, y_train_cleaned,
                                           cv=5, scoring='neg_mean_squared_error')
cv_rmse_without_topics = np.sqrt(-cv_scores_without_topics)
cv_rmse_without_topics_mean = cv_rmse_without_topics.mean()

print("RMSE promedio del modelo sin tópicos:", cv_rmse_without_topics_mean)

# Ajustar el modelo con los tópicos
model_rf_with_topics = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_with_topics = cross_val_score(model_rf_with_topics, X_train_with_topics, y_train_cleaned,
                                        cv=5, scoring='neg_mean_squared_error')
cv_rmse_with_topics = np.sqrt(-cv_scores_with_topics)
cv_rmse_with_topics_mean = cv_rmse_with_topics.mean()

print("RMSE promedio del modelo con tópicos:", cv_rmse_with_topics_mean)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cleaned.fillna(0, inplace=True)


                            OLS Regression Results                            
Dep. Variable:                 Clicks   R-squared:                       0.071
Model:                            OLS   Adj. R-squared:                  0.071
Method:                 Least Squares   F-statistic:                     84.74
Date:                Sun, 10 Mar 2024   Prob (F-statistic):           1.95e-69
Time:                        14:47:37   Log-Likelihood:                -36683.
No. Observations:                4410   AIC:                         7.338e+04
Df Residuals:                    4405   BIC:                         7.341e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 -688.9431 

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

# Suponiendo que df_train_cleaned tiene una columna 'Topic' con la asignación de tópicos
plt.figure(figsize=(10, 6))
sns.boxplot(x='Topic', y='Clicks', data=df_train_cleaned)
plt.title('Distribución de Clicks por Tópico')
plt.show()


ValueError: Could not interpret input 'Topic'

<Figure size 1000x600 with 0 Axes>