In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import f_oneway, ttest_ind
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder



# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Corrección: Separar la limpieza de 'Clicks' solo para el conjunto de entrenamiento
def preprocess_df_general(df):
    # Limpiar columnas numéricas, excepto 'Clicks'
    numeric_cols = ['Search Engine Bid', 'Avg. Pos.', 'Avg. Cost per Click', 'Impressions']
    for col in numeric_cols:
        if df[col].dtype == object:
            df[col] = pd.to_numeric(df[col].str.replace('$', '').str.replace(',', ''), errors='coerce')
    df['Impressions'].fillna(df['Impressions'].median(), inplace=True)

    # Procesamiento que aplica tanto al conjunto de entrenamiento como al de prueba
    tfidf_vectorizer = TfidfVectorizer(max_features=600)
    keywords_tfidf = tfidf_vectorizer.fit_transform(df['Keyword'].str.lower())
    kmeans = KMeans(n_clusters=10, random_state=42)
    keyword_clusters = kmeans.fit_predict(keywords_tfidf.toarray())
    df['Keyword Cluster'] = keyword_clusters
    df['Interaction'] = df['Keyword'].astype(str) + '_' + df['Match Type'].astype(str)
    bin_edges = [0, 100, 1000, 10000, np.inf]
    bin_labels = [1, 2, 3, 4]
    df['Impressions Category'] = pd.cut(df['Impressions'], bins=bin_edges, labels=bin_labels, right=False).cat.add_categories([0]).fillna(0).astype(int)
    
    # Crear la columna 'Topic'
    df['Topic'] = df['Keyword'].apply(lambda x: obtener_topico(x))  # Reemplaza 'obtener_topico' por la función que uses para obtener el tópico
    
    return df

# Corrección en la función obtener_topico
def obtener_topico(keyword):
    # Implementa la lógica para asignar tópicos reales basados en tus datos
    # Por ejemplo, puedes utilizar un modelo de clasificación para predecir el tópico
    # en función de la palabra clave
    return "Topico_Prueba"

def preprocess_df_train(df):
    df = preprocess_df_general(df)
    # Limpiar y convertir 'Clicks' a numérico solo para el conjunto de entrenamiento
    df['Clicks'] = pd.to_numeric(df['Clicks'].str.replace(',', ''), errors='coerce')
    return df

df_train_cleaned = preprocess_df_train(df_train.copy())
df_test_cleaned = preprocess_df_general(df_test.copy())

selected_features = ['Search Engine Bid', 'Impressions Category', 'Avg. Pos.', 'Keyword Cluster']
X_train_cleaned = df_train_cleaned[selected_features]
y_train_cleaned = df_train_cleaned['Clicks'].astype(float)
X_train_cleaned.fillna(0, inplace=True)
X_train_with_const_cleaned = sm.add_constant(X_train_cleaned)
model_cleaned = sm.OLS(y_train_cleaned, X_train_with_const_cleaned)
results_cleaned = model_cleaned.fit()

print(results_cleaned.summary())

# Preparar las variables para el modelo RandomForestRegressor
X_train_with_topics = df_train_cleaned[selected_features + ['Topic']]  # Agrega la columna 'Topic' como característica
y_train_with_topics = df_train_cleaned['Clicks']


# Ajustar el modelo sin los tópicos
model_rf_with_topics = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_with_topics = cross_val_score(model_rf_with_topics, X_train_with_topics, y_train_with_topics,
                                        cv=5, scoring='neg_mean_squared_error')
cv_rmse_with_topics = np.sqrt(-cv_scores_with_topics)
cv_rmse_with_topics_mean = cv_rmse_with_topics.mean()

print("RMSE promedio del modelo sin tópicos:", cv_rmse_without_topics_mean)

# Explorar Interacciones
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_train_with_topics[['Search Engine Bid', 'Topic']])

# Ajustar modelo con interacciones
model_interactions = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_interactions = cross_val_score(model_interactions, X_interactions, y_train_with_topics,
                                         cv=5, scoring='neg_mean_squared_error')
cv_rmse_interactions = np.sqrt(-cv_scores_interactions)
cv_rmse_interactions_mean = cv_rmse_interactions.mean()

print("RMSE promedio del modelo con interacciones entre 'Search Engine Bid' y 'Topic':", cv_rmse_interactions_mean)

df_train_cleaned = pd.get_dummies(df_train_cleaned, columns=['Topic'])
df_test_cleaned = pd.get_dummies(df_test_cleaned, columns=['Topic'])

# Asegurarse de que ambas, train y test, tienen las mismas columnas dummy
df_train_cleaned, df_test_cleaned = df_train_cleaned.align(df_test_cleaned, join='inner', axis=1)

# Ajustar el modelo RandomForestRegressor con los tópicos ahora codificados como dummies
model_rf_with_topics = RandomForestRegressor(n_estimators=100, random_state=42)

# Asumiendo que has dividido tu df_train_cleaned en X_train_cleaned (características) y y_train_cleaned ('Clicks')
X_train_cleaned = df_train_cleaned.drop('Clicks', axis=1)  # O cualquier otra columna que represente el objetivo
y_train_cleaned = df_train_cleaned['Clicks']

# Ahora puedes realizar cross-validation con el modelo que incluye los tópicos como características
cv_scores_with_topics = cross_val_score(model_rf_with_topics, X_train_cleaned, y_train_cleaned, cv=5, scoring='neg_mean_squared_error')
cv_rmse_with_topics = np.sqrt(-cv_scores_with_topics)
cv_rmse_with_topics_mean = cv_rmse_with_topics.mean()

print("RMSE promedio del modelo con tópicos:", cv_rmse_with_topics_mean)

print("RMSE promedio del modelo con tópicos:", cv_rmse_with_topics_mean)

# Análisis de Valores Atípicos
# Investiga los casos de valores atípicos para determinar su naturaleza
outliers = df_train_cleaned[(np.abs(df_train_cleaned['Clicks'] - df_train_cleaned['Clicks'].mean()) > (3 * df_train_cleaned['Clicks'].std()))]
print("Casos de valores atípicos:")
print(outliers)

# Evaluación Estadística
# Prueba de ANOVA para determinar si las diferencias en los 'Clicks' entre los tópicos son significativas
anova_result = f_oneway(df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico1'],
                        df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico2'],
                        df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico3'])
print("Resultados de ANOVA:")
print(anova_result)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cleaned.fillna(0, inplace=True)


                            OLS Regression Results                            
Dep. Variable:                 Clicks   R-squared:                       0.071
Model:                            OLS   Adj. R-squared:                  0.071
Method:                 Least Squares   F-statistic:                     84.74
Date:                Sun, 10 Mar 2024   Prob (F-statistic):           1.95e-69
Time:                        15:15:13   Log-Likelihood:                -36683.
No. Observations:                4410   AIC:                         7.338e+04
Df Residuals:                    4405   BIC:                         7.341e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 -688.9431 

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\ensemble\_forest.py", line 363, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Marcio Pineda\Documents\Python\Lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Topico_Prueba'
