In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import f_oneway
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Descargas necesarias para NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

# Definir funciones de preprocesamiento de texto
def preprocess_text(text):
    # Tokenización
    tokens = word_tokenize(text.lower())
    # Eliminación de stopwords y puntuación
    tokens = [token for token in tokens if token not in stopwords.words('english') and token not in string.punctuation]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Preprocesamiento de la columna 'Keyword'
df_train['Preprocessed Keyword'] = df_train['Keyword'].apply(preprocess_text)

# Vectorización TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=600)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train['Preprocessed Keyword'])

# Modelado LDA
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(tfidf_matrix)

# Obtención de los tópicos para cada keyword
topic_keywords = lda_model.components_
top_keyword_indices = topic_keywords.argsort(axis=1)[:, -5:]  # Obtener los índices de las 5 palabras clave principales para cada tópico

# Asignación de tópicos a cada keyword
topic_assignments = []
for idx in range(len(df_train)):
    keyword_indices = tfidf_matrix[idx].indices
    if len(keyword_indices) > 0:  # Verificar si la lista de índices no está vacía
        keyword_topics = [np.where(top_keyword_indices == keyword_idx)[0][0] for keyword_idx in keyword_indices]
        topic_assignments.append(keyword_topics)
    else:
        topic_assignments.append([])  # Si no hay índices, agregar una lista vacía

# Asignar los tópicos a cada keyword en el DataFrame
df_train['Topic'] = topic_assignments

def preprocess_df_general(df):
    # Limpiar columnas numéricas, excepto 'Clicks'
    numeric_cols = ['Search Engine Bid', 'Avg. Pos.', 'Avg. Cost per Click', 'Impressions']
    for col in numeric_cols:
        if df[col].dtype == object:
            df[col] = pd.to_numeric(df[col].str.replace('$', '').str.replace(',', ''), errors='coerce')
    df['Impressions'].fillna(df['Impressions'].median(), inplace=True)

    # Procesamiento que aplica tanto al conjunto de entrenamiento como al de prueba
    keywords_tfidf = tfidf_vectorizer.transform(df['Keyword'].str.lower())
    keyword_clusters = kmeans.predict(keywords_tfidf)
    df['Keyword Cluster'] = keyword_clusters
    df['Interaction'] = df['Keyword'].astype(str) + '_' + df['Match Type'].astype(str)
    bin_edges = [0, 100, 1000, 10000, np.inf]
    bin_labels = [1, 2, 3, 4]
    df['Impressions Category'] = pd.cut(df['Impressions'], bins=bin_edges, labels=bin_labels, right=False).cat.add_categories([0]).fillna(0).astype(int)
    
    return df

def preprocess_df_train(df):
    df = preprocess_df_general(df)
    # Limpiar y convertir 'Clicks' a numérico solo para el conjunto de entrenamiento
    df['Clicks'] = pd.to_numeric(df['Clicks'].str.replace(',', ''), errors='coerce')
    return df

# Preprocesamiento de datos
df_train_cleaned = preprocess_df_train(df_train.copy())
df_test_cleaned = preprocess_df_general(df_test.copy())


selected_features = ['Search Engine Bid', 'Impressions Category', 'Avg. Pos.', 'Keyword Cluster']
X_train_cleaned = df_train_cleaned[selected_features]
y_train_cleaned = df_train_cleaned['Clicks'].astype(float)
X_train_cleaned.fillna(0, inplace=True)
X_train_with_const_cleaned = sm.add_constant(X_train_cleaned)
model_cleaned = sm.OLS(y_train_cleaned, X_train_with_const_cleaned)
results_cleaned = model_cleaned.fit()

print(results_cleaned.summary())

# Preparar las variables para el modelo RandomForestRegressor
X_train_with_topics = df_train_cleaned[selected_features + ['Topic']]  # Agrega la columna 'Topic' como característica
y_train_with_topics = df_train_cleaned['Clicks']


# Ajustar el modelo sin los tópicos
model_rf_with_topics = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_with_topics = cross_val_score(model_rf_with_topics, X_train_with_topics, y_train_with_topics,
                                        cv=5, scoring='neg_mean_squared_error')
cv_rmse_with_topics = np.sqrt(-cv_scores_with_topics)
cv_rmse_with_topics_mean = cv_rmse_with_topics.mean()

print("RMSE promedio del modelo sin tópicos:", cv_rmse_without_topics_mean)

# Explorar Interacciones
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_train_with_topics[['Search Engine Bid', 'Topic']])

# Ajustar modelo con interacciones
model_interactions = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores_interactions = cross_val_score(model_interactions, X_interactions, y_train_with_topics,
                                         cv=5, scoring='neg_mean_squared_error')
cv_rmse_interactions = np.sqrt(-cv_scores_interactions)
cv_rmse_interactions_mean = cv_rmse_interactions.mean()

print("RMSE promedio del modelo con interacciones entre 'Search Engine Bid' y 'Topic':", cv_rmse_interactions_mean)

df_train_cleaned = pd.get_dummies(df_train_cleaned, columns=['Topic'])
df_test_cleaned = pd.get_dummies(df_test_cleaned, columns=['Topic'])

# Asegurarse de que ambas, train y test, tienen las mismas columnas dummy
df_train_cleaned, df_test_cleaned = df_train_cleaned.align(df_test_cleaned, join='inner', axis=1)

# Ajustar el modelo RandomForestRegressor con los tópicos ahora codificados como dummies
model_rf_with_topics = RandomForestRegressor(n_estimators=100, random_state=42)

# Asumiendo que has dividido tu df_train_cleaned en X_train_cleaned (características) y y_train_cleaned ('Clicks')
X_train_cleaned = df_train_cleaned.drop('Clicks', axis=1)  # O cualquier otra columna que represente el objetivo
y_train_cleaned = df_train_cleaned['Clicks']

# Ahora puedes realizar cross-validation con el modelo que incluye los tópicos como características
cv_scores_with_topics = cross_val_score(model_rf_with_topics, X_train_cleaned, y_train_cleaned, cv=5, scoring='neg_mean_squared_error')
cv_rmse_with_topics = np.sqrt(-cv_scores_with_topics)
cv_rmse_with_topics_mean = cv_rmse_with_topics.mean()

print("RMSE promedio del modelo con tópicos:", cv_rmse_with_topics_mean)

print("RMSE promedio del modelo con tópicos:", cv_rmse_with_topics_mean)

# Análisis de Valores Atípicos
# Investiga los casos de valores atípicos para determinar su naturaleza
outliers = df_train_cleaned[(np.abs(df_train_cleaned['Clicks'] - df_train_cleaned['Clicks'].mean()) > (3 * df_train_cleaned['Clicks'].std()))]
print("Casos de valores atípicos:")
print(outliers)

# Evaluación Estadística
# Prueba de ANOVA para determinar si las diferencias en los 'Clicks' entre los tópicos son significativas
anova_result = f_oneway(df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico1'],
                        df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico2'],
                        df_train_cleaned['Clicks'][df_train_cleaned['Topic'] == 'Topico3'])
print("Resultados de ANOVA:")
print(anova_result)

[nltk_data] Downloading package stopwords to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Marcio
[nltk_data]     Pineda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


IndexError: index 0 is out of bounds for axis 0 with size 0