In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders import OneHotEncoder, TargetEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Cargar los conjuntos de datos
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)


df_train['Keyword'] = df_train['Keyword'].str.lower()  # Convertir a minúsculas
all_keywords = df_train['Keyword']
tfidf_vectorizer = TfidfVectorizer(max_features=600)
df_train_tfidf = tfidf_vectorizer.fit_transform(all_keywords)
df_train_tfidf_df = pd.DataFrame(df_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df_train.index)

# Concatenar TF-IDF con los conjuntos de datos originales
df_train = pd.concat([df_train.drop(columns=['Keyword']), df_train_tfidf_df], axis=1)

# Función para limpiar columnas numéricas
def clean_numeric_column(column):
    # Elimina comas, signos de dólar, y espacios en blanco antes de convertir
    column_as_str = column.astype(str).str.replace(',', '').str.replace('$', '').str.strip()
    return pd.to_numeric(column_as_str, errors='coerce')
   

# Limpieza de columnas numéricas
columns_to_clean = ['Search Engine Bid', 'Impressions', 'Avg. Cost per Click', 'Avg. Pos.']
for column in columns_to_clean:
    df_train[column] = clean_numeric_column(df_train[column])

# Reemplazar ceros en 'Impressions' para la transformación Box-Cox
min_value = df_train[df_train['Impressions'] > 0]['Impressions'].min()
df_train['Impressions'] = df_train['Impressions'].replace(0, min_value)
fitted_lambda = stats.boxcox(df_train['Impressions'])[1]
df_train['Impressions'] = stats.boxcox(df_train['Impressions'], lmbda=fitted_lambda)

# Visualizar los primeros registros del conjunto de entrenamiento
df_train.head()

Unnamed: 0,entry_id,Publisher Name,Match Type,Campaign,Keyword Group,Category,Bid Strategy,Status,Search Engine Bid,Impressions,...,washington,web,webpage,website,welcome,xpress,yaoundé,york,zagreb,zurich
0,mkt_001,Google - Global,Broad,Air France Global Campaign,Nice,nice,Position 1- 3,Unavailable,1.25,5.535048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,mkt_002,Yahoo - US,Advanced,Western Europe Destinations,Munich,uncategorized,,Paused,6.25,3.097339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,mkt_003,Overture - Global,Advanced,Unassigned,Unassigned,paris,Position 1-2 Target,Sent,0.45,7.067727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,mkt_005,Yahoo - US,Advanced,Geo Targeted Los Angeles,Discount International Los Angeles,uncategorized,,Paused,6.25,4.22387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,mkt_006,Google - US,,Google_Yearlong 2006,Google|marrakech,uncategorized,,Unavailable,7.5,2.842522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Asegúrate de que 'Clicks' también esté limpia si va a ser utilizada como tu variable objetivo
df_train['Clicks'] = clean_numeric_column(df_train['Clicks'])

# Dividir los datos en características (X) y objetivo (y)
# Asegúrate de no incluir columnas no deseadas en X
X = df_train.drop(columns=['Clicks', 'entry_id'], errors='ignore')

X = pd.concat([X, df_train_tfidf_df], axis=1)  # Añadir características TF-IDF
y = df_train['Clicks']

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
import category_encoders as ce

# Asume que has identificado las columnas para One-Hot Encoding y Target Encoding
one_hot_cols = ['Match Type', 'Bid Strategy', 'Status']
target_encode_cols = ['Campaign', 'Category', 'Publisher Name', 'Keyword Group']

preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', SklearnOneHotEncoder(), one_hot_cols),
        ('target_encode', ce.TargetEncoder(), target_encode_cols)
    ],
    remainder='passthrough'  # Mantener el resto de las columnas sin cambios
)


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

model_pipeline.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error

# Dividir en entrenamiento y validación
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = model_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")




ValueError: could not convert string to float: '9,391'

In [7]:


# Asegúrate de que todas las columnas que se van a procesar con One-Hot Encoding o Target Encoding existen
one_hot_cols = [col for col in ['Match Type', 'Bid Strategy', 'Status'] if col in X.columns]
target_encode_cols = [col for col in ['Campaign', 'Category', 'Publisher Name', 'Keyword Group'] if col in X.columns]

# Aplicar One-Hot Encoding a variables con pocas categorías
one_hot_cols = ['Match Type', 'Bid Strategy', 'Status']
one_hot_encoder = OneHotEncoder(cols=one_hot_cols, use_cat_names=True)
X_train_one_hot = one_hot_encoder.fit_transform(X_train[one_hot_cols])
X_val_one_hot = one_hot_encoder.transform(X_val[one_hot_cols])

# Aplicar Target Encoding a variables con un número moderado de categorías
target_encode_cols = ['Campaign', 'Category', 'Publisher Name', 'Keyword Group']
target_encoder = TargetEncoder(cols=target_encode_cols)
X_train_target_encoded = target_encoder.fit_transform(X_train[target_encode_cols], y_train)
X_val_target_encoded = target_encoder.transform(X_val[target_encode_cols])

# Unir las transformaciones con el resto de datos (excluyendo las columnas originales que ya han sido codificadas)
X_train_processed = X_train.drop(one_hot_cols + target_encode_cols, axis=1).join(X_train_one_hot).join(X_train_target_encoded)
X_val_processed = X_val.drop(one_hot_cols + target_encode_cols, axis=1).join(X_val_one_hot).join(X_val_target_encoded)

# Ahora X_train_processed y X_val_processed están listos para ser usados en el modelado

print("Dimensiones de X_train procesado:", X_train_processed.shape)
print("Dimensiones de X_val procesado:", X_val_processed.shape)

print("Primeras filas de X_train procesado:\n", X_train_processed.head())
print("Primeras filas de X_val procesado:\n", X_val_processed.head())

print("Resumen estadístico de X_train procesado:\n", X_train_processed.describe())




NameError: name 'X_train' is not defined