In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from scipy import stats

# Load the training and test datasets.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs on other machines.
ruta_train = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/traincase.csv'
ruta_test = 'C:/Users/Marcio Pineda/Documents/Archivos Python/datasets/testcase.csv'
df_train, df_test = (pd.read_csv(ruta) for ruta in (ruta_train, ruta_test))

# Lower-case the training keywords (only the train column is modified here;
# the test column is used as read), then stack train and test keywords into
# one series: train rows first, test rows after.
df_train['Keyword'] = df_train['Keyword'].str.lower()
keyword_columns = [df_train['Keyword'], df_test['Keyword']]
all_keywords = pd.concat(keyword_columns, ignore_index=True)

# Fit a TF-IDF vocabulary (capped at 600 terms) on the combined keywords and
# expose the resulting matrix as a DataFrame with one column per term.
tfidf_vectorizer = TfidfVectorizer(max_features=600)
keywords_tfidf = tfidf_vectorizer.fit_transform(all_keywords)
keywords_tfidf_df = pd.DataFrame(
    keywords_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Attach the TF-IDF columns to the training frame. The first len(df_train)
# rows of the TF-IDF frame are the training rows because train was stacked
# first above; the column-wise concat lines rows up by index label.
n_train_rows = len(df_train)
train_tfidf = keywords_tfidf_df.iloc[:n_train_rows]
df_train = pd.concat([df_train, train_tfidf], axis=1)

# Helper to clean numeric columns that were read as text
def clean_numeric_column(column):
    """Convert a text column holding formatted numbers into a numeric Series.

    Thousands separators (``,``) and dollar signs (``$``) are removed, the
    surrounding whitespace is stripped, and the result is parsed with
    ``pd.to_numeric``. Values that still cannot be parsed become NaN.

    Parameters
    ----------
    column : pandas.Series
        Column to clean. Text columns (``object`` or pandas ``StringDtype``)
        are cleaned; a column of any other dtype is returned untouched.

    Returns
    -------
    pandas.Series
        The parsed numeric column, or ``column`` itself (same object) when it
        is not a text column.
    """
    dtype = column.dtype
    is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    if not is_text:
        return column

    # regex=False is essential: when the pattern is interpreted as a regular
    # expression (the default before pandas 2.0), '$' is the end-of-string
    # anchor, so the dollar sign would never be removed and every currency
    # value would be coerced to NaN below.
    cleaned = (
        column.str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.strip()
    )
    return pd.to_numeric(cleaned, errors='coerce')

# Parse the columns that should be numeric (they may have been read as text).
numeric_columns = ['Search Engine Bid', 'Impressions', 'Avg. Cost per Click', 'Avg. Pos.', 'Clicks']
df_train = df_train.assign(
    **{name: clean_numeric_column(df_train[name]) for name in numeric_columns}
)

# Treat an 'Impressions' value of 0 as missing by turning it into NaN.
# NOTE(review): nothing in this step imputes or drops those NaNs afterwards;
# confirm that the later modelling/statistics steps tolerate missing values.
impressions = df_train['Impressions']
df_train['Impressions'] = impressions.replace(0, np.nan)

# Rescale 'Impressions' with RobustScaler (chosen by the original author to
# cope with outliers). The scaler is fitted on every row of df_train and the
# column is overwritten with the scaled values.
scaler = RobustScaler()
impressions_2d = df_train['Impressions'].to_numpy().reshape(-1, 1)
df_train['Impressions'] = scaler.fit_transform(impressions_2d)

# Columns fed to the model: three numeric columns followed by ten lower-case
# word columns (presumably TF-IDF keyword terms -- verify against the
# vectorizer's vocabulary).
numeric_features = ['Search Engine Bid', 'Impressions', 'Avg. Pos.']
token_features = [
    'air', 'airfare', 'airfrance', 'airline', 'com',
    'france', 'ticket', 'to', 'travel', 'vacation',
]
selected_features = numeric_features + token_features

# Feature matrix (only the selected columns) and target ('Clicks').
X = df_train[selected_features]
y = df_train['Clicks']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature-importance analysis: fit a 100-tree random forest on the training
# split (fixed seed) and read its per-feature importances.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
feature_importance = rf.feature_importances_

# Print the features from most to least important. The order comes from an
# ascending argsort that is then reversed.
sorted_idx = np.argsort(feature_importance)[::-1]
for idx in sorted_idx:
    feature_name, importance = selected_features[idx], feature_importance[idx]
    print(f'{feature_name}: {importance}')

# Correlation analysis: pairwise correlation matrix of the selected features
# and the target.
correlation_matrix = df_train[selected_features + ['Clicks']].corr()
print("Correlation matrix:")
print(correlation_matrix)

# Statistical tests: Pearson correlation and p-value of each feature against
# 'Clicks'.
for feature in selected_features:
    # Earlier cleaning can leave NaN in these columns (zeros replaced by NaN,
    # unparsable values coerced to NaN). pearsonr does not skip them, so keep
    # only the rows where both values are present -- the same pairwise-complete
    # rule DataFrame.corr() applies above. With no missing data this is a no-op.
    complete_pairs = df_train[[feature, 'Clicks']].dropna()
    if len(complete_pairs) < 2:
        # pearsonr needs at least two observations; report "undefined" instead
        # of aborting the whole loop.
        corr, p_value = np.nan, np.nan
    else:
        corr, p_value = stats.pearsonr(complete_pairs[feature], complete_pairs['Clicks'])
    print(f'{feature}: Correlation={corr}, p-value={p_value}')

Search Engine Bid: 0.43002641296810923
Impressions: 0.4227083833622379
Avg. Pos.: 0.046535268281085326
air: 0.03115633146261452
airfrance: 0.02037070577453283
france: 0.020092858962493784
to: 0.00984749398635821
vacation: 0.006389320504034879
com: 0.004901463360207892
airline: 0.003636961290231447
ticket: 0.0023152379593909165
travel: 0.0014462362332559639
airfare: 0.0005733258554471586
Correlation matrix:
                   Search Engine Bid  Impressions  Avg. Pos.       air  \
Search Engine Bid           1.000000     0.025286  -0.265718 -0.036112   
Impressions                 0.025286     1.000000  -0.004400 -0.006637   
Avg. Pos.                  -0.265718    -0.004400   1.000000 -0.011318   
air                        -0.036112    -0.006637  -0.011318  1.000000   
airfare                    -0.037123     0.002728  -0.022424 -0.093491   
airfrance                   0.006825     0.000701  -0.095242 -0.046593   
airline                    -0.050064     0.041874   0.071382 -0.077704  