In [2]:
import pandas as pd
import numpy as np

### 1. Cargamos Datos

In [3]:
dataset = pd.read_csv("../data/raw/train.csv")

dataset.head()

Unnamed: 0,CustomerID,Age,Gender,Income,VisitFrequency,AverageSpend,PreferredCuisine,TimeOfVisit,GroupSize,DiningOccasion,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
0,1457,34,Female,138842,Weekly,162.954929,Indian,Dinner,8,Celebration,Dine-in,1,0,1,4.228618,1,5,3,1
1,1371,54,Male,98671,Rarely,66.918873,Indian,Lunch,4,Casual,Takeaway,0,0,0,50.247186,1,2,2,0
2,1505,47,Male,122351,Monthly,94.12767,Indian,Lunch,7,Casual,Takeaway,0,1,0,10.174873,3,3,5,0
3,1011,38,Male,78868,Weekly,92.705568,Mexican,Dinner,4,Business,Takeaway,0,0,1,14.237746,4,1,4,0
4,1016,50,Female,128686,Monthly,166.931144,American,Dinner,8,Business,Dine-in,0,0,0,56.319628,4,3,4,0


### 2. Exploración de datos

In [4]:
dataset.describe

<bound method NDFrame.describe of       CustomerID  Age  Gender  Income VisitFrequency  AverageSpend  \
0           1457   34  Female  138842         Weekly    162.954929   
1           1371   54    Male   98671         Rarely     66.918873   
2           1505   47    Male  122351        Monthly     94.127670   
3           1011   38    Male   78868         Weekly     92.705568   
4           1016   50  Female  128686        Monthly    166.931144   
...          ...  ...     ...     ...            ...           ...   
1813        1127   68  Female  109615         Weekly    188.333292   
1814        1084   25    Male   94123         Weekly    127.012832   
1815        2153   51    Male  148333         Weekly    171.119498   
1816        1195   34  Female   51451          Daily    198.605492   
1817        1138   50  Female   83461         Weekly    168.548238   

     PreferredCuisine TimeOfVisit  GroupSize DiningOccasion  MealType  \
0              Indian      Dinner          8    Cele

In [7]:
dataset.dtypes

CustomerID                int64
Age                       int64
Gender                   object
Income                    int64
VisitFrequency           object
AverageSpend            float64
PreferredCuisine         object
TimeOfVisit              object
GroupSize                 int64
DiningOccasion           object
MealType                 object
OnlineReservation         int64
DeliveryOrder             int64
LoyaltyProgramMember      int64
WaitTime                float64
ServiceRating             int64
FoodRating                int64
AmbianceRating            int64
HighSatisfaction          int64
dtype: object

### 3. Eliminamos variables no útiles

In [8]:
dataset.drop(["CustomerID", "PreferredCuisine", "Gender", "TimeOfVisit", "AverageSpend", "Age"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,Income,VisitFrequency,GroupSize,DiningOccasion,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
0,138842,Weekly,8,Celebration,Dine-in,1,0,1,4.228618,1,5,3,1
1,98671,Rarely,4,Casual,Takeaway,0,0,0,50.247186,1,2,2,0
2,122351,Monthly,7,Casual,Takeaway,0,1,0,10.174873,3,3,5,0
3,78868,Weekly,4,Business,Takeaway,0,0,1,14.237746,4,1,4,0
4,128686,Monthly,8,Business,Dine-in,0,0,0,56.319628,4,3,4,0


### 4. Ingeniería de Características

In [9]:
dataset.isnull().mean()

Income                  0.0
VisitFrequency          0.0
GroupSize               0.0
DiningOccasion          0.0
MealType                0.0
OnlineReservation       0.0
DeliveryOrder           0.0
LoyaltyProgramMember    0.0
WaitTime                0.0
ServiceRating           0.0
FoodRating              0.0
AmbianceRating          0.0
HighSatisfaction        0.0
dtype: float64

In [10]:
columnas_categoricas = ['VisitFrequency', 'DiningOccasion', 'MealType']  # Lista de columnas categóricas

for col in columnas_categoricas:
    categorias = dataset[col].unique()
    print(f"Columna: {col}")
    print(f"Categorías únicas: {categorias}")
    print(f"Número de categorías: {len(categorias)}\n")


Columna: VisitFrequency
Categorías únicas: ['Weekly' 'Rarely' 'Monthly' 'Daily']
Número de categorías: 4

Columna: DiningOccasion
Categorías únicas: ['Celebration' 'Casual' 'Business']
Número de categorías: 3

Columna: MealType
Categorías únicas: ['Dine-in' 'Takeaway']
Número de categorías: 2



Codificación de la variable Meal Type usando OHE

In [11]:
# Usar pd.get_dummies() directamente y convertir a entero
dataset['MealType'] = pd.get_dummies(dataset['MealType'], drop_first=True).astype(int)

Codificacion de las variables VisitFrequency y DiningOccasion usando Frecuency Encoder

In [12]:
# Aplicar value_counts() y map()
visit_freq_counts = dataset['VisitFrequency'].value_counts()
dataset['VisitFrequency'] = dataset['VisitFrequency'].map(visit_freq_counts)

dining_occ_counts = dataset['DiningOccasion'].value_counts()
dataset['DiningOccasion'] = dataset['DiningOccasion'].map(dining_occ_counts)

In [13]:
dataset.head()

Unnamed: 0,Income,VisitFrequency,GroupSize,DiningOccasion,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
0,138842,906,8,728,0,1,0,1,4.228618,1,5,3,1
1,98671,306,4,545,1,0,0,0,50.247186,1,2,2,0
2,122351,411,7,545,1,0,1,0,10.174873,3,3,5,0
3,78868,906,4,545,1,0,0,1,14.237746,4,1,4,0
4,128686,411,8,545,0,0,0,0,56.319628,4,3,4,0


### 5. Guardar el dataset procesado

In [14]:
dataset.to_csv('../data/processed/features_for_model.csv', index=False)
#dataset.to_csv('features_for_model.csv', index=False)

Guardamos valores de configuración del train

In [15]:
feature_eng_configs = {
    'visit_freq_counts': visit_freq_counts,
    'dining_occ_counts': dining_occ_counts
}

import pickle
with open('../artifacts/feature_eng_configs.pkl', 'wb') as f:
#with open('feature_eng_configs.pkl', 'wb') as f:
    pickle.dump(feature_eng_configs, f)

