# Group 16


- Ana Margarida Valente, 20240936
- Catarina Carneiro, 20240690
- Rui Reis, 20240854
- Mara Mesquita, 20241039

Add index (table of contents)

Add description of the project

# 1. Import

## 1.1 Import Libraries

In [None]:
import sqlite3
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from scipy.stats import skewnorm

from datetime import datetime
from sklearn.impute import KNNImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


# for better resolution plots
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'

# Setting seaborn style
sns.set()

# Display all the df
pd.options.display.max_columns = None


## 1.2 Import Data

In [None]:
# Read the CSV file.
# The location can be overridden with the ABCDEATS_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is not set, the original local path is used.
data_path = os.environ.get(
    "ABCDEATS_DATA_PATH",
    r"C:\Users\anama\OneDrive\Ambiente de Trabalho\Mestrado\Projects\DM2425_ABCDEats_DATASET.csv",
)
df = pd.read_csv(data_path)

In [None]:
#Create an independent backup of the df.
#A plain assignment (df_backup = df) only gives the same object a second name,
#so every later change to df would also appear in the "backup".
df_backup = df.copy()

## Variables:
- customer_id: Unique identifier for each customer.
- customer_region: Geographic region where the customer is located.
- customer_age: Age of the customer.
- vendor_count: Number of unique vendors the customer has ordered from.
- product_count: Total number of products the customer has ordered.
- is_chain: Indicates whether the customer’s order was from a chain restaurant.
- first_order: Number of days from the start of the dataset when the customer first placed an order.
- last_order: Number of days from the start of the dataset when the customer most recently placed an order.
- last_promo: The category of the promotion or discount most recently used by the customer.
- payment_method: Method most recently used by the customer to pay for their orders.
- CUI_American,CUI_Asian,CUI_Chinese,CUI_Italian, etc.: The amount in monetary units spent by the customer from the indicated type of cuisine.
- DOW_0 to DOW_6: Number of orders placed on each day of the week (0 = Sunday, 6 = Saturday).
- HR_0 to HR_23: Number of orders placed during each hour of the day (0 = midnight, 23 = 11 PM).


# 2. Explore the Data

In [None]:
df.shape

In [None]:
# Display the first few rows of the dataframe
df.head()

In [None]:
#Check columns
df.columns.values

## Minor changes to the Data
1) Change the DOW columns to the days of the week names

2) Create new variables
  - Time Periods
  - Age Group


In [None]:
#1)
df= df.rename(columns={'DOW_0':'Sunday', 'DOW_1':'Monday', 'DOW_2':'Tuesday', 'DOW_3':'Wednesday', 'DOW_4':'Thursday', 'DOW_5':'Friday','DOW_6':'Saturday'})

In [None]:
#2 - Time periods: number of orders placed in each 6-hour window of the day
time_period_patterns = {
    'early_morning(0h-5h)': r'^HR_[0-5]$',
    'morning(6h-11h)': r'^HR_([6-9]|1[0-1])$',
    'afternoon(12h-17h)': r'^HR_1[2-7]$',
    'night(18h-23h)': r'^HR_(1[8-9]|2[0-3])$',
}

for period_name, hour_pattern in time_period_patterns.items():
    # Select the hourly columns of the window and add them up per customer
    hourly_orders = df.filter(regex=hour_pattern)
    df[period_name] = hourly_orders.sum(axis=1).astype(int)


In [None]:
#2 - Age group
# The bins are left-closed ([15, 20), [20, 30), ...) because right=False, so the
# last edge has to be 81 for customers aged exactly 80 to fall into
# 'Seniors (65-80)'; with a last edge of 80 they would get NaN as age_group.
age_labels = ['Teenagers (15-19)', 'Young Adults (20-29)', 'Adults (30-49)', 'Middle-aged (50-64)', 'Seniors (65-80)']
age_bins = [15, 20, 30, 50, 65, 81]
df['age_group'] = pd.cut(df['customer_age'], bins=age_bins, labels=age_labels, right=False)

## Data Analysis

### Data types:
- customer_age -> float? (change to int)
- first_order -> float? (change to int or date time (days) ?)
- last_order (change to date time (days) ?)
- HR_0 -> float? (change to int)

In [None]:
#Create a df only with the original variables (Drop the new columns created)
df_original = df.drop(columns=['early_morning(0h-5h)','morning(6h-11h)','afternoon(12h-17h)','night(18h-23h)','age_group'])

In [None]:
# Check data types
df_original.info()

### Fix data types:

In [None]:
# Cast the columns flagged above to pandas' nullable integer dtype ('Int64'),
# which can hold whole numbers together with missing values
for int_column in ('customer_age', 'first_order', 'HR_0'):
    df[int_column] = df[int_column].astype('Int64')

In [None]:
#Check only the original df = Categorical Variables
df_original.describe(include="O").T

In [None]:
#Check only the original df = Numerical Variables
df_original.describe(include=np.number).T

## Check for missing values:

In [None]:
df.isna().any()

In [None]:
df.isna().sum()

In [None]:
#Check the % of the missing values:
missing_percentage = (df.isnull().mean() * 100).sort_values(ascending=False)

print("Percentage of Missing Values:")
print(missing_percentage)


Missing Values : HR_0

In [None]:
#Check when HR_0 = NaN, which is the variable with most missing values
nan_HR_0 = df[df['HR_0'].isna()]
pd.set_option('display.max_columns', None)
nan_HR_0

In [None]:
#Define the columns of the DOW and the HR columns
dow_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hr_columns = [col for col in df.columns if col.startswith('HR_')]

In [None]:
#To replace the NaN's of HR_0, compare the sum of orders of the DOW with the sum of orders of the HR:
#both totals count the same orders, so if they match HR_0 should be 0, otherwise HR_0 is the difference between them

missing_hr_0 = df['HR_0'].isna()

row_sum_dow = df[dow_columns].sum(axis=1)
row_sum_hr = df[hr_columns].sum(axis=1)  #NaN's are skipped, so a missing HR_0 counts as 0 here

row_difference = row_sum_dow - row_sum_hr

#Only the rows where HR_0 is missing are filled
df.loc[missing_hr_0, 'HR_0'] = row_difference[missing_hr_0]

#'early_morning(0h-5h)' was computed while HR_0 still had missing values,
#so it has to be refreshed now that HR_0 is filled in
df['early_morning(0h-5h)'] = df.filter(regex=r'^HR_[0-5]$').sum(axis=1).astype(int)

df['HR_0'].head()


In [None]:
df['HR_0'].isna().sum()

In [None]:
#Sanity check: every customer must have the same total in the DOW columns and in the HR columns
orders_per_dow = df[dow_columns].sum(axis=1)
orders_per_hour = df[hr_columns].sum(axis=1)
check = orders_per_dow.eq(orders_per_hour).all()

print("Yes" if check else "No")


Missing Values : first_order

In [None]:
#Check when first_order = NaN
nan_first_order = df[df['first_order'].isna()]
nan_first_order

It seems that when first_order is missing, last_order = 0.

In [None]:
#Use df for both conditions (as the next cells do) instead of mixing in a mask built from the df_original snapshot
df[df['first_order'].isna() & (df['last_order'] == 0)]

In [None]:
#First we are checking to see if in this situation, there was only one order placed
check = (df[df['first_order'].isna() & (df['last_order'] == 0)][dow_columns].sum(axis=1) == 1).all()

if check:
    print("All rows have row_sum_dow equal to 1 (indicating only one order).")
else:
    print("There are rows where row_sum_dow is not 1.")

In [None]:
#Check which rows do not meet the condition
non_matching_rows = df[(df['first_order'].isna() & (df['last_order'] == 0)) & (df[dow_columns].sum(axis=1) != 1)]

non_matching_rows

There are only 2 cases that do not meet the condition. Both cases show that 2 orders were placed on the same day (Saturday). 

Based on this previous analysis, we will assume that when first_order is missing it should be replaced with 0, ensuring that both first_order and last_order occur on the same day (the day the dataset begins).

In [None]:
#first_order is missing only when last_order = 0
#first_order cannot happen after last_order. So we will set the missing first_order values to 0
df.loc[df['first_order'].isna() & (df['last_order'] == 0), 'first_order'] = 0


Missing Values : customer_age

In [None]:
#Check when customer_age = NaN
nan_customer_age = df[df['customer_age'].isna()]
nan_customer_age

#Maybe replace the missing values with the mean or median

In [None]:
plt.hist(df['customer_age'].dropna(), bins=30, edgecolor='black')  
plt.title('Histogram of Age')
plt.xlabel('Age') 
plt.ylabel('Frequency') 
plt.show()

In [None]:
#Since the histogram is skewed, the median is preferred to replace the missing values
median_age = df['customer_age'].median()
df['customer_age'] = df['customer_age'].fillna(median_age)
print(f"Median = {median_age}")

Now, to handle the missing values in the age_group column caused by missing customer_age, we need to replace them with the category "Young Adults (20-29)", since the median (26) falls into this range.

In [None]:
df['age_group'] = df['age_group'].fillna("Young Adults (20-29)")

## Check for Duplicates

In [None]:
df.duplicated().sum()

In [None]:
df.loc[df_original.duplicated(keep=False)]

In [None]:
#% of duplicates:
df.duplicated().mean()*100

In [None]:
#customer_id should identify exactly 1 customer, so list every id that appears more than once
id_counts = df['customer_id'].value_counts()
duplicate_values = id_counts[id_counts > 1]

is_repeated_id = df['customer_id'].isin(duplicate_values.index)
duplicate_rows = df[is_repeated_id]

print("\n")
print(f'Total: {len(duplicate_rows)}') #drop it?
duplicate_rows

The duplicates are only the cases where the customer_id is duplicated, meaning that there are 2 entries for the same customer in the dataset.

In [None]:
#Drop the duplicates, since it's a very small amount????
df.drop_duplicates(inplace=True)

In [None]:
#Drop Customer_id
df = df.drop('customer_id', axis=1)

## Check for unique and strange values:

Vendor_count

In [None]:
df['vendor_count'].unique()

In [None]:
df[df['vendor_count'] == 41]

Product_count

In [None]:
df['product_count'].unique()

In [None]:
df[df['product_count'] == 269]

Region

In [None]:
df['customer_region'].value_counts()

In [None]:
(len(df[df['customer_region'] == '-']) / len(df))*100

In [None]:
# Replace '-' with the mode
mode_value = df['customer_region'].mode()[0]  

df['customer_region'] = df['customer_region'].replace('-', mode_value)


- '-' -> Strange value = 1.386% of the rows; maybe use the mode or drop?
- There are 3 cities, so should we aggregate the regions by city, i.e. by the first digit of the region code? (2, 4, 8)
- Another possibility is to aggregate the regions based on the number of customers (1 (>5000): 8670, 4660, 2360; 2 (<5000 & >1000): 2440, 4140; 3 (<1000): 8370, 2490, -, 8550).
- (Issue to think about and address in the future: one option keeps the distribution balanced and the other does not.)

 Age

In [None]:
df['customer_age'].unique()

In [None]:
#Customers aged 15, 16 or 17: it could be a problem since they are minors
minor_ages = [15, 16, 17]
df[df['customer_age'].isin(minor_ages)]

Promotion

In [None]:
df['last_promo'].value_counts()

- '-' -> Changing to 'NO PROMO', to be more perceptible

In [None]:
#'NO PROMO' is spelled with the letter O (it was previously written with the digit 0)
df['last_promo'] = df['last_promo'].replace('-', 'NO PROMO')

Payment Method

In [None]:
df['payment_method'].value_counts()

First Order

In [None]:
df['first_order'].unique()

In [None]:
df['first_order'].max()
#Makes sense because the dataset is from a three-month period

Last Order

In [None]:
df['last_order'].unique()

In [None]:
#Maximum of last_order (this cell is about the last order, not the first one)
df['last_order'].max()

Is Chain

- This variable needs to be fixed. The metadata does not correspond to the dataset.

- Make it binary (corresponding to the metadata), or change the information in the metadata to be consistent with the dataset.

In [None]:
df['is_chain'].unique()

In [None]:
# Most of the orders are on bevarages
df[df['is_chain'] == 83]

Change is_chain to Binary Type

In [None]:
threshold = 0
df['is_chain'] = (df['is_chain'] > threshold).astype(int)

DOW 

In [None]:
# Show which order counts occur in each day-of-week column
separator = "-" * 70
for column in dow_columns:
    unique_values = df[column].unique()
    print(f"Column: {column}", f"Unique Values: {unique_values}", separator, sep="\n")

Hours

In [None]:
# Show which order counts occur in each hour-of-day column
for column in hr_columns:
    unique_values = df[column].unique()
    report_lines = (
        f"Column: {column}",
        f"Unique Values: {unique_values}",
        "-" * 70,
    )
    print("\n".join(report_lines))

In [None]:
df[df['HR_8'] == 52]

Cuisine Types

In [None]:
# Show the distinct amounts found in each cuisine (CUI_*) column
for col in df.columns:
    # Skip every column that is not a cuisine column
    if not col.startswith('CUI_'):
        continue
    unique_values = df[col].unique()
    print(f"Column: {col}", f"Unique Values: {unique_values}", "-" * 70, sep="\n")

# 3. Feature Understanding

Define groups of columns and create a new feature = Sum_of_Orders

In [None]:
#Groups of columns that belong to the same category
dow_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hr_columns = df.columns[df.columns.str.startswith('HR_')].tolist()
CUI_columns = df.columns[df.columns.str.startswith('CUI_')].tolist()
time_columns = ['early_morning(0h-5h)','morning(6h-11h)','afternoon(12h-17h)', 'night(18h-23h)']

#Totals of each column over all customers, one Series per group
DOW_counts, HR_counts, CUI_counts, time_counts = (
    df[column_group].sum()
    for column_group in (dow_columns, hr_columns, CUI_columns, time_columns)
)

#New feature with the number of orders per customer (row-wise total of the day-of-week columns)
df['Sum_of_Orders'] = df[dow_columns].sum(axis=1)


Compare First Order and Last Order

In [None]:
result = df[df['last_order'] < df['first_order']]
print(result)
#All good, since it wouldn't make sense if there was a last order before a first order

 Define Numerical and Categorical Features

In [None]:
numerical_features=['customer_age', 'vendor_count','product_count', 'first_order', 'last_order']
categorical_features=['customer_region','last_promo','payment_method','age_group', 'is_chain']

## Numerical Features

In [None]:
def analyze_outliers(data):
    """Count the outliers of a numeric Series using the 1.5 * IQR rule.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to analyse.

    Returns
    -------
    tuple
        (number of outliers, percentage of outliers, lower limit, upper limit).
        The percentage is relative to len(data), so missing values count in
        the denominator. For an empty Series the percentage is 0.0.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_lim = Q1 - 1.5 * IQR
    upper_lim = Q3 + 1.5 * IQR
    outliers = data[(data < lower_lim) | (data > upper_lim)]
    # Guard against a division by zero when there is no data at all
    percentage = (len(outliers) / len(data)) * 100 if len(data) else 0.0
    return len(outliers), percentage, lower_lim, upper_lim

In [None]:
for col in numerical_features:
    values = df[col]
    print(f" Statistics for column: {col}")

    # Key statistics of the column, printed one per line
    summary_lines = [
        f'  Mean: {values.mean():.2f}',
        f'  Median: {values.median():.2f}',
        f'  Standard Deviation: {values.std():.2f}',
        f'  Min: {values.min()}',
        f'  Max: {values.max()}',
        f'  Skewness: {values.skew():.2f}',
        f'  Kurtosis: {values.kurt():.2f}',
    ]
    print('\n'.join(summary_lines))
    print('-' * 50 )

    # One figure per variable: histogram on the left, boxplot on the right
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.histplot(values, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    sns.boxplot(y=values, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    # Outliers of the variable according to analyze_outliers
    outlier_count, outlier_percentage, lower_lim, upper_lim = analyze_outliers(values)

    print(f'Count of outliers: {outlier_count}')
    print(f'Percentage of outliers: {outlier_percentage:.2f}%')
    print(f'Lower Lim:{lower_lim}')
    print(f'Upper Lim:{upper_lim}')
    print('-' * 40)

    plt.tight_layout()
    plt.show()

In [None]:
hr_dow_cui=['CUI_American', 'CUI_Asian', 'CUI_Beverages','CUI_Cafe', 'CUI_Chicken Dishes', 
            'CUI_Chinese', 'CUI_Desserts','CUI_Healthy', 'CUI_Indian', 'CUI_Italian', 
            'CUI_Japanese','CUI_Noodle Dishes', 'CUI_OTHER', 'CUI_Street Food / Snacks','CUI_Thai', 
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday', 'Saturday', 
            'HR_0', 'HR_1', 'HR_2', 'HR_3', 'HR_4', 'HR_5','HR_6', 'HR_7', 'HR_8', 'HR_9', 'HR_10', 
            'HR_11', 'HR_12', 'HR_13','HR_14', 'HR_15', 'HR_16', 'HR_17', 'HR_18', 'HR_19', 'HR_20', 'HR_21','HR_22', 'HR_23']

for col in hr_dow_cui:
    print(f" Statistics for column: {col}")
    
    # Calculate key statistics
    mean = df[col].mean()
    median = df[col].median()
    std_dev = df[col].std()
    min_val = df[col].min()
    max_val = df[col].max()
    skewness = df[col].skew()
    kurtosis = df[col].kurt()

    # Display the statistics
    print(f'  Mean: {mean:.2f}')
    print(f'  Median: {median:.2f}')
    print(f'  Standard Deviation: {std_dev:.2f}')
    print(f'  Min: {min_val}')
    print(f'  Max: {max_val}')
    print(f'  Skewness: {skewness:.2f}')
    print(f'  Kurtosis: {kurtosis:.2f}')
    print('-' * 50 )
    
    # Visualization of each Variable:
    plt.figure(figsize=(12, 5))

    # Histogram
    plt.subplot(1, 2, 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Histogram of {col}')

    # Boxplot
    plt.subplot(1, 2, 2)
    sns.boxplot(y=df[col])
    plt.title(f'Boxplot of {col}')

    # Analyze outliers for the numerical variables
    outlier_count, outlier_percentage, lower_lim, upper_lim = analyze_outliers(df[col])
  
    print(f'Count of outliers: {outlier_count}')
    print(f'Percentage of outliers: {outlier_percentage:.2f}%')
    print(f'Lower Lim:{lower_lim}')
    print(f'Upper Lim:{upper_lim}')
    print('-' * 40)
   
    plt.tight_layout()
    plt.show()

In [None]:
plt.figure(figsize=(12, 6))
DOW_counts.plot(kind='bar', 
                color='lightsteelblue', 
                edgecolor='black')

plt.title('Number of Orders for Each Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Orders')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
HR_counts.plot(kind='bar', 
               color='lightsteelblue', 
               edgecolor='black')

plt.title('Number of Orders for Each Hour')
plt.xlabel('Hours')
plt.ylabel('Number of Orders')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart with the total number of orders in each time period
plt.figure(figsize=(12, 6))
time_counts.plot(kind='bar', 
                 color='lightsteelblue', 
                 edgecolor='black')

plt.title('Number of Orders for Time Period')
plt.xlabel('Time Period')
plt.ylabel('Number of Orders')  # fixed the 'Ordes' typo shown on the y-axis
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
CUI_counts_sorted = CUI_counts.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
CUI_counts_sorted.plot(kind='bar', 
                       color='lightsteelblue', 
                       edgecolor='black')

plt.title('Expenses for each Type of Cuisine')
plt.xlabel('Type of Cuisine')
plt.ylabel('Expenses')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Pairwise relationships between Numerical Variables


In [None]:
sns.pairplot(df, vars= numerical_features)


In [None]:
sns.regplot(x=df['vendor_count'], y=df['product_count'])
plt.show()


## Categorical Features

In [None]:
for col in categorical_features:
    print(f"Analysis for categorical column: {col}")
    
    # Calculate frequency counts
    freq_counts = df[col].value_counts()
    
    # Display the frequency counts
    print("Frequency counts:")
    print(freq_counts)
    print("-"*50 )
    
    # Visualization for categorical variables:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=df[col], palette='viridis')
    plt.title(f'Count Plot of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

In [None]:
plt.figure(figsize=(8, 6))
payment_method_pie = df['payment_method'].value_counts() \
    .plot (kind='pie', 
           title='Payment Method', 
           autopct='%1.1f%%',
           colors=sns.color_palette("viridis"),
           figsize=(5, 4), 
           ylabel="")

In [None]:
threshold = 0.05 * df['customer_region'].value_counts().sum()

customer_region_counts = df['customer_region'].value_counts()

others = customer_region_counts[customer_region_counts < threshold].sum()

customer_region_counts = customer_region_counts[customer_region_counts >= threshold]
customer_region_counts['Others'] = others

plt.figure(figsize=(8, 6))
customer_region = customer_region_counts \
    .plot (kind='pie', 
           title='Customer Region', 
           autopct='%1.1f%%',
           colors=sns.color_palette("viridis"),
           figsize=(5, 4), 
           ylabel="")


In [None]:
plt.figure(figsize=(8, 6))

#promo_df = df['last_promo'].apply(lambda x: 'NO PROMO' if(x=='-') else x  )
promo_category = df['last_promo'].value_counts() \
    .plot (kind='pie', 
           title='Last Category of promotion or discount', 
           autopct='%1.1f%%',
           colors=sns.color_palette("viridis"),
           figsize=(5, 4), 
           ylabel="")


# 4. Feature Relationships

## New Features

Recency

In [None]:
# Determine the maximum number of days (most recent day in dataset)
max_days = df['last_order'].max()

# Calculate recency
df['recency'] = max_days - df['last_order']

Frequency

In [None]:
# Calculate active period
df['active_period'] = df['last_order'] - df['first_order'] + 1

# Calculate frequency
df['frequency'] = df['Sum_of_Orders'] / df['active_period']

RFM 
- Recency = 'recency'
- Frequency = 'frequency'
- Monetary = 'total_spend'

####
We decided to create a new variable called cuisine_diversity to measure the variety of cuisines each customer orders from. This variable will help us analyze which age groups or regions tend to explore a wider range of cuisines, indicating openness to new experiences. Conversely, it will allow us to identify customers who stick to fewer options, showing a strong preference for specific types of cuisine

In [None]:
# Cuisine diversity (number of different cuisines ordered)
df['cuisine_diversity'] = (df[CUI_columns] > 0).sum(axis=1)

In [None]:
col='cuisine_diversity'
# Calculate key statistics
mean = df[col].mean()
median = df[col].median()
std_dev = df[col].std()
min_val = df[col].min()
max_val = df[col].max()
skewness = df[col].skew()
kurtosis = df[col].kurt()

# Display the statistics
print(f'  Mean: {mean:.2f}')
print(f'  Median: {median:.2f}')
print(f'  Standard Deviation: {std_dev:.2f}')
print(f'  Min: {min_val}')
print(f'  Max: {max_val}')
print(f'  Skewness: {skewness:.2f}')
print(f'  Kurtosis: {kurtosis:.2f}')
print('-' * 50 )
    
# Visualization of each Variable
plt.figure(figsize=(12, 5))

# Histogram
plt.subplot(1, 2, 1)
sns.histplot(df[col], bins=10)
plt.title(f'Histogram of {col}')

# Boxplot
plt.subplot(1, 2, 2)
sns.boxplot(y=df[col])
plt.title(f'Boxplot of {col}')

# Analyze outliers for the specified numerical variables
outlier_count, outlier_percentage, lower_lim, upper_lim = analyze_outliers(df[col])
  
print(f'Count of outliers: {outlier_count}')
print(f'Percentage of outliers: {outlier_percentage:.2f}%')
print(f'Lower Lim:{lower_lim}')
print(f'Upper Lim:{upper_lim}')
print('-' * 40)
   
plt.tight_layout()
plt.show()

####
We decided to create a new variable called total_spend to measure the overall spending of customers across different age groups and regions. This variable will help us analyze which age groups and regions have a higher capacity and willingness to spend on food orders through the app.

In [None]:
# Total spend per customer
df['total_spend'] = df[CUI_columns].sum(axis=1)

In [None]:
col='total_spend'
# Calculate key statistics
mean = df[col].mean()
median = df[col].median()
std_dev = df[col].std()
min_val = df[col].min()
max_val = df[col].max()
skewness = df[col].skew()
kurtosis = df[col].kurt()

# Display the statistics
print(f'  Mean: {mean:.2f}')
print(f'  Median: {median:.2f}')
print(f'  Standard Deviation: {std_dev:.2f}')
print(f'  Min: {min_val}')
print(f'  Max: {max_val}')
print(f'  Skewness: {skewness:.2f}')
print(f'  Kurtosis: {kurtosis:.2f}')
print('-' * 50 )
    
# Visualization of each Variable
plt.figure(figsize=(12, 5))

# Histogram
plt.subplot(1, 2, 1)
sns.histplot(df[col], bins=10, log=True)
plt.ylabel('Log10(Count)')
plt.title(f'Histogram of {col}')

# Boxplot
plt.subplot(1, 2, 2)
sns.boxplot(y=df[col])
plt.title(f'Boxplot of {col}')

# Analyze outliers for the specified numerical variables
outlier_count, outlier_percentage, lower_lim, upper_lim = analyze_outliers(df[col])
  
print(f'Count of outliers: {outlier_count}')
print(f'Percentage of outliers: {outlier_percentage:.2f}%')
print(f'Lower Lim:{lower_lim}')
print(f'Upper Lim:{upper_lim}')
print('-' * 40)
   
plt.tight_layout()
plt.show()


Analyze the Sum_of_Orders variable, created earlier, which indicates the total number of orders per customer.

In [None]:
col='Sum_of_Orders'
# Calculate key statistics
mean = df[col].mean()
median = df[col].median()
std_dev = df[col].std()
min_val = df[col].min()
max_val = df[col].max()
skewness = df[col].skew()
kurtosis = df[col].kurt()

# Display the statistics
print(f'  Mean: {mean:.2f}')
print(f'  Median: {median:.2f}')
print(f'  Standard Deviation: {std_dev:.2f}')
print(f'  Min: {min_val}')
print(f'  Max: {max_val}')
print(f'  Skewness: {skewness:.2f}')
print(f'  Kurtosis: {kurtosis:.2f}')
print('-' * 50 )
    
# Visualization of each Variable
plt.figure(figsize=(12, 5))

# Histogram
plt.subplot(1, 2, 1)
sns.histplot(df[col], bins=10, log=True)
plt.ylabel('Log10(Count)')
plt.title(f'Histogram of {col}')

# Boxplot
plt.subplot(1, 2, 2)
sns.boxplot(y=df[col])
plt.title(f'Boxplot of {col}')

# Analyze outliers for the specified numerical variables
outlier_count, outlier_percentage, lower_lim, upper_lim = analyze_outliers(df[col])
  
print(f'Count of outliers: {outlier_count}')
print(f'Percentage of outliers: {outlier_percentage:.2f}%')
print(f'Lower Lim:{lower_lim}')
print(f'Upper Lim:{upper_lim}')
print('-' * 40)
   
plt.tight_layout()
plt.show()

City

Aggregate Regions by the First Digit, which indicates the City

In [None]:
# Creating new Feature customer_city dividied into categories 
def categorize_city(customer_region):
    """Map a region code to its city, given by the first digit of the code.

    Parameters
    ----------
    customer_region : str, number or NaN
        Region code of the customer (e.g. "2360").

    Returns
    -------
    str or float
        The first character of the code when it is a digit, "Other" when the
        code is empty or does not start with a digit (e.g. "-"), and NaN when
        the region itself is missing.
    """
    if pd.isna(customer_region):  # Missing region -> keep it missing
        return np.nan

    # str() lets numeric codes be handled like text codes instead of raising
    region_code = str(customer_region)

    # Slicing (instead of indexing) cannot raise on an empty string
    first_char = region_code[:1]
    return first_char if first_char.isdigit() else "Other"


# Apply the function to create the new 'customer_city' column
df['customer_city'] = df['customer_region'].apply(categorize_city)

print(df['customer_city'].value_counts(dropna=False))

In [None]:
# Create the count plot
plt.figure(figsize=(8,6))
sns.countplot(x='customer_city', data=df)

# Add titles and labels
plt.title('City', fontsize=14)
plt.xlabel('customer_city', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Show the plot
plt.show()

Create new variables: Weekdays and Weekends

In [None]:
df['Weekdays'] = df[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']].sum(axis=1)
df['Weekends'] = df[['Saturday', 'Sunday']].sum(axis=1)

## Correlations and Heatmaps

In [None]:
df_corr = df[['customer_age', 
              'vendor_count',
               'product_count', 
              'is_chain', 
              'first_order', 
              'last_order'
             ]].corr()
df_corr

In [None]:
sns.heatmap(df_corr, 
            annot=True, 
            cmap='PiYG')

plt.title('Correlation Heatmap between numerical variables', fontsize=12)

- product_count and vendor_count have a very high correlation (0,83)
- product_count and is_chain have a very high correlation (0,83)

In [None]:
df_corr_all=df[['customer_age', 'vendor_count',
       'product_count', 'first_order', 'last_order', 'CUI_American', 'CUI_Asian', 'CUI_Beverages',
       'CUI_Cafe', 'CUI_Chicken Dishes', 'CUI_Chinese', 'CUI_Desserts',
       'CUI_Healthy', 'CUI_Indian', 'CUI_Italian', 'CUI_Japanese',
       'CUI_Noodle Dishes', 'CUI_OTHER', 'CUI_Street Food / Snacks',
       'CUI_Thai', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
       'Saturday', 'Sunday', 'HR_0', 'HR_1', 'HR_2', 'HR_3', 'HR_4', 'HR_5',
       'HR_6', 'HR_7', 'HR_8', 'HR_9', 'HR_10', 'HR_11', 'HR_12', 'HR_13',
       'HR_14', 'HR_15', 'HR_16', 'HR_17', 'HR_18', 'HR_19', 'HR_20', 'HR_21',
       'HR_22', 'HR_23']].corr()

plt.figure(figsize=(20, 20)) 


sns.heatmap(df_corr_all, annot=True,
            linewidths=0.7, vmin=-1, vmax=1, square=True,
            cbar_kws={'shrink': 0.75, 'aspect': 30}, 
            annot_kws={'size': 6 },  
            cmap='PiYG')  

plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

plt.title('Correlation Heatmap with all numeric variables', fontsize=16, weight='bold')

plt.show()

In [None]:
# Create an empty DataFrame (all NaN at this point) with one row per day of the week and one column per hour
heatmap_data = pd.DataFrame(index=dow_columns, columns=hr_columns)

# Fill in one row per day.
# NOTE: the dataset only has per-customer totals by day and by hour, so each row is the sum of ALL hourly
# orders of the customers who ordered at least once on that day, not the orders placed on that day at that hour
for day in dow_columns:
    # Keep the customers with at least one order on the current day and add up their hourly columns
    heatmap_data.loc[day] = df.loc[df[day]  > 0, hr_columns].sum().fillna(0)

# Convert all data to numeric (float); the frame was created without a numeric dtype
heatmap_data = heatmap_data.astype(float)

# Plot the heatmap (days as rows, hours as columns, values shown without decimals)
plt.figure(figsize=(30, 15))
sns.heatmap(heatmap_data, 
            cmap='PiYG', 
            linewidths=1, 
            annot=True, 
            square=True, 
            fmt='.0f')
plt.title('Heatmap of Hourly Activity Throughout the Week')
plt.xlabel('Hour of the Day (0-23)')
plt.ylabel('Day of the Week')
plt.show()


In [None]:
# Create an empty DataFrame (all NaN at this point) with one row per day of the week and one column per time period
heatmap_data = pd.DataFrame(index=dow_columns, columns=time_columns)

# Fill in one row per day.
# NOTE: each row is the sum of ALL time-period orders of the customers who ordered at least once on that day,
# not only the orders placed on that day
for day in dow_columns:
    # Keep the customers with at least one order on the current day and add up their time-period columns
    heatmap_data.loc[day] = df.loc[df[day] > 0, time_columns].sum().fillna(0)

# Convert all data to numeric (float); the frame was created without a numeric dtype
heatmap_data = heatmap_data.astype(float)

# Plot the heatmap (days as rows, time periods as columns, values shown without decimals)
plt.figure(figsize=(6, 6))
sns.heatmap(heatmap_data, 
            cmap='PiYG', 
            linewidths=0.5, 
            annot=True, 
            square=True,
            annot_kws={'size': 10 }, 
            fmt='.0f')
plt.title('Heatmap of Period of Time Activity Throughout the Week')
plt.xlabel('Period of Time')
plt.ylabel('Day of the Week')
plt.show()

In [None]:
# Total amount spent on each cuisine by the customers of each age group.
# observed=False is the behaviour the call already had (all categories of the
# categorical age_group are kept); stating it explicitly avoids the pandas
# FutureWarning about the changing default.
cuisine_by_age = df.groupby('age_group', observed=False)[CUI_columns].sum()

plt.figure(figsize=(12, 8))
sns.heatmap(cuisine_by_age, 
            annot=True,
            annot_kws={'size': 10 }, 
            cmap='PiYG', 
            fmt='.0f', 
            square=True)

# The values are sums, not averages, so the title says "Total"
plt.title('Total Spend on each Cuisine by Age Group', fontsize=16)
plt.xlabel('Cuisine Type', fontsize=12)
plt.ylabel('Age Group', fontsize=12)


plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()


In [None]:
# Group by region and sum the DOW
cuisine_by_region = df.groupby('customer_region')[dow_columns].sum()  
plt.figure(figsize=(10, 6))
sns.heatmap(cuisine_by_region, 
            annot=True,
            annot_kws={'size': 10}, 
            cmap='PiYG', 
            fmt='.0f', 
            square=True)

plt.title('DOW Activity by Region', fontsize=16)
plt.xlabel('DOW', fontsize=12)  
plt.ylabel('Region', fontsize=12)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Multivariate Analysis

Payment Method

In [None]:
# Number of customers (rows) for each combination of age group and payment method.
# observed=False is the behaviour the call already had for the categorical
# age_group; stating it explicitly avoids the pandas FutureWarning.
payment_counts = df.groupby(['age_group', 'payment_method'], observed=False).size().unstack(fill_value=0)

colors=['#87CEEB','#00BFFF','#4682B4']
payment_counts.plot(kind='bar', 
                    stacked=False, 
                    figsize=(10, 6), 
                    color=colors, 
                    edgecolor='black') 

plt.title('Payment Methods by Age Group')
plt.xlabel('Age Group')
# .size() counts rows, and each row is one customer, so the axis shows customers, not orders
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)  
plt.legend(title='Payment Method')
plt.tight_layout()  
plt.show()

In [None]:
# Number of customers (rows) for each combination of region and payment method
payment_counts_region = df.groupby(['customer_region', 'payment_method']).size().unstack(fill_value=0)

colors=['#87CEEB','#00BFFF','#4682B4']
payment_counts_region.plot(kind='bar', 
                    stacked=False, 
                    figsize=(10, 6), 
                    color=colors, 
                    edgecolor='black') 

plt.title('Payment Methods by Region')
plt.xlabel('Region')
# .size() counts rows, and each row is one customer, so the axis shows customers, not orders
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)  
plt.legend(title='Payment Method')
plt.tight_layout()  
plt.show()

Age Group

In [None]:
# Grouping data by region and age group
age_counts = df.groupby(['customer_region', 'age_group']).size().reset_index(name='count')

# Creating a pivot table
pivot_age_counts = age_counts.pivot(index='customer_region', columns='age_group', values='count').fillna(0)

# Plotting the stacked bar chart
pivot_age_counts.plot(kind='bar', 
                      stacked=True, 
                      figsize=(10, 6), 
                      color=plt.cm.tab20.colors) # Ensuring the color palette is correct

plt.title('Distribution of Age Groups by Region')
plt.xlabel('Region')
plt.ylabel('Number of Customers')
plt.xticks(rotation=45)  
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')  
plt.tight_layout()  
plt.show()

In [None]:
# Long-format tally: one row per (city, age group) pair with the number of
# matching rows in df.
age_counts_city = (
    df.groupby(['customer_city', 'age_group'])
    .size()
    .reset_index(name='count')
)

# Wide format (cities as rows, age groups as columns); missing pairs become 0.
age_counts_city_pivot = age_counts_city.pivot(
    index='customer_city',
    columns='age_group',
    values='count',
).fillna(0)

# Stacked bars: one bar per city, one segment per age group.
ax = age_counts_city_pivot.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=plt.cm.tab20.colors,
)
ax.set_title('Distribution of Age Groups by City')
ax.set_xlabel('City')
ax.set_ylabel('Number of Customers')
plt.xticks(rotation=45)
# Legend is anchored outside the axes so it does not cover the bars.
ax.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

Cuisine Type

In [None]:
# Total amount spent on every cuisine column, per region.
cuisine_counts = df.groupby('customer_region')[CUI_columns].sum().reset_index()

# Long format: one row per (region, cuisine) with its total expenditure.
cuisine_counts_ = cuisine_counts.melt(
    id_vars='customer_region',
    value_vars=CUI_columns,
    var_name='cuisine_type',
    value_name='total_expenditure',
)

# Keep only the three highest-spend cuisines inside each region.
top_cuisines = (
    cuisine_counts_
    .groupby('customer_region')
    .apply(lambda region_rows: region_rows.nlargest(3, 'total_expenditure'))
    .reset_index(drop=True)
)

# Back to wide format; a cuisine that is not in a region's top 3 shows as 0.
top_cuisine_counts = top_cuisines.pivot(
    index='customer_region',
    columns='cuisine_type',
    values='total_expenditure',
).fillna(0)

# Sample as many colours from the Paired colormap as there are cuisine columns.
num_cuisines = len(top_cuisine_counts.columns)
colors = plt.cm.Paired(np.linspace(0, 1, num_cuisines))

ax = top_cuisine_counts.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=colors,
)
ax.set_title('Top 3 Cuisine Type by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Total Expenditure')
plt.xticks(rotation=45)
ax.legend(title='Type of Cuisine', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Total amount spent on every cuisine column, per city.
cuisine_counts = df.groupby('customer_city')[CUI_columns].sum().reset_index()

# Long format: one row per (city, cuisine) with its total expenditure.
cuisine_counts_1 = cuisine_counts.melt(
    id_vars='customer_city',
    value_vars=CUI_columns,
    var_name='cuisine_type',
    value_name='total_expenditure',
)

# Keep only the three highest-spend cuisines inside each city.
top_cuisines = (
    cuisine_counts_1
    .groupby('customer_city')
    .apply(lambda city_rows: city_rows.nlargest(3, 'total_expenditure'))
    .reset_index(drop=True)
)

# Back to wide format; a cuisine that is not in a city's top 3 shows as 0.
top_cuisine_counts = top_cuisines.pivot(
    index='customer_city',
    columns='cuisine_type',
    values='total_expenditure',
).fillna(0)

# Sample as many colours from the Paired colormap as there are cuisine columns.
num_cuisines = len(top_cuisine_counts.columns)
colors = plt.cm.Paired(np.linspace(0, 1, num_cuisines))

ax = top_cuisine_counts.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=colors,
)
ax.set_title('Top 3 Cuisine Type by City')
ax.set_xlabel('City')
ax.set_ylabel('Total Expenditure')
plt.xticks(rotation=45)
ax.legend(title='Type of Cuisine', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


Total Spending

In [None]:
# Sum total_spend per region and order the regions from highest to lowest
# spend so the bars are drawn in descending order.
region_spend = (
    df.groupby('customer_region')['total_spend']
    .sum()
    .reset_index()
    .sort_values(by='total_spend', ascending=False)
)

# Bar chart of the aggregated spend.
plt.figure(figsize=(10, 6))
sns.barplot(data=region_spend, x='customer_region', y='total_spend', palette="viridis")

plt.title("Total Spending per Region")
plt.xlabel("Customer Region")
plt.ylabel("Total Spend")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Sum total_spend per age group and order the groups from highest to lowest
# spend. NOTE: the variable keeps the name `region_spend` although it now
# holds per-age-group totals.
region_spend = (
    df.groupby('age_group')['total_spend']
    .sum()
    .reset_index()
    .sort_values(by='total_spend', ascending=False)
)

# Bar chart of the aggregated spend.
plt.figure(figsize=(10, 6))
sns.barplot(data=region_spend, x='age_group', y='total_spend', palette="viridis")

plt.title("Total Spending per Age Group")
plt.xlabel("Age Group")
plt.ylabel("Total Spend")
plt.xticks(rotation=45)
plt.show()

Cuisine Diversity

In [None]:
# Sum cuisine_diversity per region. NOTE: the variable keeps the name
# `region_spend` although it holds cuisine-diversity totals here.
region_spend = df.groupby('customer_region')['cuisine_diversity'].sum().reset_index()

# Order regions from highest to lowest total so the bars are descending.
region_spend = region_spend.sort_values(by='cuisine_diversity', ascending=False)

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='customer_region', y='cuisine_diversity', data=region_spend, palette="viridis")

plt.title("Cuisine Diversity per Region")
plt.xlabel("Customer Region")
# Axis label typo fixed ("Diveristy" -> "Diversity").
plt.ylabel("Cuisine Diversity")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Sum cuisine_diversity per age group and order the groups from highest to
# lowest total. NOTE: the variable keeps the name `region_spend` although it
# holds per-age-group cuisine-diversity totals here.
region_spend = (
    df.groupby('age_group')['cuisine_diversity']
    .sum()
    .reset_index()
    .sort_values(by='cuisine_diversity', ascending=False)
)

# Bar chart of the aggregated values.
plt.figure(figsize=(10, 6))
sns.barplot(data=region_spend, x='age_group', y='cuisine_diversity', palette="viridis")

plt.title("Cuisine Diversity per Age Group")
plt.xlabel("Age Group")
plt.ylabel("Cuisine Diversity")
plt.xticks(rotation=45)
plt.show()

Analysis with Three Variables (Region, Age, and Number of Orders)

In [None]:
# Row-wise total of the seven weekday order columns.
df['sum_of_orders'] = df[
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
].sum(axis=1)

# Total orders for every (region, age) combination, ordered by region and
# then by age.
grouped = (
    df.groupby(['customer_region', 'customer_age'])['sum_of_orders']
    .sum()
    .reset_index()
    .sort_values(['customer_region', 'customer_age'])
)

In [None]:
# Summary statistics per region, computed over the (region, age) rows of
# `grouped`: the age range/mean and the distribution of order totals.
regional_stats = (
    grouped
    .groupby('customer_region')
    .agg({
        'customer_age': ['mean', 'min', 'max'],
        'sum_of_orders': ['mean', 'min', 'max', 'sum'],
    })
    .reset_index()
)

print(regional_stats)


In [None]:
# Regions to draw, in order of first appearance in `grouped`.
regions = grouped['customer_region'].unique()

# Create a high-contrast color palette: samples from Set1 followed by a few
# hand-picked colours. plt.get_cmap is used instead of plt.cm.get_cmap, which
# was deprecated in Matplotlib 3.7 and removed in 3.9.
num_regions = len(regions)
base_colors = plt.get_cmap('Set1')(np.linspace(0, 1, num_regions))
custom_colors = ['#FF1493', '#00FFFF', '#FFD700', '#32CD32', '#FF4500', '#8A2BE2', '#00CED1']
color_palette = list(base_colors) + custom_colors

plt.figure(figsize=(10, 6))

# One scatter series per region: age on x, total orders for that age on y.
for i, region in enumerate(regions):
    region_data = grouped[grouped['customer_region'] == region]
    plt.scatter(region_data['customer_age'], region_data['sum_of_orders'],
                label=region, alpha=0.6, color=color_palette[i])

plt.xlabel('Customer Age', fontsize=12)
plt.ylabel('Sum of Orders', fontsize=12)
plt.title('Customer Age vs Sum of Orders by Region', fontsize=16)
# Legend is placed to the right of the axes so it does not hide points.
plt.legend(title='Region', title_fontsize='12', fontsize='10', loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# my_colors = plt.get_cmap('tab20').colors  

# cuisine_cols = all_cols = [col for col in df.columns if col.startswith('CUI_')]
# all_cols.append("customer_region")
# all_cols.append("age_group")
# group_by_ageG_region_CUI =  df[all_cols].groupby(['customer_region','age_group']).sum()

# group_by_ageG_region_CUI = (
#     group_by_ageG_region_CUI
#     .apply(lambda x: x.nlargest(5),axis=1)
    
# )

# high_contrast_colors = [
#     "#000000",  # Black
#     "#FF0000",  # Red
#     "#00FFFF",  # Cyan
#     "#00FF00",  # Green
#     "#FF00FF",  # Magenta
#     "#0000FF",  # Blue
#     "#FFFF00",  # Yellow
#     "#FFA500",  # Orange
#     "#800080",  # Purple
#     "#008080",  # Teal
#     "#FF6347",  # Tomato (bright red-orange)
#     "#40E0D0",  # Turquoise
#     "#8B0000",  # Dark Red
#     "#808080",  # Gray
#     "#00008B",  # Dark Blue
#     "#ADFF2F"   # Green Yellow
# ]
# rows = math.ceil(len(df["customer_region"].unique())/3)

# fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows))
# axes = axes.flatten()  
# for i,region in enumerate(df["customer_region"].unique() ):
#     # Filter data for the current customer_region
#     region_data = group_by_ageG_region_CUI.loc[region]
    
#     # Plot
#     region_data.plot(kind='bar',ax=axes[i], figsize=(30, 20), width=1,color=high_contrast_colors)
#     axes[i].set_title(f'CUI by Age Group for Customer Region {region}')
#     axes[i].set_xlabel('Age Group')
#     axes[i].set_ylabel('Expenses')
#     axes[i].tick_params(axis='x', rotation=0)  # Set x-axis label rotation

#     axes[i].legend(title='CUI Type')
    
    
# # Show the plot
# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')
# plt.tight_layout()
# plt.show()

## Outliers

In [None]:
# All columns screened for outliers: the lists numerical_features, hr_columns,
# dow_columns and CUI_columns (defined earlier in the notebook) concatenated.
combined_list_numerical = numerical_features + hr_columns + dow_columns + CUI_columns 

In [None]:
# Compute the IQR fences for every numerical feature (nothing is removed
# here; the limits are only calculated and printed).
# First and third quartile of each column.
q1, q3 = (df[combined_list_numerical].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

# Fences: 1.5 * IQR below the first and above the third quartile.
lower_lim = q1 - 1.5 * iqr
upper_lim = q3 + 1.5 * iqr

# One aligned line per feature with its lower and upper fence.
for feature in combined_list_numerical:
    print(f"{feature:<25} Lower Limit:{lower_lim[feature]:>10}      Upper Limit:{upper_lim[feature]:>10}")

In [None]:
def identify_outliers(df, list, lower_lim, upper_lim):
    """Report per-feature outliers and rows that are outliers in every feature.

    A value is an outlier when it is strictly below the feature's lower limit
    or strictly above its upper limit. Missing values are never outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    list : iterable of str
        Names of the columns to check. (The name shadows the builtin; it is
        kept so existing positional and keyword callers keep working.)
        Names that are not columns of ``df`` or have no entry in both limit
        containers are skipped.
    lower_lim, upper_lim : mapping or pandas.Series
        Lower / upper limit per column name.

    Returns
    -------
    outliers : dict
        ``{column name: list of outlying values}`` for every checked column.
    obvious_outliers : list
        Index labels of the rows that are outliers in *all* checked columns.
        Empty when no column could be checked.
    """
    # Use the columns that were passed in (not a notebook-level global) and
    # keep only those that can actually be evaluated.
    metrics = [
        metric for metric in list
        if metric in df.columns and metric in lower_lim and metric in upper_lim
    ]

    outliers = {}
    outlier_masks = []

    for metric in metrics:
        column = df[metric]
        # Comparisons with NaN are False, notna() makes that explicit.
        is_outlier = column.notna() & (
            (column < lower_lim[metric]) | (column > upper_lim[metric])
        )
        outliers[metric] = column[is_outlier].tolist()
        outlier_masks.append(is_outlier.to_numpy(dtype=bool))

        print(f"Total outliers in {metric}: {len(outliers[metric])}")

    # Rows flagged in every checked feature (Obvious Outliers). With nothing
    # to check there is no evidence, so no row is reported.
    if outlier_masks:
        in_all_features = np.logical_and.reduce(outlier_masks)
        obvious_outliers = df.index[in_all_features].tolist()
    else:
        obvious_outliers = []

    print("-----------------------------")
    print(f"Total global outliers: {len(obvious_outliers)}")
    return outliers, obvious_outliers
    
    
# Run the outlier report on every numerical feature, using the limits
# computed above.
outliers, obvious_outliers = identify_outliers(df, combined_list_numerical, lower_lim, upper_lim)

In [None]:
# Same feature list without the two order-day columns.
# NOTE(review): this exclusion only has an effect if identify_outliers
# iterates over the list it receives - verify.
combined_list_numerical_ = list(
    filter(lambda col: col not in ('first_order', 'last_order'), combined_list_numerical)
)

outliers, obvious_outliers = identify_outliers(df, combined_list_numerical_, lower_lim, upper_lim)

In [None]:
# One boolean Series per feature: True where the value lies within the IQR
# fences. The fences themselves count as inside (inclusive='both'): only
# values strictly beyond a fence are outliers. With the previous
# inclusive='neither', a feature whose fences coincide (q1 == q3, so
# lower == upper) could never be satisfied and every row was rejected.
# NaN values are reported as outside by Series.between.
filters_iqr = [
    df[metric].between(lower_lim[metric], upper_lim[metric], inclusive='both')
    for metric in combined_list_numerical
]

# True only for observations that are inside the fences for every feature.
filters_iqr_all = pd.concat(filters_iqr, axis=1).all(axis=1)

In [None]:
# Display the list of per-feature boolean filters.
filters_iqr

In [None]:
# Display the combined filter (True = inside the limits for every feature).
filters_iqr_all  

In [None]:
# Number of observations with at least one feature outside the IQR limits,
# i.e. rows where the combined filter is False. (Comparing the filtered frame
# with the string 'False' never counted these rows.)
int((~filters_iqr_all).sum())

In [None]:
# Keep only the observations that are inside the IQR limits for every feature.
df_iqr = df[filters_iqr_all]
# Scale to a percentage first and round afterwards; rounding the ratio and
# then multiplying by 100 prints float artefacts such as 56.99999999999999.
print('Percentage of data kept after removing outliers:', np.round(100 * df_iqr.shape[0] / df.shape[0], decimals=0))

In [None]:
# Manually chosen upper bounds per feature. An observation is kept only if
# every listed feature is less than or equal to its bound.
manual_upper_bounds = {
    'customer_age': 50,  # TODO: justify the threshold of 50
    'vendor_count': 30,
    'product_count': 100,
    # Orders per hour of the day
    'HR_0': 9,
    'HR_1': 9,
    'HR_2': 10,
    'HR_3': 10,
    'HR_4': 10,
    'HR_5': 5,
    'HR_6': 8,
    'HR_7': 9,
    'HR_8': 12,
    'HR_9': 11,
    'HR_10': 17,
    'HR_11': 17,
    'HR_12': 17,
    'HR_13': 12,
    'HR_14': 11,
    'HR_15': 10,
    'HR_16': 14,
    'HR_17': 15,
    'HR_18': 20,
    'HR_19': 16,
    'HR_20': 15,
    'HR_21': 6,
    'HR_22': 7,
    'HR_23': 6,
    # Spend per cuisine
    'CUI_American': 150,
    'CUI_Asian': 400,
    'CUI_Beverages': 150,
    'CUI_Cafe': 150,
    'CUI_Chicken Dishes': 75,
    'CUI_Chinese': 200,
    'CUI_Desserts': 130,
    'CUI_Healthy': 150,
    'CUI_Indian': 150,
    'CUI_Italian': 200,
    'CUI_Japanese': 200,
    'CUI_Noodle Dishes': 100,
    'CUI_OTHER': 100,
    'CUI_Street Food / Snacks': 200,
    'CUI_Thai': 60,
    # Orders per day of the week
    'Sunday': 12,
    'Monday': 13,
    'Tuesday': 14,
    'Wednesday': 15,
    'Thursday': 16,
    'Friday': 15,
    'Saturday': 15,
}

# Start from "keep everything" and AND in one condition per feature.
filters_man = pd.Series(True, index=df.index)
for column_name, upper_bound in manual_upper_bounds.items():
    filters_man = filters_man & (df[column_name] <= upper_bound)

df_outliers = df[filters_man]


In [None]:
# Share of rows kept by the manual filter, as a percentage with 3 decimals.
# Scaling before rounding avoids float artefacts from multiplying an already
# rounded ratio by 100.
print('Percentage of data kept after removing outliers:', np.round(100 * df_outliers.shape[0] / df.shape[0], decimals=3))

## Encoding

In [None]:
# List the columns currently in df before encoding.
df.columns.values

In [None]:
# Remove the customer_region column from df (a new frame is bound to `df`).
df = df.drop('customer_region', axis=1)

In [None]:
# Work on a copy so the encoding steps below do not modify df.
df_ohc = df.copy()

In [None]:
# Categorical columns considered for encoding.
categorical_features_new = ['last_promo', 'payment_method', 'age_group', 'is_chain', 'customer_city']

In [None]:
# 'age_group' and 'last_promo' are excluded from one-hot encoding; every other
# categorical feature is one-hot encoded.
columns_to_remove = ['age_group', 'last_promo']
cf_for_ohc = list(filter(lambda col: col not in columns_to_remove, categorical_features_new))

# Dense output, no category dropped (one indicator column per category).
ohc = OneHotEncoder(sparse_output=False, drop=None)

# Learn the categories and encode the selected columns in one chained call.
ohc_features = ohc.fit(df_ohc[cf_for_ohc]).transform(df_ohc[cf_for_ohc])

# Wrap the encoded array in a DataFrame aligned with df_ohc's index and
# labelled with the encoder's generated feature names.
ohc_df = pd.DataFrame(
    ohc_features,
    columns=ohc.get_feature_names_out(cf_for_ohc),
    index=df_ohc.index,
)

# Replace the original categorical columns with their one-hot indicators.
remaining_columns = df_ohc.drop(columns=cf_for_ohc)
ohc_encoded = pd.concat([remaining_columns, ohc_df], axis=1)


In [None]:
# List the columns after one-hot encoding.
ohc_encoded.columns.values

In [None]:
# Preview the first rows of the one-hot encoded frame.
ohc_encoded.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Ordinal encoder with the age groups listed from youngest to oldest, so the
# encoded integers follow that order. Labels not in the list are encoded
# as -1.
ordinal_encoder = OrdinalEncoder(
    categories=[['Teenagers (15-19)', 'Young Adults (20-29)', 'Adults (30-49)', 'Middle-aged (50-64)', 'Seniors (65-80)']],
    handle_unknown='use_encoded_value',
    unknown_value=-1  # Use -1 for unknown categories
)

# Ensure the 'age_group' column exists
if 'age_group' not in df_ohc.columns:
    raise KeyError("The column 'age_group' is not in the DataFrame.")

# Fit and transform the data. Errors are allowed to propagate: catching and
# printing them would let the cell continue and fail below with an unrelated
# KeyError on the missing 'age_group_encoded' column.
df_ohc['age_group_encoded'] = ordinal_encoder.fit_transform(df_ohc[['age_group']])

# Verify the transformed column
print(df_ohc[['age_group', 'age_group_encoded']].head())

# Append the encoded column to the one-hot encoded frame
df_encoded = pd.concat([ohc_encoded, df_ohc[['age_group_encoded']]], axis=1)



In [None]:
# Map the PROMO feature to binary values. Labels without an entry in the
# mapping become NaN.
# NOTE(review): the first key is spelled with the digit zero ('N0 PROMO');
# confirm it matches the label actually stored in last_promo.
promo_mapping = {
    'N0 PROMO': 0,  # No promo maps to 0
    'DISCOUNT': 1,   # All promos map to 1
    'DELIVERY': 1,
    'FREEBIE': 1
}

# Apply the mapping to create the binary encoded column
df_ohc['last_promo'] = df_ohc['last_promo'].map(promo_mapping)

# Overwrite the raw 'last_promo' column of the encoded dataset in place.
# Re-concatenating onto ohc_encoded would discard 'age_group_encoded' and
# leave two columns named 'last_promo' (raw and mapped).
df_encoded['last_promo'] = df_ohc['last_promo']


In [None]:
# Preview the first rows of the encoded dataset.
df_encoded.head()

In [None]:
# DataFrame.drop returns a new frame, so the result is assigned back;
# otherwise the raw 'age_group' column would stay in df_encoded.
df_encoded = df_encoded.drop(columns=['age_group'])
df_encoded