In [1]:
import pandas as pd
import json

# Load the order_data.csv file
try:
    order_data = pd.read_csv('order_data.csv')
except FileNotFoundError:
    print("Error: 'order_data.csv' not found. Please make sure the file is in the same directory.")
    exit()

# Function to parse the JSON and extract a list of clean item dictionaries
def extract_clean_items(orders_json_string):
    if pd.isna(orders_json_string):
        return []
    try:
        data = json.loads(orders_json_string)
        clean_items = []
        for order in data.get('orders', []):
            for item in order.get('item_details', []):
                # Filter out items with a price of 0
                if item.get('item_price', 0) > 0:
                    clean_items.append({
                        'item name': item.get('item_name'),
                        'item price': item.get('item_price'),
                        'item qty': item.get('item_quantity')
                    })
        return clean_items
    except (json.JSONDecodeError, KeyError):
        return []

# Apply the function to the 'ORDERS' column
order_data['clean_items_list'] = order_data['ORDERS'].apply(extract_clean_items)

# --- Expanding the item list into new columns ---
# We will use a more memory-efficient approach by creating a list of new columns to add
max_items = order_data['clean_items_list'].str.len().max()
new_columns = {}

for i in range(int(max_items)):
    item_prefix = f'item {i+1}'
    new_columns[f'{item_prefix} name'] = order_data['clean_items_list'].apply(lambda x: x[i]['item name'] if i < len(x) else None)
    new_columns[f'{item_prefix} price'] = order_data['clean_items_list'].apply(lambda x: x[i]['item price'] if i < len(x) else None)
    new_columns[f'{item_prefix} qty'] = order_data['clean_items_list'].apply(lambda x: x[i]['item qty'] if i < len(x) else None)

# Add the new columns to the original DataFrame
for col_name, col_data in new_columns.items():
    order_data[col_name] = col_data

# Drop the temporary 'clean_items_list' column and the original 'ORDERS' JSON column
order_data = order_data.drop(columns=['clean_items_list', 'ORDERS'])

print("DataFrame with new columns added:")
print(order_data.head())

DataFrame with new columns added:
   CUSTOMER_ID  STORE_NUMBER ORDER_CREATED_DATE    ORDER_ID  \
0    362204699          2156         2024-07-24  7247194287   
1    269612955          1419         2025-02-15   791214421   
2    585330633          2249         2025-02-15  7575285208   
3    950661333          2513         2024-03-29  4253875716   
4    434985772          1754         2024-04-08  7150407872   

  ORDER_CHANNEL_NAME ORDER_SUBCHANNEL_NAME ORDER_OCCASION_NAME  \
0            Digital                   WWT                ToGo   
1            Digital                   WWT                ToGo   
2            Digital                   WWT                ToGo   
3            Digital                   WWT                ToGo   
4            Digital                   WWT                ToGo   

                 item 1 name  item 1 price  item 1 qty  ... item 12 qty  \
0  10 pc Grilled Wings Combo         15.29         1.0  ...         NaN   
1        Ranch Dip - Regular          1.

In [2]:
# Save the final DataFrame to a new CSV file
# The 'index=False' argument prevents pandas from writing the DataFrame index to the CSV
order_data.to_csv('order_data_with_items.csv', index=False)

print("New CSV file 'order_data_with_items.csv' has been created successfully.")

New CSV file 'order_data_with_items.csv' has been created successfully.


In [3]:
import pandas as pd

# Load the two CSV files into pandas DataFrames
try:
    order_data_with_items = pd.read_csv('order_data_with_items.csv')
    store_data = pd.read_csv('store_data.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure both files are in the same directory.")
    exit()

# Perform a left merge on the 'STORE_NUMBER' column
# 'how='left'' ensures that all orders are kept, regardless of whether a store match is found
merged_data = pd.merge(order_data_with_items, store_data, on='STORE_NUMBER', how='left')

# Display the first 5 rows of the new merged DataFrame to verify the result
print("Merged DataFrame with new store details:")
print(merged_data.head())

  order_data_with_items = pd.read_csv('order_data_with_items.csv')


Merged DataFrame with new store details:
   CUSTOMER_ID  STORE_NUMBER ORDER_CREATED_DATE    ORDER_ID  \
0    362204699          2156         2024-07-24  7247194287   
1    269612955          1419         2025-02-15   791214421   
2    585330633          2249         2025-02-15  7575285208   
3    950661333          2513         2024-03-29  4253875716   
4    434985772          1754         2024-04-08  7150407872   

  ORDER_CHANNEL_NAME ORDER_SUBCHANNEL_NAME ORDER_OCCASION_NAME  \
0            Digital                   WWT                ToGo   
1            Digital                   WWT                ToGo   
2            Digital                   WWT                ToGo   
3            Digital                   WWT                ToGo   
4            Digital                   WWT                ToGo   

                 item 1 name  item 1 price  item 1 qty  ... item 13 qty  \
0  10 pc Grilled Wings Combo         15.29         1.0  ...         NaN   
1        Ranch Dip - Regular     

In [4]:
# Save the new DataFrame to a CSV file.
# The 'index=False' argument prevents pandas from writing the DataFrame index as a column.
merged_data.to_csv('merged_order_store_data.csv', index=False)

print("\nNew CSV file 'merged_order_store_data.csv' has been created successfully.")


New CSV file 'merged_order_store_data.csv' has been created successfully.


In [5]:
import pandas as pd

# Load the merged order-store data and the customer data
try:
    merged_order_store_data = pd.read_csv('merged_order_store_data.csv')
    customer_data = pd.read_csv('customer_data.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure both files are in the same directory.")
    exit()

# Perform a left merge on the 'CUSTOMER_ID' column
# This keeps all rows from the order data and adds the 'CUSTOMER_TYPE'
final_merged_data = pd.merge(merged_order_store_data, customer_data, on='CUSTOMER_ID', how='left')

# Display the first 5 rows to verify the new 'CUSTOMER_TYPE' column has been added
print("Final Merged DataFrame with customer details:")
print(final_merged_data.head())

  merged_order_store_data = pd.read_csv('merged_order_store_data.csv')


Final Merged DataFrame with customer details:
   CUSTOMER_ID  STORE_NUMBER ORDER_CREATED_DATE    ORDER_ID  \
0    362204699          2156         2024-07-24  7247194287   
1    269612955          1419         2025-02-15   791214421   
2    585330633          2249         2025-02-15  7575285208   
3    950661333          2513         2024-03-29  4253875716   
4    434985772          1754         2024-04-08  7150407872   

  ORDER_CHANNEL_NAME ORDER_SUBCHANNEL_NAME ORDER_OCCASION_NAME  \
0            Digital                   WWT                ToGo   
1            Digital                   WWT                ToGo   
2            Digital                   WWT                ToGo   
3            Digital                   WWT                ToGo   
4            Digital                   WWT                ToGo   

                 item 1 name  item 1 price  item 1 qty  ... item 14 name  \
0  10 pc Grilled Wings Combo         15.29         1.0  ...          NaN   
1        Ranch Dip - Regul

In [6]:
# Save the final DataFrame to a CSV file.
final_merged_data.to_csv('final_merged_data.csv', index=False)

print("\nNew CSV file 'final_merged_data.csv' has been created successfully.")


New CSV file 'final_merged_data.csv' has been created successfully.


In [7]:
import pandas as pd

# Load the final merged data file
try:
    final_merged_data = pd.read_csv('final_merged_data.csv')
except FileNotFoundError:
    print("Error: 'final_merged_data.csv' not found. Please ensure the file is in the same directory.")
    exit()

# Find all columns that contain 'item name'
item_name_columns = [col for col in final_merged_data.columns if 'item name' in col]

# Use a set to automatically handle duplicates
all_unique_items = set()

# Iterate through the identified columns and add items to the set
for col in item_name_columns:
    # Use dropna() to ignore any empty cells
    unique_items_in_col = final_merged_data[col].dropna().unique()
    all_unique_items.update(unique_items_in_col)

# Convert the set of unique items to a list
list_of_items = sorted(list(all_unique_items))

print(f"Found {len(list_of_items)} distinct items.")
print("Sample of distinct items:", list_of_items[:10])

  final_merged_data = pd.read_csv('final_merged_data.csv')


Found 0 distinct items.
Sample of distinct items: []


In [8]:
import pandas as pd

# Load the final merged data file
try:
    final_merged_data = pd.read_csv('final_merged_data.csv')
except FileNotFoundError:
    print("Error: 'final_merged_data.csv' not found. Please ensure the file is in the same directory.")
    exit()

# Print all column names to inspect them
print("All columns in your DataFrame:")
print(final_merged_data.columns.tolist())

# Find all columns that contain 'item' and 'name'
item_name_columns = [col for col in final_merged_data.columns if 'item' in col.lower() and 'name' in col.lower()]

print("\nColumns identified as item names:")
print(item_name_columns)

  final_merged_data = pd.read_csv('final_merged_data.csv')


All columns in your DataFrame:
['CUSTOMER_ID', 'STORE_NUMBER', 'ORDER_CREATED_DATE', 'ORDER_ID', 'ORDER_CHANNEL_NAME', 'ORDER_SUBCHANNEL_NAME', 'ORDER_OCCASION_NAME', 'item 1 name', 'item 1 price', 'item 1 qty', 'item 2 name', 'item 2 price', 'item 2 qty', 'item 3 name', 'item 3 price', 'item 3 qty', 'item 4 name', 'item 4 price', 'item 4 qty', 'item 5 name', 'item 5 price', 'item 5 qty', 'item 6 name', 'item 6 price', 'item 6 qty', 'item 7 name', 'item 7 price', 'item 7 qty', 'item 8 name', 'item 8 price', 'item 8 qty', 'item 9 name', 'item 9 price', 'item 9 qty', 'item 10 name', 'item 10 price', 'item 10 qty', 'item 11 name', 'item 11 price', 'item 11 qty', 'item 12 name', 'item 12 price', 'item 12 qty', 'item 13 name', 'item 13 price', 'item 13 qty', 'item 14 name', 'item 14 price', 'item 14 qty', 'item 15 name', 'item 15 price', 'item 15 qty', 'CITY', 'STATE', 'POSTAL_CODE', 'CUSTOMER_TYPE']

Columns identified as item names:
['item 1 name', 'item 2 name', 'item 3 name', 'item 4 na

In [9]:
# Use a set to automatically handle duplicates from across all item columns
all_unique_items = set()

# Iterate through the identified item name columns and add items to the set
for col in item_name_columns:
    # Use dropna() to ignore any empty cells (NaNs)
    unique_items_in_col = final_merged_data[col].dropna().unique()
    all_unique_items.update(unique_items_in_col)

# Convert the set of unique items to a list and sort it
list_of_items = sorted(list(all_unique_items))

print(f"\nFound {len(list_of_items)} distinct items.")
print("Sample of distinct items:", list_of_items[:10])

# Create a new DataFrame from the list of unique items
available_items_df = pd.DataFrame(list_of_items, columns=['available_items'])

# Save the DataFrame to a new CSV file
available_items_df.to_csv('available_items.csv', index=False)

print("\nNew CSV file 'available_items.csv' has been created successfully.")


Found 130 distinct items.
Sample of distinct items: ['$19.99 Crispy Feast', '10 pc Grilled Wings', '10 pc Grilled Wings Combo', '10 pc Mixed Wings', '10 pc Mixed Wings Combo', '10 pc Spicy Wings', '10 pc Spicy Wings Combo', '100 pc Family Grilled Wings', '100 pc Family Mixed Wings', '100 pc Family Spicy Wings']

New CSV file 'available_items.csv' has been created successfully.


In [10]:
import pandas as pd

# Load the final merged data file
try:
    final_merged_data = pd.read_csv('final_merged_data.csv')
except FileNotFoundError:
    print("Error: 'final_merged_data.csv' not found. Please ensure the file is in the same directory.")
    exit()

# --- Data Type Conversion ---
# Convert 'ORDER_CREATED_DATE' to a datetime object
final_merged_data['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data['ORDER_CREATED_DATE'], format='%d-%m-%Y')

# --- Data Cleaning ---
# For categorical columns, we can fill any missing values with a placeholder like 'Unknown'
# This prevents errors in later analysis and preserves rows with missing data
final_merged_data['CUSTOMER_TYPE'] = final_merged_data['CUSTOMER_TYPE'].fillna('Unknown')
final_merged_data['CITY'] = final_merged_data['CITY'].fillna('Unknown')
final_merged_data['STATE'] = final_merged_data['STATE'].fillna('Unknown')

# Check the data types after conversion
print("Data types after conversion:")
print(final_merged_data.info())

# Display the first few rows to show the result
print("\nDataFrame head after cleaning:")
print(final_merged_data.head())

# Save the cleaned DataFrame to a new CSV file
final_merged_data.to_csv('final_merged_data_cleaned.csv', index=False)
print("\nNew CSV file 'final_merged_data_cleaned.csv' has been created successfully.")

  final_merged_data = pd.read_csv('final_merged_data.csv')


ValueError: time data "2024-07-24" doesn't match format "%d-%m-%Y", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [11]:
import pandas as pd

# Load the final merged data file
# Added low_memory=False to handle the DtypeWarning
try:
    final_merged_data = pd.read_csv('final_merged_data.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_merged_data.csv' not found. Please ensure the file is in the same directory.")
    exit()

# --- Data Type Conversion ---
# Convert 'ORDER_CREATED_DATE' to a datetime object using the correct format
final_merged_data['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data['ORDER_CREATED_DATE'], format='%Y-%m-%d')

# --- Data Cleaning ---
# Fill any missing values in categorical columns with a placeholder
final_merged_data['CUSTOMER_TYPE'] = final_merged_data['CUSTOMER_TYPE'].fillna('Unknown')
final_merged_data['CITY'] = final_merged_data['CITY'].fillna('Unknown')
final_merged_data['STATE'] = final_merged_data['STATE'].fillna('Unknown')

# Check the data types after conversion
print("Data types after conversion:")
print(final_merged_data.info())

# Display the first few rows to show the result
print("\nDataFrame head after cleaning:")
print(final_merged_data.head())

# Save the cleaned DataFrame to a new CSV file
final_merged_data.to_csv('final_merged_data_cleaned.csv', index=False)
print("\nNew CSV file 'final_merged_data_cleaned.csv' has been created successfully.")

Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1414410 entries, 0 to 1414409
Data columns (total 56 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   CUSTOMER_ID            1414410 non-null  int64         
 1   STORE_NUMBER           1414410 non-null  int64         
 2   ORDER_CREATED_DATE     1414410 non-null  datetime64[ns]
 3   ORDER_ID               1414410 non-null  int64         
 4   ORDER_CHANNEL_NAME     1414410 non-null  object        
 5   ORDER_SUBCHANNEL_NAME  1414410 non-null  object        
 6   ORDER_OCCASION_NAME    1414410 non-null  object        
 7   item 1 name            1411675 non-null  object        
 8   item 1 price           1411675 non-null  float64       
 9   item 1 qty             1411675 non-null  float64       
 10  item 2 name            812560 non-null   object        
 11  item 2 price           812560 non-null   float64       
 12 

In [12]:
# Check for duplicates across all columns
num_duplicates = final_merged_data.duplicated().sum()
if num_duplicates > 0:
    print(f"Found and removed {num_duplicates} duplicate rows.")
    final_merged_data.drop_duplicates(inplace=True)
else:
    print("No duplicate rows found.")

No duplicate rows found.


In [13]:
# Find all columns that are of object type and should be standardized
string_cols = final_merged_data.select_dtypes(include='object').columns

# Convert string columns to a uniform format
for col in string_cols:
    final_merged_data[col] = final_merged_data[col].str.strip().str.lower()

In [15]:
import pandas as pd

# Load the cleaned data from the previous step
try:
    final_merged_data_cleaned = pd.read_csv('final_merged_data_cleaned.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_merged_data_cleaned.csv' not found. Please run the data cleaning step first.")
    exit()

# --- Feature Engineering ---

# 1. Total Order Value
item_price_columns = [col for col in final_merged_data_cleaned.columns if 'item price' in col.lower()]
final_merged_data_cleaned['total_order_value'] = final_merged_data_cleaned[item_price_columns].sum(axis=1)

# 2. Order Time Features
final_merged_data_cleaned['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data_cleaned['ORDER_CREATED_DATE'])
final_merged_data_cleaned['order_day_of_week'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.day_name()
final_merged_data_cleaned['order_month'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.month
final_merged_data_cleaned['order_hour'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.hour

# 3. Customer Loyalty Features
customer_order_counts = final_merged_data_cleaned.groupby('CUSTOMER_ID')['ORDER_ID'].transform('count')
final_merged_data_cleaned['total_orders_per_customer'] = customer_order_counts

# Save the final DataFrame with all new features to a new CSV file
final_merged_data_cleaned.to_csv('final_dataset_with_features.csv', index=False)
print("The file 'final_dataset_with_features.csv' has been created.")

The file 'final_dataset_with_features.csv' has been created.


In [16]:
# To save the DataFrame to a new CSV file
final_merged_data_cleaned.to_csv('final_dataset_with_features.csv', index=False)

print("The file 'final_dataset_with_features.csv' has been created and saved to your working directory.")

The file 'final_dataset_with_features.csv' has been created and saved to your working directory.


In [19]:
import pandas as pd

# Load the data from the previous step
try:
    final_merged_data_cleaned = pd.read_csv('final_dataset_with_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_features.csv' not found. Please ensure the file exists.")
    exit()

# Identify all columns that contain 'item' and 'price'
item_price_columns = [col for col in final_merged_data_cleaned.columns if 'item' in col.lower() and 'price' in col.lower()]

# Convert each identified price column to a numeric type
for col in item_price_columns:
    final_merged_data_cleaned[col] = pd.to_numeric(final_merged_data_cleaned[col], errors='coerce').fillna(0)

# Calculate 'total_order_value' by summing across the item price columns
final_merged_data_cleaned['total_order_value'] = final_merged_data_cleaned[item_price_columns].sum(axis=1)

# Recalculate 'customer_total_spend' using the corrected 'total_order_value'
customer_total_spend = final_merged_data_cleaned.groupby('CUSTOMER_ID')['total_order_value'].transform('sum')
final_merged_data_cleaned['customer_total_spend'] = customer_total_spend

print("DataFrame with corrected order value and total spend:")
print(final_merged_data_cleaned[['CUSTOMER_ID', 'ORDER_ID', 'total_order_value', 'customer_total_spend']].head())

DataFrame with corrected order value and total spend:
   CUSTOMER_ID    ORDER_ID  total_order_value  customer_total_spend
0    362204699  7247194287              39.57                 39.57
1    269612955   791214421              70.57                 70.57
2    585330633  7575285208              16.99                 16.99
3    950661333  4253875716              28.08                 28.08
4    434985772  7150407872              24.58                 24.58


In [20]:
# Save the updated DataFrame to a new CSV file
final_merged_data_cleaned.to_csv('final_dataset_with_advanced_features.csv', index=False)

print("The file 'final_dataset_with_advanced_features.csv' has been created and saved to your working directory.")

The file 'final_dataset_with_advanced_features.csv' has been created and saved to your working directory.


In [21]:
# Identify all columns that contain 'item' and 'name'
item_name_columns = [col for col in final_merged_data_cleaned.columns if 'item' in col.lower() and 'name' in col.lower()]

# Calculate 'order_size' by counting non-null item names in each row
final_merged_data_cleaned['order_size'] = final_merged_data_cleaned[item_name_columns].count(axis=1)

print("DataFrame with 'order_size' feature:")
print(final_merged_data_cleaned[['ORDER_ID', 'order_size']].head())

DataFrame with 'order_size' feature:
     ORDER_ID  order_size
0  7247194287           3
1   791214421           3
2  7575285208           1
3  4253875716           2
4  7150407872           2


In [22]:
# Assuming 'ORDER_CREATED_DATE' is already a datetime object
# If not, run this line first:
# final_merged_data_cleaned['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data_cleaned['ORDER_CREATED_DATE'])

final_merged_data_cleaned['order_day_of_week'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.day_name()
final_merged_data_cleaned['order_hour'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.hour

print("\nDataFrame with time-based features:")
print(final_merged_data_cleaned[['ORDER_ID', 'order_day_of_week', 'order_hour']].head())

AttributeError: Can only use .dt accessor with datetimelike values

In [24]:
import pandas as pd

# Load the file you've saved from the previous step
try:
    final_merged_data_cleaned = pd.read_csv('final_dataset_with_advanced_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_advanced_features.csv' not found.")
    exit()

# --- Corrected Order Time Features Code ---

# Ensure the 'ORDER_CREATED_DATE' column is converted to a datetime object first
final_merged_data_cleaned['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data_cleaned['ORDER_CREATED_DATE'], format='%Y-%m-%d')

# Now, create the time-based features
final_merged_data_cleaned['order_day_of_week'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.day_name()
final_merged_data_cleaned['order_hour'] = final_merged_data_cleaned['ORDER_CREATED_DATE'].dt.hour

print("DataFrame with time-based features:")
print(final_merged_data_cleaned[['ORDER_CREATED_DATE', 'order_day_of_week', 'order_hour']].head())

# Save the updated DataFrame to a new CSV file
final_merged_data_cleaned.to_csv('final_dataset_with_advanced_features.csv', index=False)

DataFrame with time-based features:
  ORDER_CREATED_DATE order_day_of_week  order_hour
0         2024-07-24         Wednesday           0
1         2025-02-15          Saturday           0
2         2025-02-15          Saturday           0
3         2024-03-29            Friday           0
4         2024-04-08            Monday           0


In [25]:
# Find the most recent date in the dataset
latest_date = final_merged_data_cleaned['ORDER_CREATED_DATE'].max()

# Calculate recency for each customer
recency_df = final_merged_data_cleaned.groupby('CUSTOMER_ID')['ORDER_CREATED_DATE'].max().reset_index()
recency_df['recency_days'] = (latest_date - recency_df['ORDER_CREATED_DATE']).dt.days

# Merge the recency feature back into the main DataFrame
final_merged_data_cleaned = pd.merge(final_merged_data_cleaned, recency_df[['CUSTOMER_ID', 'recency_days']], on='CUSTOMER_ID', how='left')

print("\nDataFrame with 'recency_days' feature:")
print(final_merged_data_cleaned[['CUSTOMER_ID', 'recency_days']].head())


DataFrame with 'recency_days' feature:
   CUSTOMER_ID  recency_days
0    362204699           279
1    269612955            73
2    585330633            73
3    950661333           396
4    434985772           386


In [26]:
# One-hot encode the 'CUSTOMER_TYPE' column
customer_type_dummies = pd.get_dummies(final_merged_data_cleaned['CUSTOMER_TYPE'], prefix='customer_type')

# Concatenate the new columns with the main DataFrame
final_merged_data_cleaned = pd.concat([final_merged_data_cleaned, customer_type_dummies], axis=1)

print("\nDataFrame with one-hot encoded features:")
print(final_merged_data_cleaned[[col for col in final_merged_data_cleaned.columns if 'customer_type' in col]].head())


DataFrame with one-hot encoded features:
   customer_type_Deleted Account  customer_type_Guest  customer_type_Online  \
0                          False                False                 False   
1                          False                False                 False   
2                          False                 True                 False   
3                          False                False                 False   
4                          False                 True                 False   

   customer_type_Registered  customer_type_Unknown  customer_type_eClub  
0                      True                  False                False  
1                      True                  False                False  
2                     False                  False                False  
3                      True                  False                False  
4                     False                  False                False  


In [27]:
import pandas as pd

# Load the file from the last successful step
try:
    final_merged_data = pd.read_csv('final_dataset_with_advanced_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_advanced_features.csv' not found. Please ensure the file exists.")
    exit()

# Ensure the 'ORDER_CREATED_DATE' column is in datetime format
final_merged_data['ORDER_CREATED_DATE'] = pd.to_datetime(final_merged_data['ORDER_CREATED_DATE'], format='%Y-%m-%d')


# --- 1. Monetary and Behavioral Features ---

# Customer Frequency: The total number of orders per customer
customer_frequency = final_merged_data.groupby('CUSTOMER_ID')['ORDER_ID'].transform('count')
final_merged_data['customer_frequency'] = customer_frequency

# Average Order Value: The average amount a customer spends per order
average_order_value = final_merged_data.groupby('CUSTOMER_ID')['total_order_value'].transform('mean')
final_merged_data['average_order_value'] = average_order_value


# --- 2. Temporal Features ---

# Customer Recency: Days since the last order for each customer
latest_date = final_merged_data['ORDER_CREATED_DATE'].max()
recency_df = final_merged_data.groupby('CUSTOMER_ID')['ORDER_CREATED_DATE'].max().reset_index()
recency_df['recency_days'] = (latest_date - recency_df['ORDER_CREATED_DATE']).dt.days

# Merge the recency feature back into the main DataFrame
final_merged_data = pd.merge(final_merged_data, recency_df[['CUSTOMER_ID', 'recency_days']], on='CUSTOMER_ID', how='left')


# --- 3. Item-Level Behavioral Features ---

# Order Size: Count of distinct items in each order
item_name_columns = [col for col in final_merged_data.columns if 'item' in col.lower() and 'name' in col.lower()]
final_merged_data['order_size'] = final_merged_data[item_name_columns].count(axis=1)


# --- 4. Contextual Features ---

# Time-of-Day Features: Day of the week and hour of the day
final_merged_data['order_day_of_week'] = final_merged_data['ORDER_CREATED_DATE'].dt.day_name()
final_merged_data['order_hour'] = final_merged_data['ORDER_CREATED_DATE'].dt.hour


print("DataFrame with all new features added:")
print(final_merged_data[['CUSTOMER_ID', 'ORDER_ID', 'customer_frequency', 'average_order_value', 'recency_days', 'order_size', 'order_day_of_week', 'order_hour']].head())

# Save the final DataFrame with all new features to a new CSV file
final_merged_data.to_csv('final_dataset_with_all_features.csv', index=False)
print("\nNew CSV file 'final_dataset_with_all_features.csv' has been created successfully.")

DataFrame with all new features added:
   CUSTOMER_ID    ORDER_ID  customer_frequency  average_order_value  \
0    362204699  7247194287                   1                39.57   
1    269612955   791214421                   1                70.57   
2    585330633  7575285208                   1                16.99   
3    950661333  4253875716                   1                28.08   
4    434985772  7150407872                   1                24.58   

   recency_days  order_size order_day_of_week  order_hour  
0           279           3         Wednesday           0  
1            73           3          Saturday           0  
2            73           1          Saturday           0  
3           396           2            Friday           0  
4           386           2            Monday           0  

New CSV file 'final_dataset_with_all_features.csv' has been created successfully.


In [28]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# Load the file with all engineered features and the 'ITEMS_LIST' column
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column from the item name columns, as it's not saved to CSV
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()
transactions = final_dataset_filtered['ITEMS_LIST'].tolist()

# --- 1. Split the transactions for validation ---
train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

# --- 2. Train the model on the training set ---
te = TransactionEncoder()
te_ary_train = te.fit(train_transactions).transform(train_transactions)
df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

frequent_itemsets_train = apriori(df_one_hot_train, min_support=0.005, use_colnames=True)
rules_train = association_rules(frequent_itemsets_train, metric="confidence", min_threshold=0.5)
rules_train.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

# --- 3. Validate the model on the test set ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0

    for order in test_transactions:
        # Simulate a missing item by hiding the last item
        missing_item = order[-1]
        simulated_cart = order[:-1]

        if not simulated_cart:
            continue

        total_queries += 1

        # Get recommendations for the simulated cart
        recommendations = get_recommendations(simulated_cart, rules, k)

        # Check if the missing item is in the top recommendations
        if missing_item in recommendations:
            correct_predictions += 1

    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

recall_score = calculate_recall_at_k(rules_train, test_transactions)
print(f"\nModel Recall@3 on the simulated test set: {recall_score:.4f}")

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)


ParserError: Error tokenizing data. C error: out of memory

In [29]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Define the file path
file_path = 'final_dataset_with_all_features.csv'
chunk_size = 10000  # You can adjust this size based on your system's memory

# Create a list to store the 'ITEMS_LIST' from each chunk
all_transactions = []
item_name_columns = None

try:
    # Read the CSV in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, low_memory=False):
        # Identify item name columns on the first chunk
        if item_name_columns is None:
            item_name_columns = [col for col in chunk.columns if 'item' in col.lower() and 'name' in col.lower()]
        
        # Create 'ITEMS_LIST' for each chunk
        chunk['ITEMS_LIST'] = chunk[item_name_columns].apply(
            lambda row: [item for item in row.dropna()], axis=1
        )
        
        # Filter for non-empty item lists and append to the main list
        all_transactions.extend(chunk[chunk['ITEMS_LIST'].str.len() > 0]['ITEMS_LIST'].tolist())
        
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the file exists.")
    exit()

print(f"Successfully loaded and processed {len(all_transactions)} transactions.")
print("The first 5 transactions are:")
print(all_transactions[:5])

# --- Preparing data for the model ---
# Use TransactionEncoder to create a one-hot encoded DataFrame from all transactions
te = TransactionEncoder()
te_ary = te.fit(all_transactions).transform(all_transactions)
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)

print("\nOne-hot encoded data head:")
print(df_one_hot.head())

Successfully loaded and processed 1411675 transactions.
The first 5 transactions are:
[['10 pc Grilled Wings Combo', '8 pc Grilled Wings Combo', '8 pc Spicy Wings Combo'], ['Ranch Dip - Regular', '50 pc Grilled Wings', 'Regular Buffalo Fries'], ['20pc Spicy Feast Deal'], ['20 pc Grilled Wings', 'Ranch Dip - Regular'], ['6 pc Grilled Wings Combo', '8 pc Grilled Wings Combo']]

One-hot encoded data head:
   $19.99 Crispy Feast  10 pc Grilled Wings  10 pc Grilled Wings Combo  \
0                False                False                       True   
1                False                False                      False   
2                False                False                      False   
3                False                False                      False   
4                False                False                      False   

   10 pc Mixed Wings  10 pc Mixed Wings Combo  10 pc Spicy Wings  \
0              False                    False              False   
1            

In [30]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# Assume the 'all_transactions' list has been successfully created by the previous chunk-based code
# For this step, we'll use a placeholder for all_transactions
all_transactions = [
    ['6 pc grilled wings combo', 'large cheese fries', '20 oz soda'],
    ['hot honey rub (boneless)', 'mango habanero (boneless)', 'garlic parmesan (tenders)'],
    ['10 pc grilled wings combo', '10 pc grilled wings combo', 'ranch dip - regular'],
    ['6 pc boneless mild', 'large cheese fries', '20 oz soda'],
    ['spicy boneless wings combo', 'regular buffalo fries', 'blue cheese dip'],
    ['hot honey rub (boneless)', 'mango habanero (boneless)', 'garlic parmesan (tenders)'],
    ['10 pc grilled wings combo', 'large seasoned fries', 'large seasoned fries']
]
print("Validation process starting with a sample of transactions...")

# --- 1. Split the transactions for validation ---
# We'll use 70% for training the rules and 30% for testing their performance
train_transactions, test_transactions = train_test_split(all_transactions, test_size=0.3, random_state=42)

# --- 2. Train the model on the training set ---
# Use TransactionEncoder to create a one-hot encoded DataFrame
te = TransactionEncoder()
te_ary_train = te.fit(train_transactions).transform(train_transactions)
df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

# Use the Apriori algorithm to find frequent itemsets
frequent_itemsets_train = apriori(df_one_hot_train, min_support=0.005, use_colnames=True)
# Generate association rules
rules_train = association_rules(frequent_itemsets_train, metric="confidence", min_threshold=0.5)
rules_train.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
print("\nModel trained and rules generated on the training data.")


# --- 3. Define the recommendation and evaluation functions ---
def get_recommendations(cart_items, rules_df, k=3):
    """Generates top-k recommendations based on association rules."""
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    """Calculates Recall@k by simulating the prediction scenario."""
    correct_predictions = 0
    total_queries = 0

    for order in test_transactions:
        # We need at least 2 items to simulate a missing item
        if len(order) < 2:
            continue

        # Simulate a missing item by hiding the last item
        missing_item = order[-1]
        simulated_cart = order[:-1]
        
        total_queries += 1
        
        # Get recommendations for the simulated cart
        recommendations = get_recommendations(simulated_cart, rules, k)
        
        # Check if the missing item is in the top recommendations
        if missing_item in recommendations:
            correct_predictions += 1
    
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score


# --- 4. Validate the model and print the Recall@3 score ---
recall_score = calculate_recall_at_k(rules_train, test_transactions)
print(f"\nModel Recall@3 on the simulated test set: {recall_score:.4f}")

Validation process starting with a sample of transactions...

Model trained and rules generated on the training data.

Model Recall@3 on the simulated test set: 0.3333


In [31]:
import pandas as pd

# Load the file with all engineered features
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Print the data type of the 'ITEMS_LIST' column
print("Data type of the ITEMS_LIST column:", final_dataset['ITEMS_LIST'].dtype)

# Print the first entry of the 'ITEMS_LIST' column
print("First entry of the ITEMS_LIST column:", final_dataset['ITEMS_LIST'].iloc[0])

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)


KeyError: 'ITEMS_LIST'

In [32]:
import pandas as pd

# Load the file with all engineered features
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Identify all columns that contain 'item' and 'name'
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]

# Create a new column 'ITEMS_LIST' which contains a list of all items in each order
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Now, perform the check
print("Data type of the ITEMS_LIST column:", final_dataset['ITEMS_LIST'].dtype)
print("First entry of the ITEMS_LIST column:", final_dataset['ITEMS_LIST'].iloc[0])

Data type of the ITEMS_LIST column: object
First entry of the ITEMS_LIST column: ['10 pc Grilled Wings Combo', '8 pc Grilled Wings Combo', '8 pc Spicy Wings Combo']


In [40]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation ---
# Load the file with all engineered features
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()
transactions = final_dataset_filtered['ITEMS_LIST'].tolist()

print(f"Total orders for validation: {len(transactions)}")

# --- Step 2: Train/Test Split ---
# We'll use 70% for training the rules and 30% for testing their performance
train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

# --- Step 3: Train the model on the training set ---
te = TransactionEncoder()
te_ary_train = te.fit(train_transactions).transform(train_transactions)
df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

frequent_itemsets_train = apriori(df_one_hot_train, min_support=0.003, use_colnames=True)
rules_train = association_rules(frequent_itemsets_train, metric="confidence", min_threshold=0.4)
rules_train.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
print("\nModel trained and rules generated on the training data.")

# --- Step 4: Define the recommendation and evaluation functions ---
def get_recommendations(cart_items, rules_df, k=3):
    """Generates top-k recommendations based on association rules."""
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    """Calculates Recall@k by simulating the prediction scenario."""
    correct_predictions = 0
    total_queries = 0

    for order in test_transactions:
        # We need at least 2 items to simulate a missing item
        if len(order) < 2:
            continue
        
        # Simulate a missing item by hiding the last item
        missing_item = order[-1]
        simulated_cart = order[:-1]
        
        total_queries += 1
        
        # Get recommendations for the simulated cart
        recommendations = get_recommendations(simulated_cart, rules, k)
        
        # Check if the missing item is in the top recommendations
        if missing_item in recommendations:
            correct_predictions += 1
    
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

# --- Step 5: Validate the model and print the Recall@3 score ---
recall_score = calculate_recall_at_k(rules_train, test_transactions)
print(f"\nModel Recall@3 on the simulated test set: {recall_score:.4f}")

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)


ParserError: Error tokenizing data. C error: out of memory

In [46]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation with Chunking ---
file_path = 'final_dataset_with_all_features.csv'
chunk_size = 10000  # Adjust this size based on your system's memory
all_transactions = []
item_name_columns = None

try:
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, low_memory=False):
        if item_name_columns is None:
            item_name_columns = [col for col in chunk.columns if 'item' in col.lower() and 'name' in col.lower()]
        
        chunk['ITEMS_LIST'] = chunk[item_name_columns].apply(
            lambda row: [item for item in row.dropna()], axis=1
        )
        
        all_transactions.extend(chunk[chunk['ITEMS_LIST'].str.len() > 0]['ITEMS_LIST'].tolist())

except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the file exists.")
    exit()

print(f"Total orders for validation: {len(all_transactions)}")


# --- Step 2: Train/Test Split ---
train_transactions, test_transactions = train_test_split(all_transactions, test_size=0.3, random_state=42)
print("Data split into training and testing sets.")


# --- Step 3: Train the model on the training set with memory fixes ---
te = TransactionEncoder()
te_ary_train = te.fit(train_transactions).transform(train_transactions)
df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

frequent_itemsets_train = apriori(df_one_hot_train, min_support=0.0005, max_len=4, low_memory=True, use_colnames=True)
rules_train = association_rules(frequent_itemsets_train, metric="confidence", min_threshold=0.4)
rules_train.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
print("\nModel trained and rules generated with updated memory-efficient parameters.")


# --- Step 4: Define the recommendation and evaluation functions ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

# --- Step 5: Validate the model and print the Recall@3 score ---
recall_score = calculate_recall_at_k(rules_train, test_transactions)
print(f"\nModel Recall@3 on the simulated test set: {recall_score:.4f}")

Total orders for validation: 1411675
Data split into training and testing sets.

Model trained and rules generated with updated memory-efficient parameters.

Model Recall@3 on the simulated test set: 0.1376


In [48]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Segment the data by 'CUSTOMER_TYPE'
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
special_membership_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'special membership']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), Special Membership ({len(special_membership_data)})")


# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support=0.0005, min_confidence=0.4):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None

    # Create transactions from the segment
    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    # Train Apriori model
    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    # Validate the model
    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules

# --- Step 3: Run Training and Validation for Each Segment ---
registered_rules = train_and_validate_segment(registered_data, 'Registered')
guest_rules = train_and_validate_segment(guest_data, 'Guest')
special_membership_rules = train_and_validate_segment(special_membership_data, 'Special Membership')

Data segmented into: Registered (0), Guest (0), Special Membership (0)

--- Training for Registered segment ---
Not enough data to train. Skipping this segment.

--- Training for Guest segment ---
Not enough data to train. Skipping this segment.

--- Training for Special Membership segment ---
Not enough data to train. Skipping this segment.


In [50]:
import pandas as pd

# Load the file
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Get the unique values and their counts from the CUSTOMER_TYPE column
customer_type_counts = final_dataset['CUSTOMER_TYPE'].value_counts()
print("Unique values in 'CUSTOMER_TYPE' column and their counts:")
print(customer_type_counts)

Unique values in 'CUSTOMER_TYPE' column and their counts:
CUSTOMER_TYPE
Registered         1140068
Guest               271545
eClub                 1904
Deleted Account        832
Unknown                 59
Online                   2
Name: count, dtype: int64


In [57]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support=0.0005, min_confidence=0.4):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered')
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest')
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub')

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment ---
Generated 174 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1458

--- Training for Guest segment ---
Generated 175 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.1383

--- Training for eClub segment ---
Generated 1530 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.0231


In [58]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment with Hybrid Parameters ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0005, min_confidence=0.4)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.0005, min_confidence=0.4)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.0035, min_confidence=0.4)

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment with min_support=0.0005 ---
Generated 174 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1458

--- Training for Guest segment with min_support=0.0005 ---
Generated 175 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.1383

--- Training for eClub segment with min_support=0.0035 ---
Generated 68 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.2211


In [59]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment with Hybrid Parameters ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0003, min_confidence=0.4)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.0003, min_confidence=0.4)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.0025, min_confidence=0.4)

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment with min_support=0.0003 ---
Generated 247 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1597

--- Training for Guest segment with min_support=0.0003 ---
Generated 251 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.1623

--- Training for eClub segment with min_support=0.0025 ---
Generated 229 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.1188


In [60]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=5, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment with Hybrid Parameters ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0003, min_confidence=0.4)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.0003, min_confidence=0.4)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.0025, min_confidence=0.4)

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment with min_support=0.0003 ---
Generated 299 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1598

--- Training for Guest segment with min_support=0.0003 ---
Generated 310 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.2095

--- Training for eClub segment with min_support=0.0025 ---
Generated 271 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.1848


In [61]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=5, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment with Hybrid Parameters ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0002, min_confidence=0.4)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.00025, min_confidence=0.4)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.0025, min_confidence=0.4)

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment with min_support=0.0002 ---
Generated 453 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.2188

--- Training for Guest segment with min_support=0.00025 ---
Generated 373 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.2154

--- Training for eClub segment with min_support=0.0025 ---
Generated 271 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.1848


In [62]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by the actual 'CUSTOMER_TYPE' values
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub']

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=5, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment with Hybrid Parameters ---
registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0001, min_confidence=0.4)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.0002, min_confidence=0.4)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.003, min_confidence=0.4)

Data segmented into: Registered (665090), Guest (145919), eClub (1009)

--- Training for Registered segment with min_support=0.0001 ---
Generated 910 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.2297

--- Training for Guest segment with min_support=0.00015 ---
Generated 665 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.2013

--- Training for eClub segment with min_support=0.003 ---
Generated 68 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.2211


In [67]:
import pandas as pd

# Load the file
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Get the unique values from the 'ORDER_OCCASION_NAME' column
unique_order_occasions = final_dataset['ORDER_OCCASION_NAME'].unique()
print("Unique values in 'ORDER_OCCASION_NAME' column:")
print(unique_order_occasions)

Unique values in 'ORDER_OCCASION_NAME' column:
['ToGo' 'Delivery']


In [69]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Nested Segmentation (Corrected) ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column for training
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by 'CUSTOMER_TYPE' and 'ORDER_OCCASION_NAME'
registered_togo_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'ToGo')]
registered_delivery_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'Registered') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'Delivery')]

guest_togo_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'ToGo')]
guest_delivery_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'Guest') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'Delivery')]

eclub_togo_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'ToGo')]
eclub_delivery_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'eClub') & (final_dataset_filtered['ORDER_OCCASION_NAME'] == 'Delivery')]

general_data = final_dataset_filtered.copy()

print(f"Data segmented into: Registered ToGo ({len(registered_togo_data)}), Registered Delivery ({len(registered_delivery_data)}), Guest ToGo ({len(guest_togo_data)}), Guest Delivery ({len(guest_delivery_data)}), eClub ToGo ({len(eclub_togo_data)}), eClub Delivery ({len(eclub_delivery_data)}), General ({len(general_data)})")


# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.0003
min_confidence_val = 0.4

# Segmented models for Registered customers
reg_togo_rules, reg_togo_score = train_and_validate_segment(registered_togo_data, 'Registered ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
reg_del_rules, reg_del_score = train_and_validate_segment(registered_delivery_data, 'Registered Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Segmented models for Guest customers
guest_togo_rules, guest_togo_score = train_and_validate_segment(guest_togo_data, 'Guest ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
guest_del_rules, guest_del_score = train_and_validate_segment(guest_delivery_data, 'Guest Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Segmented models for eClub customers
eclub_togo_rules, eclub_togo_score = train_and_validate_segment(eclub_togo_data, 'eClub ToGo', min_support=min_support_small, min_confidence=min_confidence_val)
eclub_del_rules, eclub_del_score = train_and_validate_segment(eclub_delivery_data, 'eClub Delivery', min_support=min_support_small, min_confidence=min_confidence_val)

# Fallback model
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_large, min_confidence=min_confidence_val)

Data segmented into: Registered ToGo (562873), Registered Delivery (102217), Guest ToGo (128453), Guest Delivery (17466), eClub ToGo (895), eClub Delivery (114), General (812560)

--- Training for Registered ToGo segment ---
Generated 445 rules for Registered ToGo.
Validation complete. Recall@3 for Registered ToGo is: 0.1829

--- Training for Registered Delivery segment ---
Generated 635 rules for Registered Delivery.
Validation complete. Recall@3 for Registered Delivery is: 0.2047

--- Training for Guest ToGo segment ---
Generated 509 rules for Guest ToGo.
Validation complete. Recall@3 for Guest ToGo is: 0.2021

--- Training for Guest Delivery segment ---
Generated 1173 rules for Guest Delivery.
Validation complete. Recall@3 for Guest Delivery is: 0.0853

--- Training for eClub ToGo segment ---
Generated 1523 rules for eClub ToGo.
Validation complete. Recall@3 for eClub ToGo is: 0.0446

--- Training for eClub Delivery segment ---
Generated 558 rules for eClub Delivery.
Validation comp

In [73]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean customer type for consistency
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by 'CUSTOMER_TYPE'
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub']

# General fallback data includes all orders
general_data = final_dataset_filtered.copy()

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)}), General ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment with min_support={min_support} ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
# Use the optimal parameters we found through our previous analysis
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=0.0001, min_confidence=min_confidence_val)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.00015, min_confidence=min_confidence_val)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=0.003, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_large, min_confidence=min_confidence_val)

Data segmented into: Registered (665090), Guest (145919), eClub (1009), General (812560)

--- Training for Registered segment with min_support=0.0001 ---
Generated 464 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1994

--- Training for Guest segment with min_support=0.00015 ---
Generated 381 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.1918

--- Training for eClub segment with min_support=0.003 ---
Generated 68 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.2211

--- Training for General Fallback segment with min_support=0.0001 ---
Generated 456 rules for General Fallback.
Validation complete. Recall@3 for General Fallback is: 0.1962


In [76]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean customer type for consistency
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by 'CUSTOMER_TYPE'
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub']

# General fallback data now includes only the remaining rows
main_types = ['registered', 'guest', 'eclub']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), eClub ({len(eclub_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.003
min_confidence_val = 0.4

registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=min_support_large, min_confidence=min_confidence_val)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=0.00015, min_confidence=min_confidence_val)
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=min_support_small, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Registered (665090), Guest (145919), eClub (1009), General Fallback (542)

--- Training for Registered segment ---
Generated 464 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1994

--- Training for Guest segment ---
Generated 381 rules for Guest.


  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)


KeyboardInterrupt: 

In [80]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Nested Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean customer/order data
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_OCCASION_NAME'] = final_dataset['ORDER_OCCASION_NAME'].str.lower()


# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Segment the data
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub']

# Nested segments for Registered customers
reg_togo_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'togo']
reg_delivery_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'delivery']

# Nested segments for Guest customers
guest_togo_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'togo']
guest_delivery_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'delivery']

# Fallback data
main_types = ['registered', 'guest', 'eclub']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Reg. ToGo ({len(reg_togo_data)}), Reg. Delivery ({len(reg_delivery_data)}), Guest ToGo ({len(guest_togo_data)}), Guest Delivery ({len(guest_delivery_data)}), eClub ({len(eclub_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=6, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.003
min_confidence_val = 0.5

# Segmented models for Registered customers
reg_togo_rules, reg_togo_score = train_and_validate_segment(reg_togo_data, 'Registered ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
reg_del_rules, reg_del_score = train_and_validate_segment(reg_delivery_data, 'Registered Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Segmented models for Guest customers
guest_togo_rules, guest_togo_score = train_and_validate_segment(guest_togo_data, 'Guest ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
guest_del_rules, guest_del_score = train_and_validate_segment(guest_delivery_data, 'Guest Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Single model for eClub and General Fallback
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=min_support_small, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Reg. ToGo (562873), Reg. Delivery (102217), Guest ToGo (128453), Guest Delivery (17466), eClub (1009), General Fallback (542)

--- Training for Registered ToGo segment ---
Generated 837 rules for Registered ToGo.
Validation complete. Recall@3 for Registered ToGo is: 0.2011

--- Training for Registered Delivery segment ---
Generated 1915 rules for Registered Delivery.
Validation complete. Recall@3 for Registered Delivery is: 0.1511

--- Training for Guest ToGo segment ---
Generated 1830 rules for Guest ToGo.
Validation complete. Recall@3 for Guest ToGo is: 0.1880

--- Training for Guest Delivery segment ---
Generated 7289 rules for Guest Delivery.
Validation complete. Recall@3 for Guest Delivery is: 0.1246

--- Training for eClub segment ---
Generated 68 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.2211

--- Training for General Fallback segment ---
Generated 112 rules for General Fallback.
Validation complete. Recall@3 for General Fallback is: 0.1

In [99]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Nested Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean customer/order data
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_OCCASION_NAME'] = final_dataset['ORDER_OCCASION_NAME'].str.lower()


# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Segment the data
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
eclub_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub']

# Nested segments for Registered customers
reg_togo_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'togo']
reg_delivery_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'delivery']

# Nested segments for Guest customers
guest_togo_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'togo']
guest_delivery_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'delivery']

# Fallback data
main_types = ['registered', 'guest', 'eclub']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Reg. ToGo ({len(reg_togo_data)}), Reg. Delivery ({len(reg_delivery_data)}), Guest ToGo ({len(guest_togo_data)}), Guest Delivery ({len(guest_delivery_data)}), eClub ({len(eclub_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=6, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0002
min_support_small = 0.003
min_confidence_val = 0.4

# Segmented models for Registered customers
reg_togo_rules, reg_togo_score = train_and_validate_segment(reg_togo_data, 'Registered ToGo', min_support=0.0001, min_confidence=min_confidence_val)
reg_del_rules, reg_del_score = train_and_validate_segment(reg_delivery_data, 'Registered Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Segmented models for Guest customers
guest_togo_rules, guest_togo_score = train_and_validate_segment(guest_togo_data, 'Guest ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
guest_del_rules, guest_del_score = train_and_validate_segment(guest_delivery_data, 'Guest Delivery', min_support=min_support_large, min_confidence=min_confidence_val)

# Single model for eClub and General Fallback
eclub_rules, eclub_score = train_and_validate_segment(eclub_data, 'eClub', min_support=min_support_small, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Reg. ToGo (562873), Reg. Delivery (102217), Guest ToGo (128453), Guest Delivery (17466), eClub (1009), General Fallback (542)

--- Training for Registered ToGo segment ---
Generated 837 rules for Registered ToGo.
Validation complete. Recall@3 for Registered ToGo is: 0.2011

--- Training for Registered Delivery segment ---
Generated 783 rules for Registered Delivery.
Validation complete. Recall@3 for Registered Delivery is: 0.1758

--- Training for Guest ToGo segment ---
Generated 535 rules for Guest ToGo.
Validation complete. Recall@3 for Guest ToGo is: 0.2085

--- Training for Guest Delivery segment ---
Generated 1798 rules for Guest Delivery.
Validation complete. Recall@3 for Guest Delivery is: 0.1557

--- Training for eClub segment ---
Generated 68 rules for eClub.
Validation complete. Recall@3 for eClub is: 0.2211

--- Training for General Fallback segment ---
Generated 112 rules for General Fallback.
Validation complete. Recall@3 for General Fallback is: 0.196

In [17]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Feature Creation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate 'ITEMS_LIST' and clean relevant columns
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_CREATED_DATE'] = pd.to_datetime(final_dataset['ORDER_CREATED_DATE'])

# Create the new 'is_weekend' feature
weekend_days = ['Friday', 'Saturday', 'Sunday']
final_dataset['order_day_of_week'] = final_dataset['ORDER_CREATED_DATE'].dt.day_name()
final_dataset['is_weekend'] = final_dataset['order_day_of_week'].isin(weekend_days).astype(int)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()


# --- Step 2: Nested Segmentation ---
registered_weekday_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'registered') & (final_dataset_filtered['is_weekend'] == 0)]
registered_weekend_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'registered') & (final_dataset_filtered['is_weekend'] == 1)]

guest_weekday_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'guest') & (final_dataset_filtered['is_weekend'] == 0)]
guest_weekend_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'guest') & (final_dataset_filtered['is_weekend'] == 1)]

eclub_weekday_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub') & (final_dataset_filtered['is_weekend'] == 0)]
eclub_weekend_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'eclub') & (final_dataset_filtered['is_weekend'] == 1)]

main_types = ['registered', 'guest', 'eclub']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Reg. Weekday ({len(registered_weekday_data)}), Reg. Weekend ({len(registered_weekend_data)}), Guest Weekday ({len(guest_weekday_data)}), Guest Weekend ({len(guest_weekend_data)}), eClub Weekday ({len(eclub_weekday_data)}), eClub Weekend ({len(eclub_weekend_data)}), General Fallback ({len(general_data)})")

# --- Step 3: Helper Functions and Training/Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None
    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)
    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)
    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
    print(f"Generated {len(rules)} rules for {segment_name}.")
    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 4: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

reg_weekday_rules, reg_weekday_score = train_and_validate_segment(registered_weekday_data, 'Registered Weekday', min_support=min_support_large, min_confidence=min_confidence_val)
reg_weekend_rules, reg_weekend_score = train_and_validate_segment(registered_weekend_data, 'Registered Weekend', min_support=min_support_large, min_confidence=min_confidence_val)

guest_weekday_rules, guest_weekday_score = train_and_validate_segment(guest_weekday_data, 'Guest Weekday', min_support=min_support_large, min_confidence=min_confidence_val)
guest_weekend_rules, guest_weekend_score = train_and_validate_segment(guest_weekend_data, 'Guest Weekend', min_support=min_support_large, min_confidence=min_confidence_val)

eclub_weekday_rules, eclub_weekday_score = train_and_validate_segment(eclub_weekday_data, 'eClub Weekday', min_support=min_support_small, min_confidence=min_confidence_val)
eclub_weekend_rules, eclub_weekend_score = train_and_validate_segment(eclub_weekend_data, 'eClub Weekend', min_support=min_support_small, min_confidence=min_confidence_val)

general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Reg. Weekday (319214), Reg. Weekend (345876), Guest Weekday (68691), Guest Weekend (77228), eClub Weekday (483), eClub Weekend (526), General Fallback (542)

--- Training for Registered Weekday segment ---
Generated 478 rules for Registered Weekday.
Validation complete. Recall@3 for Registered Weekday is: 0.2098

--- Training for Registered Weekend segment ---
Generated 453 rules for Registered Weekend.
Validation complete. Recall@3 for Registered Weekend is: 0.1892

--- Training for Guest Weekday segment ---
Generated 566 rules for Guest Weekday.
Validation complete. Recall@3 for Guest Weekday is: 0.2232

--- Training for Guest Weekend segment ---
Generated 548 rules for Guest Weekend.
Validation complete. Recall@3 for Guest Weekend is: 0.1917

--- Training for eClub Weekday segment ---
Generated 141 rules for eClub Weekday.
Validation complete. Recall@3 for eClub Weekday is: 0.0828

--- Training for eClub Weekend segment ---
Generated 101 rules for eClub Weekend.

In [12]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules

def display_top_rules(rules_df, segment_name, top_n=10):
    """Sorts and displays the top N rules for a given segment."""
    if rules_df is None or rules_df.empty:
        print(f"No rules found for {segment_name}.")
        return

    # Sort rules by lift and confidence
    best_rules = rules_df.sort_values(by=['lift', 'confidence'], ascending=False).head(top_n)

    print(f"\n--- Top {top_n} Association Rules for {segment_name} ---")
    print(best_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
    print("-" * 50)

# Assuming you have run the training code and the following rule dataframes are defined:
# reg_weekday_rules, reg_weekend_rules, guest_weekday_rules, guest_weekend_rules, eclub_weekday_rules, eclub_weekend_rules, and general_rules

# Display the top rules for each segment
display_top_rules(reg_weekday_rules, 'Registered Weekday')
display_top_rules(reg_weekend_rules, 'Registered Weekend')
display_top_rules(guest_weekday_rules, 'Guest Weekday')
display_top_rules(guest_weekend_rules, 'Guest Weekend')
display_top_rules(eclub_weekday_rules, 'eClub Weekday')
display_top_rules(eclub_weekend_rules, 'eClub Weekend')
display_top_rules(general_rules, 'General Fallback')


--- Top 10 Association Rules for Registered Weekday ---
                                          antecedents  \
9                               (20 pc Crispy Strips)   
22                                (50 pc Spicy Wings)   
21                              (50 pc Grilled Wings)   
197        (Large Buffalo Fries, 30 pc Grilled Wings)   
18                                (30 pc Mixed Wings)   
239  (Large Veggie Sticks Spicy, Large Buffalo Fries)   
16                              (30 pc Grilled Wings)   
170                          (20 Oz Soda, 32 Oz Soda)   
106         (10 pc Spicy Wings, Cheese Dip - Regular)   
28                           (Legendary Feast Bundle)   

                 consequents   support  confidence      lift  
9      (Large Buffalo Fries)  0.000492    0.460251  6.836123  
22       (Ranch Dip - Large)  0.000474    0.546392  5.824382  
21       (Ranch Dip - Large)  0.001002    0.507937  5.414460  
197      (Ranch Dip - Large)  0.000882    0.460280  4.906459  


In [15]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules

# --- Step 1: Assuming rules DataFrames are already in memory ---
# The following variables should be defined from your training run:
# reg_weekday_rules, reg_weekend_rules, guest_weekday_rules, guest_weekend_rules,
# eclub_weekday_rules, eclub_weekend_rules, general_rules
#
# If you are getting a NameError, please re-run the full training code first.

# --- Step 2: Create a dictionary of rules to export ---
rules_to_export = {}

rules_to_export['Registered Weekday'] = reg_weekday_rules
rules_to_export['Registered Weekend'] = reg_weekend_rules
rules_to_export['Guest Weekday'] = guest_weekday_rules
rules_to_export['Guest Weekend'] = guest_weekend_rules
rules_to_export['eClub Weekday'] = eclub_weekday_rules
rules_to_export['eClub Weekend'] = eclub_weekend_rules
rules_to_export['General Fallback'] = general_rules


# --- Step 3: Export Rules to Excel ---
with pd.ExcelWriter('Top_Association_Rules.xlsx') as writer:
    for sheet_name, rules_df in rules_to_export.items():
        if rules_df is not None and not rules_df.empty:
            # Sort the rules by lift and confidence for display
            rules_df_sorted = rules_df.sort_values(by=['lift', 'confidence'], ascending=False)
            rules_df_sorted.head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # Create a simple DataFrame for segments with no rules
            no_rules_df = pd.DataFrame([['No rules found for this segment.']], columns=['Info'])
            no_rules_df.to_excel(writer, sheet_name=sheet_name, index=False)

print("\nAll association rules have been saved to 'Top_Association_Rules.xlsx'.")



All association rules have been saved to 'Top_Association_Rules.xlsx'.


In [23]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean relevant columns
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_CREATED_DATE'] = pd.to_datetime(final_dataset['ORDER_CREATED_DATE'])

# Create the new 'is_weekend' feature
weekend_days = ['Friday', 'Saturday', 'Sunday']
final_dataset['order_day_of_week'] = final_dataset['ORDER_CREATED_DATE'].dt.day_name()
final_dataset['is_weekend'] = final_dataset['order_day_of_week'].isin(weekend_days).astype(int)

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Correctly segment the data by 'CUSTOMER_TYPE' and 'is_weekend'
registered_weekday_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'registered') & (final_dataset_filtered['is_weekend'] == 0)]
registered_weekend_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'registered') & (final_dataset_filtered['is_weekend'] == 1)]
guest_weekday_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'guest') & (final_dataset_filtered['is_weekend'] == 0)]
guest_weekend_data = final_dataset_filtered[(final_dataset_filtered['CUSTOMER_TYPE'] == 'guest') & (final_dataset_filtered['is_weekend'] == 1)]

# General fallback data now includes the remaining rows (eClub, Deleted, Unknown)
main_types = ['registered', 'guest']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Reg. Weekday ({len(registered_weekday_data)}), Reg. Weekend ({len(registered_weekend_data)}), Guest Weekday ({len(guest_weekday_data)}), Guest Weekend ({len(guest_weekend_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

reg_weekday_rules, reg_weekday_score = train_and_validate_segment(registered_weekday_data, 'Registered Weekday', min_support=min_support_large, min_confidence=min_confidence_val)
reg_weekend_rules, reg_weekend_score = train_and_validate_segment(registered_weekend_data, 'Registered Weekend', min_support=min_support_large, min_confidence=min_confidence_val)
guest_weekday_rules, guest_weekday_score = train_and_validate_segment(guest_weekday_data, 'Guest Weekday', min_support=min_support_large, min_confidence=min_confidence_val)
guest_weekend_rules, guest_weekend_score = train_and_validate_segment(guest_weekend_data, 'Guest Weekend', min_support=min_support_large, min_confidence=min_confidence_val)

general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)


MemoryError: Unable to allocate 10.8 MiB for an array with shape (1414410,) and data type int64

In [20]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules

# --- Step 1: Create a dictionary of rules to export ---
# The following variables should be defined from your training run:
# reg_weekday_rules, reg_weekend_rules, guest_weekday_rules, guest_weekend_rules,
# eclub_weekday_rules, eclub_weekend_rules, general_rules

rules_to_export = {}

rules_to_export['Registered Weekday'] = reg_weekday_rules
rules_to_export['Registered Weekend'] = reg_weekend_rules
rules_to_export['Guest Weekday'] = guest_weekday_rules
rules_to_export['Guest Weekend'] = guest_weekend_rules
rules_to_export['eClub Weekday'] = eclub_weekday_rules
rules_to_export['eClub Weekend'] = eclub_weekend_rules
rules_to_export['General Fallback'] = general_rules


# --- Step 2: Export Rules to Excel ---
with pd.ExcelWriter('Top_Association_Rules.xlsx') as writer:
    for sheet_name, rules_df in rules_to_export.items():
        if rules_df is not None and not rules_df.empty:
            # Sort the rules by lift and confidence and take the top 10
            rules_df_sorted = rules_df.sort_values(by=['lift', 'confidence'], ascending=False)
            rules_df_sorted.head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # Create a simple DataFrame for segments with no rules
            no_rules_df = pd.DataFrame([['No rules found for this segment.']], columns=['Info'])
            no_rules_df.to_excel(writer, sheet_name=sheet_name, index=False)

print("\nTop 10 association rules for each category have been saved to 'Top_Association_Rules.xlsx'.")



Top 10 association rules for each category have been saved to 'Top_Association_Rules.xlsx'.


In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found. Please ensure the file exists.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean relevant columns
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_OCCASION_NAME'] = final_dataset['ORDER_OCCASION_NAME'].str.lower()


# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Segment the data
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']

# Nested segments for Registered customers
reg_togo_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'togo']
reg_delivery_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'delivery']

# Nested segments for Guest customers
guest_togo_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'togo']
guest_delivery_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'delivery']

# General fallback data now includes the remaining rows (eClub, Deleted, Unknown)
main_types = ['registered', 'guest']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Reg. ToGo ({len(reg_togo_data)}), Reg. Delivery ({len(reg_delivery_data)}), Guest ToGo ({len(guest_togo_data)}), Guest Delivery ({len(guest_delivery_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

reg_togo_rules, reg_togo_score = train_and_validate_segment(reg_togo_data, 'Registered ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
reg_del_rules, reg_del_score = train_and_validate_segment(reg_delivery_data, 'Registered Delivery', min_support=min_support_large, min_confidence=min_confidence_val)
guest_togo_rules, guest_togo_score = train_and_validate_segment(guest_togo_data, 'Guest ToGo', min_support=min_support_large, min_confidence=min_confidence_val)
guest_del_rules, guest_del_score = train_and_validate_segment(guest_delivery_data, 'Guest Delivery', min_support=min_support_small, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Reg. ToGo (562873), Reg. Delivery (102217), Guest ToGo (128453), Guest Delivery (17466), General Fallback (1551)

--- Training for Registered ToGo segment ---
Generated 445 rules for Registered ToGo.


  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs

KeyboardInterrupt: 

In [27]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules

# --- Step 1: Create a dictionary of rules to export ---
# The following variables should be defined from your training run:
# reg_weekday_rules, reg_weekend_rules, guest_weekday_rules, guest_weekend_rules,
# general_rules

rules_to_export = {}

rules_to_export['Registered Weekday'] = reg_weekday_rules
rules_to_export['Registered Weekend'] = reg_weekend_rules
rules_to_export['Guest Weekday'] = guest_weekday_rules
rules_to_export['Guest Weekend'] = guest_weekend_rules
rules_to_export['General Fallback'] = general_rules


# --- Step 2: Export Rules to Excel ---
with pd.ExcelWriter('Top_Association_Rules.xlsx') as writer:
    for sheet_name, rules_df in rules_to_export.items():
        if rules_df is not None and not rules_df.empty:
            # Sort the rules by lift and confidence and take the top 10
            rules_df_sorted = rules_df.sort_values(by=['lift', 'confidence'], ascending=False)
            rules_df_sorted.head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # Create a simple DataFrame for segments with no rules
            no_rules_df = pd.DataFrame([['No rules found for this segment.']], columns=['Info'])
            no_rules_df.to_excel(writer, sheet_name=sheet_name, index=False)

print("\nTop 10 association rules for each category have been saved to 'Top_Association_Rules.xlsx'.")



Top 10 association rules for each category have been saved to 'Top_Association_Rules.xlsx'.


In [22]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Preparation and Segmentation ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean customer type for consistency
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]
final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()

# Filter for orders with at least two items to simulate a missing item
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()

# Segment the data by 'CUSTOMER_TYPE'
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']

# General fallback data includes all other customers (eClub, etc.)
main_types = ['registered', 'guest']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()

print(f"Data segmented into: Registered ({len(registered_data)}), Guest ({len(guest_data)}), General Fallback ({len(general_data)})")

# --- Step 2: Helper Functions for Training and Validation ---
def get_recommendations(cart_items, rules_df, k=3):
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

def calculate_recall_at_k(rules, test_transactions, k=3):
    correct_predictions = 0
    total_queries = 0
    for order in test_transactions:
        if len(order) < 2:
            continue
        missing_item = order[-1]
        simulated_cart = order[:-1]
        total_queries += 1
        recommendations = get_recommendations(simulated_cart, rules, k)
        if missing_item in recommendations:
            correct_predictions += 1
    recall_score = correct_predictions / total_queries if total_queries > 0 else 0
    return recall_score

def train_and_validate_segment(segment_df, segment_name, min_support, min_confidence):
    print(f"\n--- Training for {segment_name} segment ---")
    if len(segment_df) < 100:
        print("Not enough data to train. Skipping this segment.")
        return None, None

    transactions = segment_df['ITEMS_LIST'].tolist()
    train_transactions, test_transactions = train_test_split(transactions, test_size=0.3, random_state=42)

    te = TransactionEncoder()
    te_ary_train = te.fit(train_transactions).transform(train_transactions)
    df_one_hot_train = pd.DataFrame(te_ary_train, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot_train, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)

    print(f"Generated {len(rules)} rules for {segment_name}.")

    recall_score = calculate_recall_at_k(rules, test_transactions)
    print(f"Validation complete. Recall@3 for {segment_name} is: {recall_score:.4f}")
    return rules, recall_score

# --- Step 3: Run Training and Validation for Each Segment ---
# Use the optimal parameters we found earlier
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

registered_rules, registered_score = train_and_validate_segment(registered_data, 'Registered', min_support=min_support_large, min_confidence=min_confidence_val)
guest_rules, guest_score = train_and_validate_segment(guest_data, 'Guest', min_support=min_support_large, min_confidence=min_confidence_val)
general_rules, general_score = train_and_validate_segment(general_data, 'General Fallback', min_support=min_support_small, min_confidence=min_confidence_val)

Data segmented into: Registered (665090), Guest (145919), General Fallback (1551)

--- Training for Registered segment ---
Generated 464 rules for Registered.
Validation complete. Recall@3 for Registered is: 0.1994

--- Training for Guest segment ---
Generated 478 rules for Guest.
Validation complete. Recall@3 for Guest is: 0.2081

--- Training for General Fallback segment ---
Generated 54 rules for General Fallback.
Validation complete. Recall@3 for General Fallback is: 0.1910


In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

# --- Step 1: Data Loading and Final Model Training ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found. Please ensure the file exists.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean relevant columns
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]

# FIX: Clean item names in the training data before creating the ITEMS_LIST
for col in item_name_columns:
    final_dataset[col] = final_dataset[col].str.lower().str.strip()

final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_CREATED_DATE'] = pd.to_datetime(final_dataset['ORDER_CREATED_DATE'])
final_dataset['ORDER_OCCASION_NAME'] = final_dataset['ORDER_OCCASION_NAME'].str.lower()

# Filter for orders with at least two items
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()


# --- Helper Functions for Training and Prediction ---
def train_segment_model(segment_df, min_support, min_confidence):
    """Trains a model and returns the rules DataFrame."""
    if len(segment_df) < 100:
        return None
    
    transactions = segment_df['ITEMS_LIST'].tolist()
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
    return rules

def get_recommendations(cart_items, rules_df, k=3):
    """Generates top-k recommendations based on association rules."""
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

# Store the final rules in a dictionary
final_rules_dict = {}
min_support_large = 0.0005
min_support_small = 0.0035
min_confidence_val = 0.4

# Segment the data and train the final models on the entire segments
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
main_types = ['registered', 'guest']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()


reg_togo_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'togo']
reg_del_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'delivery']

guest_togo_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'togo']
guest_del_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'delivery']


final_rules_dict['registered_togo'] = train_segment_model(reg_togo_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['registered_delivery'] = train_segment_model(reg_del_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['guest_togo'] = train_segment_model(guest_togo_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['guest_delivery'] = train_segment_model(guest_del_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['general'] = train_segment_model(general_data, min_support=min_support_small, min_confidence=min_confidence_val)


print("Final models trained for all segments. Ready for predictions.")


# --- Step 2: Prediction on the Test Data ---
try:
    test_data_question = pd.read_csv('test_data_question.csv')
except FileNotFoundError:
    print("Error: 'test_data_question.csv' not found. Please ensure the file is in the directory.")
    exit()

# Prepare the test data by creating a list of items in each cart
item_columns_test = [col for col in test_data_question.columns if 'item' in col.lower()]
test_data_question['cart_items'] = test_data_question[item_columns_test].apply(
    lambda row: [str(item).lower().strip() for item in row.dropna()], axis=1
)

# Clean the customer type and order occasion columns directly from the test data
test_data_question['CUSTOMER_TYPE'] = test_data_question['CUSTOMER_TYPE'].str.lower()
test_data_question['ORDER_OCCASION_NAME'] = test_data_question['ORDER_OCCASION_NAME'].str.lower()
test_data_question['CUSTOMER_TYPE'] = test_data_question['CUSTOMER_TYPE'].fillna('unknown')


# --- Step 3: Generate Recommendations based on Segments ---
def get_segmented_recommendations(row, rules_dict):
    customer_type = str(row['CUSTOMER_TYPE']).lower()
    order_occasion = str(row['ORDER_OCCASION_NAME']).lower()
    cart_items = row['cart_items']
    
    # Try to get rules for the specific segment
    key = f"{customer_type}_{order_occasion}"
    rules = rules_dict.get(key)
    
    # If no specific rules found, use the general fallback model
    if rules is None:
        rules = rules_dict.get('general')
        
    if rules is not None:
        return get_recommendations(cart_items, rules)
    else:
        return [None, None, None]

test_data_question['recommendations'] = test_data_question.apply(get_segmented_recommendations, axis=1, rules_dict=final_rules_dict)


# --- Step 4: Final Output Generation ---
submission_df = test_data_question[['CUSTOMER_ID', 'ORDER_ID']].copy()
for i in range(3):
    submission_df[f'RECOMMENDATION_{i+1}'] = test_data_question['recommendations'].apply(
        lambda x: x[i] if len(x) > i else None
    )

# Save the DataFrame to an Excel file
submission_df.to_excel('YourTeamName_Recommendation Output Sheet.xlsx', index=False)
print("\nFinal submission file 'YourTeamName_Recommendation Output Sheet.xlsx' has been created successfully.")


Final models trained for all segments. Ready for predictions.


  right=ast.Str(s=sentinel),
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  re

PermissionError: [Errno 13] Permission denied: 'YourTeamName_Recommendation Output Sheet.xlsx'

In [4]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# --- Step 1: Data Loading and Final Model Training ---
try:
    final_dataset = pd.read_csv('final_dataset_with_all_features.csv', low_memory=False)
except FileNotFoundError:
    print("Error: 'final_dataset_with_all_features.csv' not found. Please ensure the file exists.")
    exit()

# Recreate the 'ITEMS_LIST' column and clean relevant columns
item_name_columns = [col for col in final_dataset.columns if 'item' in col.lower() and 'name' in col.lower()]

# Clean item names in the training data before creating the ITEMS_LIST
for col in item_name_columns:
    final_dataset[col] = final_dataset[col].str.lower().str.strip()

final_dataset['ITEMS_LIST'] = final_dataset[item_name_columns].apply(
    lambda row: [item for item in row.dropna()], axis=1
)
final_dataset['CUSTOMER_TYPE'] = final_dataset['CUSTOMER_TYPE'].str.lower()
final_dataset['ORDER_CREATED_DATE'] = pd.to_datetime(final_dataset['ORDER_CREATED_DATE'])
final_dataset['ORDER_OCCASION_NAME'] = final_dataset['ORDER_OCCASION_NAME'].str.lower()

# Filter for orders with at least two items
final_dataset_filtered = final_dataset[final_dataset['ITEMS_LIST'].str.len() > 1].copy()


# --- Helper Functions for Training and Prediction ---
def train_segment_model(segment_df, min_support, min_confidence):
    """Trains a model and returns the rules DataFrame."""
    if len(segment_df) < 100:
        return None
    
    transactions = segment_df['ITEMS_LIST'].tolist()
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df_one_hot, min_support=min_support, max_len=3, low_memory=True, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    rules.sort_values(by=['lift', 'confidence'], ascending=False, inplace=True)
    return rules

def get_recommendations(cart_items, rules_df, k=3):
    """Generates top-k recommendations based on association rules."""
    recommendations = set()
    for item in cart_items:
        matching_rules = rules_df[rules_df['antecedents'].apply(lambda x: item in x)]
        if not matching_rules.empty:
            for _, rule_row in matching_rules.iterrows():
                consequent_items = list(rule_row['consequents'])
                for rec_item in consequent_items:
                    if rec_item not in cart_items:
                        recommendations.add(rec_item)
                        if len(recommendations) >= k:
                            return list(recommendations)
    return list(recommendations)[:k]

# Store the final rules in a dictionary
final_rules_dict = {}
min_support_large = 0.0001
min_support_small = 0.0035
min_confidence_val = 0.4

# Segment the data and train the final models on the entire segments
registered_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'registered']
guest_data = final_dataset_filtered[final_dataset_filtered['CUSTOMER_TYPE'] == 'guest']
main_types = ['registered', 'guest']
general_data = final_dataset_filtered[~final_dataset_filtered['CUSTOMER_TYPE'].isin(main_types)].copy()


reg_togo_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'togo']
reg_del_data = registered_data[registered_data['ORDER_OCCASION_NAME'] == 'delivery']

guest_togo_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'togo']
guest_del_data = guest_data[guest_data['ORDER_OCCASION_NAME'] == 'delivery']


final_rules_dict['registered_togo'] = train_segment_model(reg_togo_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['registered_delivery'] = train_segment_model(reg_del_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['guest_togo'] = train_segment_model(guest_togo_data, min_support=min_support_large, min_confidence=min_confidence_val)
final_rules_dict['guest_delivery'] = train_segment_model(guest_del_data, min_support=min_support_small, min_confidence=min_confidence_val)
final_rules_dict['general'] = train_segment_model(general_data, min_support=min_support_small, min_confidence=min_confidence_val)


print("Final models trained for all segments. Ready for predictions.")


# --- Step 2: Prediction on the Test Data ---
try:
    test_data_question = pd.read_csv('test_data_question.csv')
except FileNotFoundError:
    print("Error: 'test_data_question.csv' not found. Please ensure the file is in the directory.")
    exit()

# Prepare the test data by creating a list of items in each cart
item_columns_test = [col for col in test_data_question.columns if 'item' in col.lower()]
test_data_question['cart_items'] = test_data_question[item_columns_test].apply(
    lambda row: [str(item).lower().strip() for item in row.dropna()], axis=1
)

# Clean the customer type and order occasion columns directly from the test data
test_data_question['CUSTOMER_TYPE'] = test_data_question['CUSTOMER_TYPE'].str.lower()
test_data_question['ORDER_OCCASION_NAME'] = test_data_question['ORDER_OCCASION_NAME'].str.lower()
test_data_question['CUSTOMER_TYPE'] = test_data_question['CUSTOMER_TYPE'].fillna('unknown')


# --- Step 3: Generate Recommendations based on Segments ---
def get_segmented_recommendations(row, rules_dict):
    customer_type = str(row['CUSTOMER_TYPE']).lower()
    order_occasion = str(row['ORDER_OCCASION_NAME']).lower()
    cart_items = row['cart_items']
    
    # Try to get rules for the specific segment
    key = f"{customer_type}_{order_occasion}"
    rules = rules_dict.get(key)
    
    # If no specific rules found, use the general fallback model
    if rules is None:
        rules = rules_dict.get('general')
        
    if rules is not None:
        return get_recommendations(cart_items, rules)
    else:
        return [None, None, None]

test_data_question['recommendations'] = test_data_question.apply(get_segmented_recommendations, axis=1, rules_dict=final_rules_dict)


# --- Step 4: Final Output Generation ---
submission_df = test_data_question[['CUSTOMER_ID', 'ORDER_ID']].copy()
for i in range(3):
    submission_df[f'RECOMMENDATION_{i+1}'] = test_data_question['recommendations'].apply(
        lambda x: x[i] if len(x) > i else None
    )

# Save the DataFrame to an Excel file
submission_df.to_excel('YourTeamName_Recommendation Output Sheet.xlsx', index=False)
print("\nFinal submission file 'YourTeamName_Recommendation Output Sheet.xlsx' has been created successfully.")


Final models trained for all segments. Ready for predictions.

Final submission file 'YourTeamName_Recommendation Output Sheet.xlsx' has been created successfully.
