In [89]:
import pandas as pd
import polars as pl

In [90]:
pl.Config.set_tbl_cols(300)

polars.config.Config

In [91]:
pl.Config.set_tbl_rows(20)

polars.config.Config

In [93]:
raw_df = pl.read_csv(
    'source/order_history_kaggle_data.csv',
    infer_schema_length=10000,
)

df = raw_df.clone()

print(len(df))
df.head()

21321


Restaurant ID,Restaurant name,Subzone,City,Order ID,Order Placed At,Order Status,Delivery,Distance,Items in order,Instructions,Discount construct,Bill subtotal,Packaging charges,Restaurant discount (Promo),"Restaurant discount (Flat offs, Freebies & others)",Gold discount,Brand pack discount,Total,Rating,Review,Cancellation / Rejection reason,Restaurant compensation (Cancellation),Restaurant penalty (Rejection),KPT duration (minutes),Rider wait time (minutes),Order Ready Marked,Customer complaint tag,Customer ID
i64,str,str,str,i64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6168884918,"""11:38 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""3km""","""1 x Grilled Chicken Jamaican T…",,"""40% off upto Rs.80""",715.0,31.75,80.0,0.0,0.0,0.0,666.75,,,,,,18.35,11.6,"""Correctly""",,"""5d6c2b96db963098bc69768bea504c…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6170707559,"""11:34 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Fries, 1 x Fried…",,"""Flat Rs.175 off""",1179.0,50.2,175.0,0.0,0.0,0.0,1054.2,,,,,,16.95,3.6,"""Correctly""",,"""0781815deb4a10a574e9fee4fa0b86…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6169375019,"""03:52 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""<1km""","""1 x Bone in Peri Peri Grilled …",,"""40% off upto Rs.80""",310.0,11.5,80.0,0.0,0.0,0.0,241.5,,,,,,14.05,12.2,"""Correctly""",,"""f93362f5ce5382657482d164e36818…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6151677434,"""03:45 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Fried Chicken Ghostbuster …",,"""40% off upto Rs.80""",620.0,27.0,80.0,0.0,0.0,0.0,567.0,4.0,,,,,19.0,3.3,"""Correctly""",,"""1ed226d1b8a5f7acee12fc1d667655…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6167540897,"""03:04 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Krispers, 1 x Fr…",,"""40% off upto Rs.80""",584.0,25.2,80.0,0.0,0.0,0.0,529.2,,,,,,15.97,1.0,"""Correctly""",,"""d21a2ac6ea06b31cc3288ab20c4ef2…"


Run a check on column data types - are the types being loaded correctly? 

In [94]:
df_schema = dict(df.schema)
df_schema

{'Restaurant ID': Int64,
 'Restaurant name': String,
 'Subzone': String,
 'City': String,
 'Order ID': Int64,
 'Order Placed At': String,
 'Order Status': String,
 'Delivery': String,
 'Distance': String,
 'Items in order': String,
 'Instructions': String,
 'Discount construct': String,
 'Bill subtotal': Float64,
 'Packaging charges': Float64,
 'Restaurant discount (Promo)': Float64,
 'Restaurant discount (Flat offs, Freebies & others)': Float64,
 'Gold discount': Float64,
 'Brand pack discount': Float64,
 'Total': Float64,
 'Rating': Float64,
 'Review': String,
 'Cancellation / Rejection reason': String,
 'Restaurant compensation (Cancellation)': Float64,
 'Restaurant penalty (Rejection)': Float64,
 'KPT duration (minutes)': Float64,
 'Rider wait time (minutes)': Float64,
 'Order Ready Marked': String,
 'Customer complaint tag': String,
 'Customer ID': String}

Initial observations:
- ID columns ('Restaurant ID', 'Order ID') come as large int objects, Customer ID is a hash string
  - Assert all values are not null & unique
  - Can certain IDs with leading zeroes be getting removed? (Thus, should column be read as string?)
- 'Order Placed At' parsed as string (find a way to convert it to timestamp)
- 'Delivery' + 'Cancellation / Rejection reason' + 'Order Ready Marked' seem to be dropdown strings
  - See examples of values
- 'Distance' parsed as string (**convert it to to float**)
- Billing + Discount columns all come in correctly as floating-point numbers
- 'Rating' is a float (find min, max values)
- 'Instructions', 'Review', 'Customer complaint tag' are strings (see a few examples of types of values + note value length)
  -  Are columns needed for future storage? Replace values with placeholder (e.g. 1s and 0s)?
- 'Restaurant Penalty' is a string (what does the column represent? see a few examples)
- KPT duration + Rider wait time coming in correctly as floating-point numbers

Other notes: 
1. Split df into different parts (multiple business cases in one df)? 
1. Decide what business question(s) to answer using df

In [95]:
df.describe()

statistic,Restaurant ID,Restaurant name,Subzone,City,Order ID,Order Placed At,Order Status,Delivery,Distance,Items in order,Instructions,Discount construct,Bill subtotal,Packaging charges,Restaurant discount (Promo),"Restaurant discount (Flat offs, Freebies & others)",Gold discount,Brand pack discount,Total,Rating,Review,Cancellation / Rejection reason,Restaurant compensation (Cancellation),Restaurant penalty (Rejection),KPT duration (minutes),Rider wait time (minutes),Order Ready Marked,Customer complaint tag,Customer ID
str,f64,str,str,str,f64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
"""count""",21321.0,"""21321""","""21321""","""21321""",21321.0,"""21321""","""21321""","""21321""","""21321""","""21321""","""720""","""15823""",21321.0,21321.0,21321.0,21321.0,21321.0,21321.0,21321.0,2491.0,"""296""","""186""",133.0,3.0,21026.0,21153.0,"""21321""","""469""","""21321"""
"""null_count""",0.0,"""0""","""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""20601""","""5498""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,18830.0,"""21025""","""21135""",21188.0,21318.0,295.0,168.0,"""0""","""20852""","""0"""
"""mean""",20744000.0,,,,6354600000.0,,,,,,,,750.076838,32.564592,65.091816,31.795058,0.099128,3.039324,682.616113,4.356885,,,356.409549,0.0,17.33296,4.82507,,,
"""std""",244719.265249,,,,123030000.0,,,,,,,,498.759428,22.235898,85.401604,131.487091,3.264261,17.07078,465.313977,1.181472,,,328.12824,0.0,6.283388,4.982591,,,
"""min""",20320607.0,"""Aura Pizzas""","""Chittaranjan Park""","""Delhi NCR""",6086800000.0,"""01:00 AM, December 08 2024""","""Delivered""","""Zomato Delivery""","""10km""","""1 x AAC Fried Chicken Burger, …","""Order: 1. Send extra dips. 2. …","""20% off upto Rs.50""",50.0,0.0,0.0,0.0,0.0,0.0,52.5,1.0,"""Absolutely amazing food.. tast…","""Cancelled by Customer""",83.58,0.0,0.0,0.1,"""Correctly""","""Item(s) missing or not deliver…","""000285ae83ecf06a92b936d4f5b743…"
"""25%""",20635699.0,,,,6250800000.0,,,,,,,,459.0,18.45,0.0,0.0,0.0,0.0,387.45,4.0,,,191.95,0.0,13.38,1.0,,,
"""50%""",20659868.0,,,,6357700000.0,,,,,,,,629.0,28.45,80.0,0.0,0.0,0.0,597.45,5.0,,,272.58,0.0,16.33,3.1,,,
"""75%""",20882652.0,,,,6456800000.0,,,,,,,,899.0,39.95,100.0,0.0,0.0,0.0,837.9,5.0,,,397.84,0.0,20.05,7.4,,,
"""max""",21523055.0,"""The Chicken Junction""","""Vasant Kunj""","""Delhi NCR""",6573400000.0,"""12:59 PM, October 30 2024""","""Timed out""","""Zomato Delivery""","""<1km""","""8 x Tipsy Tiger Ginger Ale, 2 …","""Order: will pick up from the r…","""₹ 0.00""",16080.0,603.0,4020.0,7787.0,280.1,554.8,12663.0,5.0,"""yummiest pizza i have eaten in…","""Merchant device issue""",3236.98,0.0,90.87,73.8,"""Missed""","""Wrong item(s) delivered""","""fff9f622424297b6dc5a5a66725896…"


### Goal: Build an ML modeling pipeline to train a regressor to predict the KPT duration (kitchen preparation time)

#### 1. Quantify count and percentage of nulls in each column

In [96]:
df_length = len(df)

nulls_df = df.null_count().unpivot(
    variable_name='column_name',
    value_name='null_count'
).with_columns(
    (pl.col('null_count') / df_length).round(4).alias('null_percentage')
)

print(len(nulls_df))
nulls_df.head()

29


column_name,null_count,null_percentage
str,u32,f64
"""Restaurant ID""",0,0.0
"""Restaurant name""",0,0.0
"""Subzone""",0,0.0
"""City""",0,0.0
"""Order ID""",0,0.0


In [97]:
null_metric = 0.1 # To discuss with Steven what is the most optimal percentage metric to assess nulls

highly_null_fields = set(nulls_df.filter(pl.col('null_percentage') > 0.5)['column_name'])

nulls_df.filter(
    pl.col('null_percentage') > null_metric
)

column_name,null_count,null_percentage
str,u32,f64
"""Instructions""",20601,0.9662
"""Discount construct""",5498,0.2579
"""Rating""",18830,0.8832
"""Review""",21025,0.9861
"""Cancellation / Rejection reaso…",21135,0.9913
"""Restaurant compensation (Cance…",21188,0.9938
"""Restaurant penalty (Rejection)""",21318,0.9999
"""Customer complaint tag""",20852,0.978


In [98]:
# See if ID columns have any null values
nulls_df.filter(
    pl.col('column_name').str.contains('ID')
)

column_name,null_count,null_percentage
str,u32,f64
"""Restaurant ID""",0,0.0
"""Order ID""",0,0.0
"""Customer ID""",0,0.0


In [99]:
# Create an assert statement on ID columns not having any nulls (potential for future use if new source files come in)
assert nulls_df.filter(pl.col('column_name').str.contains('ID'))['null_count'].sum() == 0

In [100]:
null_fields = set(nulls_df.filter(pl.col('null_count') > 0)['column_name'])

nulls_df.filter(
    pl.col('null_count') > 0
)

# Notes: 
# - KPT duration (target variable) contains null in 1.4% of cases
# - Ride wait time (possible classifier?) contains nulls in 0.8% of cases

column_name,null_count,null_percentage
str,u32,f64
"""Instructions""",20601,0.9662
"""Discount construct""",5498,0.2579
"""Rating""",18830,0.8832
"""Review""",21025,0.9861
"""Cancellation / Rejection reaso…",21135,0.9913
"""Restaurant compensation (Cance…",21188,0.9938
"""Restaurant penalty (Rejection)""",21318,0.9999
"""KPT duration (minutes)""",295,0.0138
"""Rider wait time (minutes)""",168,0.0079
"""Customer complaint tag""",20852,0.978


In [101]:
# Filter on records where KPT duration is null
target_null_df = df.filter(
    pl.col('KPT duration (minutes)').is_null()
)

print(len(target_null_df))
target_null_df.head()

295


Restaurant ID,Restaurant name,Subzone,City,Order ID,Order Placed At,Order Status,Delivery,Distance,Items in order,Instructions,Discount construct,Bill subtotal,Packaging charges,Restaurant discount (Promo),"Restaurant discount (Flat offs, Freebies & others)",Gold discount,Brand pack discount,Total,Rating,Review,Cancellation / Rejection reason,Restaurant compensation (Cancellation),Restaurant penalty (Rejection),KPT duration (minutes),Rider wait time (minutes),Order Ready Marked,Customer complaint tag,Customer ID
i64,str,str,str,i64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6160420881,"""08:35 PM, September 07 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Bone in Jamaican Grilled C…",,"""40% off upto Rs.80""",310.0,11.5,80.0,0.0,0.0,0.0,241.5,,,,,,,3.2,"""Missed""",,"""31e6e136219f784a75bb09e36e5ed5…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6148418552,"""11:27 PM, September 04 2024""","""Delivered""","""Zomato Delivery""","""1km""","""1 x Bone in Jamaican Grilled C…",,,745.0,21.75,0.0,310.0,0.0,0.0,456.75,,,,,,,5.6,"""Missed""",,"""096a9f7a0c5cf71b57729f2de20cf0…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6137221693,"""07:31 PM, September 01 2024""","""Delivered""","""Zomato Delivery""","""6km""","""1 x Bone in Smoky Bbq Grilled …",,"""Flat Rs.175 off""",1790.0,80.75,175.0,0.0,0.0,0.0,1695.75,,,,,,,16.6,"""Missed""",,"""e33afa09633ab31d30c14d93c78529…"
20554001,"""Swaad""","""Greater Kailash 2 (GK2)""","""Delhi NCR""",6165976485,"""12:43 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Bone in Smoky Bbq Grilled …",,"""50% off upto Rs.100""",420.0,16.0,100.0,0.0,0.0,0.0,336.0,,,,,,,5.9,"""Missed""",,"""22ef48e9b486ef7ce92cb80ad133cf…"
20554001,"""Swaad""","""Greater Kailash 2 (GK2)""","""Delhi NCR""",6151212614,"""01:26 PM, September 04 2024""","""Delivered""","""Zomato Delivery""","""5km""","""1 x Bone in Jamaican Grilled C…",,,2090.0,52.25,0.0,1045.0,0.0,0.0,1097.25,5.0,,,,,,9.5,"""Missed""",,"""feaeeaa1d9d90bdbdd0ac0261a39cb…"


In [102]:
# Find fields that could be the reason for why null values are coming in for target variable
target_null_df.select(
    pl.all().n_unique()
).unpivot(
    variable_name='column_name',
    value_name='unique_count'
).filter(
    pl.col('unique_count') == 1
)

column_name,unique_count
str,u32
"""City""",1
"""Delivery""",1
"""KPT duration (minutes)""",1
"""Order Ready Marked""",1


In [103]:
target_null_assess_fields = set(target_null_df.select(
    pl.all().n_unique()
).unpivot(
    variable_name='column_name',
    value_name='unique_count'
).filter(
    pl.col('unique_count') == 1
)['column_name'])

target_null_df.select(target_null_assess_fields).unique()

# Notes:
# - Order Ready Marked is 'Missed' for all values

Order Ready Marked,Delivery,KPT duration (minutes),City
str,str,f64,str
"""Missed""","""Zomato Delivery""",,"""Delhi NCR"""


In [104]:
df.filter(
    pl.col('Order Ready Marked') == 'Missed'
).select(
    pl.col('KPT duration (minutes)').n_unique()
)

# Notes:
# - Wherever KPT duration is null then Order Ready Marked = 'Missed'
# - However when Order Ready Marked = 'Missed', many values for KPT duration
# - (one to many relationship)

KPT duration (minutes)
u32
44


#### 2. Standardize column names

In [105]:
df.columns

['Restaurant ID',
 'Restaurant name',
 'Subzone',
 'City',
 'Order ID',
 'Order Placed At',
 'Order Status',
 'Delivery',
 'Distance',
 'Items in order',
 'Instructions',
 'Discount construct',
 'Bill subtotal',
 'Packaging charges',
 'Restaurant discount (Promo)',
 'Restaurant discount (Flat offs, Freebies & others)',
 'Gold discount',
 'Brand pack discount',
 'Total',
 'Rating',
 'Review',
 'Cancellation / Rejection reason',
 'Restaurant compensation (Cancellation)',
 'Restaurant penalty (Rejection)',
 'KPT duration (minutes)',
 'Rider wait time (minutes)',
 'Order Ready Marked',
 'Customer complaint tag',
 'Customer ID']

In [106]:
df_cols_standardized = []
for col in df.columns:
    col_standardized = col.replace('(', '').replace(')', '').replace('/', '').lower().replace(' ', '_')

    # print(col_standardized)

    # Manually adjust columns:
    if col_standardized == 'cancellation__rejection_reason':
        col_standardized = 'cancellation_rejection_reason'
    elif col_standardized == 'restaurant_discount_flat_offs,_freebies_&_others':
        col_standardized = 'restaurant_discount_others'

    df_cols_standardized.append(col_standardized)

df.columns = df_cols_standardized

print(len(df))
df.head()

21321


restaurant_id,restaurant_name,subzone,city,order_id,order_placed_at,order_status,delivery,distance,items_in_order,instructions,discount_construct,bill_subtotal,packaging_charges,restaurant_discount_promo,restaurant_discount_others,gold_discount,brand_pack_discount,total,rating,review,cancellation_rejection_reason,restaurant_compensation_cancellation,restaurant_penalty_rejection,kpt_duration_minutes,rider_wait_time_minutes,order_ready_marked,customer_complaint_tag,customer_id
i64,str,str,str,i64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6168884918,"""11:38 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""3km""","""1 x Grilled Chicken Jamaican T…",,"""40% off upto Rs.80""",715.0,31.75,80.0,0.0,0.0,0.0,666.75,,,,,,18.35,11.6,"""Correctly""",,"""5d6c2b96db963098bc69768bea504c…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6170707559,"""11:34 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Fries, 1 x Fried…",,"""Flat Rs.175 off""",1179.0,50.2,175.0,0.0,0.0,0.0,1054.2,,,,,,16.95,3.6,"""Correctly""",,"""0781815deb4a10a574e9fee4fa0b86…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6169375019,"""03:52 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""<1km""","""1 x Bone in Peri Peri Grilled …",,"""40% off upto Rs.80""",310.0,11.5,80.0,0.0,0.0,0.0,241.5,,,,,,14.05,12.2,"""Correctly""",,"""f93362f5ce5382657482d164e36818…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6151677434,"""03:45 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Fried Chicken Ghostbuster …",,"""40% off upto Rs.80""",620.0,27.0,80.0,0.0,0.0,0.0,567.0,4.0,,,,,19.0,3.3,"""Correctly""",,"""1ed226d1b8a5f7acee12fc1d667655…"
20320607,"""Swaad""","""Sector 4""","""Delhi NCR""",6167540897,"""03:04 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Krispers, 1 x Fr…",,"""40% off upto Rs.80""",584.0,25.2,80.0,0.0,0.0,0.0,529.2,,,,,,15.97,1.0,"""Correctly""",,"""d21a2ac6ea06b31cc3288ab20c4ef2…"


#### 3. Ensure each datatype is consistent with expectation

In [107]:
dict(df.schema)

# Notes:
# - Id columns should all be string 
#   -> (only convert to string, be mindful if downstream stakeholders say that the source data should not be initially parsed as int)
#   -> (e.g. '01' ID is different than '1', in which case the initial read_csv() line should be altered)
# - order_placed_at should be converted to datetime / timestamp
# - distance should be converted to float
# - items_in_order to be converted to a list of tuples -> [('Grilled Chicken', 1), ('Peri Peri Fries', 1)] -> pushed to later as mentioned in task 6
# - order_ready_marked to be one-hot encoded

{'restaurant_id': Int64,
 'restaurant_name': String,
 'subzone': String,
 'city': String,
 'order_id': Int64,
 'order_placed_at': String,
 'order_status': String,
 'delivery': String,
 'distance': String,
 'items_in_order': String,
 'instructions': String,
 'discount_construct': String,
 'bill_subtotal': Float64,
 'packaging_charges': Float64,
 'restaurant_discount_promo': Float64,
 'restaurant_discount_others': Float64,
 'gold_discount': Float64,
 'brand_pack_discount': Float64,
 'total': Float64,
 'rating': Float64,
 'review': String,
 'cancellation_rejection_reason': String,
 'restaurant_compensation_cancellation': Float64,
 'restaurant_penalty_rejection': Float64,
 'kpt_duration_minutes': Float64,
 'rider_wait_time_minutes': Float64,
 'order_ready_marked': String,
 'customer_complaint_tag': String,
 'customer_id': String}

In [160]:
set(df.filter(pl.col('order_status').is_not_null())['order_status'])

{'Delivered',
 'Picked up',
 'Rejected',
 'Return cancelled',
 'Returned',
 'Timed out'}

In [108]:
set(df.filter(pl.col('order_ready_marked').is_not_null())['order_ready_marked'])

{'Correctly', 'Incorrectly', 'Missed'}

In [109]:
set(df.filter(pl.col('cancellation_rejection_reason').is_not_null())['cancellation_rejection_reason'])

{'Cancelled by Customer',
 'Cancelled by Zomato',
 'Items out of stock',
 'Kitchen is full',
 'Merchant device issue'}

In [110]:
set(df.filter(pl.col('review').is_not_null())['review'])

{'Absolutely amazing food.. taste was mind-blowing... ordered for the 1st time but definitely not the last time... loved it',
 'Always a delight',
 'Always a delight to order from this place',
 'Amazing',
 'Amazing flavours',
 'Amazing food and quality always , Highly recommend to all chicken lovers , every dish is delicious and the sauces are also amazing',
 'Amazing food. Becoming a regular. Keep it up!',
 'Amazing pizza',
 'Amazing pizza, really liked the food',
 'Amazing pizzas',
 'Amazing work and taste.',
 'Animal fries are fireee but the burger was soooo disappointing since it felt like a 50rs burger from a street vendor that had little bones in between.',
 'As excellent as always. Great to see quality of food being maintained',
 'Awesome base great taste',
 'Awesome thanks',
 'Bestt wings ever!!!',
 'Both the Pizza and the Melt had a very weird cardboard like taste to it. Felt as if something is wrong with the dough. I have been a regular customer but was very disappointed with

In [111]:
set(df.filter(pl.col('instructions').is_not_null())['instructions'])

{'Order: 1. Send extra dips.\n2. Send extra Napkins.',
 'Order: 402 Savitri ke sir. Extra angara sauce dressing side Rakho, khud pick kar lunga aur bahut kam spicy',
 'Order: 7 packs of origanno and chilly flakes. Extra cheese. LESS SPICY. send extra cutlery',
 'Order: Absolutely no cheese please',
 'Order: Add ample marinara sauce in the pizza, it was very dry last time. Also, please add extra peri peri',
 'Order: Add extra bbq and peri peri sauce',
 'Order: Add extra broccoli and paneer',
 'Order: Add extra dips',
 'Order: Add extra onions and capsicums. please',
 'Order: Add extra peri peri masala on top',
 "Order: Add extra seekh. Everything should be fresh. Add all the dips otherwise I won't accept the order.",
 'Order: Add extra vinegar onions',
 'Order: Add extra vinegar onions and sliced onions',
 'Order: Add for fies extra and cheese dip (ordered 100times',
 'Order: Add jalapeños nicely',
 'Order: Add large pieces',
 'Order: Add lots of vegetables and broccoli in pizza and mak

In [125]:
id_columns = {x for x in df.columns if '_id' in x}

# Re-create the df with each id column cast as string
df_recast_ids = df.clone()

for column in id_columns:
    df_recast_ids = df_recast_ids.with_columns(
        pl.col(column).cast(pl.Utf8).alias(column)
    )

assert len(df_recast_ids) == len(df)

print(len(df_recast_ids))
df_recast_ids.head()

21321


restaurant_id,restaurant_name,subzone,city,order_id,order_placed_at,order_status,delivery,distance,items_in_order,instructions,discount_construct,bill_subtotal,packaging_charges,restaurant_discount_promo,restaurant_discount_others,gold_discount,brand_pack_discount,total,rating,review,cancellation_rejection_reason,restaurant_compensation_cancellation,restaurant_penalty_rejection,kpt_duration_minutes,rider_wait_time_minutes,order_ready_marked,customer_complaint_tag,customer_id
str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6168884918""","""11:38 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""3km""","""1 x Grilled Chicken Jamaican T…",,"""40% off upto Rs.80""",715.0,31.75,80.0,0.0,0.0,0.0,666.75,,,,,,18.35,11.6,"""Correctly""",,"""5d6c2b96db963098bc69768bea504c…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6170707559""","""11:34 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Fries, 1 x Fried…",,"""Flat Rs.175 off""",1179.0,50.2,175.0,0.0,0.0,0.0,1054.2,,,,,,16.95,3.6,"""Correctly""",,"""0781815deb4a10a574e9fee4fa0b86…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6169375019""","""03:52 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""<1km""","""1 x Bone in Peri Peri Grilled …",,"""40% off upto Rs.80""",310.0,11.5,80.0,0.0,0.0,0.0,241.5,,,,,,14.05,12.2,"""Correctly""",,"""f93362f5ce5382657482d164e36818…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6151677434""","""03:45 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Fried Chicken Ghostbuster …",,"""40% off upto Rs.80""",620.0,27.0,80.0,0.0,0.0,0.0,567.0,4.0,,,,,19.0,3.3,"""Correctly""",,"""1ed226d1b8a5f7acee12fc1d667655…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6167540897""","""03:04 PM, September 10 2024""","""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Krispers, 1 x Fr…",,"""40% off upto Rs.80""",584.0,25.2,80.0,0.0,0.0,0.0,529.2,,,,,,15.97,1.0,"""Correctly""",,"""d21a2ac6ea06b31cc3288ab20c4ef2…"


In [126]:
set(df_recast_ids['order_placed_at'])

{'07:36 PM, December 13 2024',
 '10:01 PM, September 26 2024',
 '04:09 PM, December 08 2024',
 '12:12 AM, September 07 2024',
 '02:46 AM, January 18 2025',
 '09:51 PM, September 24 2024',
 '07:20 PM, October 02 2024',
 '09:24 PM, October 04 2024',
 '09:08 PM, October 02 2024',
 '07:17 PM, October 31 2024',
 '01:09 PM, November 26 2024',
 '01:47 PM, October 03 2024',
 '08:56 PM, October 29 2024',
 '09:53 PM, January 18 2025',
 '03:35 PM, November 05 2024',
 '08:48 PM, December 28 2024',
 '05:54 PM, January 20 2025',
 '02:54 AM, October 19 2024',
 '10:21 PM, January 17 2025',
 '08:02 PM, September 02 2024',
 '11:02 PM, November 22 2024',
 '01:22 PM, October 12 2024',
 '03:12 PM, October 13 2024',
 '12:31 AM, September 23 2024',
 '08:31 PM, October 19 2024',
 '07:16 PM, October 14 2024',
 '11:01 PM, December 27 2024',
 '07:45 PM, January 26 2025',
 '11:52 AM, December 14 2024',
 '08:10 PM, September 28 2024',
 '11:05 PM, October 20 2024',
 '08:00 PM, October 10 2024',
 '10:11 PM, October 

In [135]:
# Cast 'order_placed_at' to datetime
# Ask Steven if timezones should be considered 
df_recast_order_time = df_recast_ids.with_columns(
    pl.col('order_placed_at').str.strptime(pl.Datetime, format='%I:%M %p, %B %d %Y').alias('order_placed_at')
)

print(len(df_recast_order_time))
df_recast_order_time.head()

21321


restaurant_id,restaurant_name,subzone,city,order_id,order_placed_at,order_status,delivery,distance,items_in_order,instructions,discount_construct,bill_subtotal,packaging_charges,restaurant_discount_promo,restaurant_discount_others,gold_discount,brand_pack_discount,total,rating,review,cancellation_rejection_reason,restaurant_compensation_cancellation,restaurant_penalty_rejection,kpt_duration_minutes,rider_wait_time_minutes,order_ready_marked,customer_complaint_tag,customer_id
str,str,str,str,str,datetime[μs],str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,str
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6168884918""",2024-09-10 23:38:00,"""Delivered""","""Zomato Delivery""","""3km""","""1 x Grilled Chicken Jamaican T…",,"""40% off upto Rs.80""",715.0,31.75,80.0,0.0,0.0,0.0,666.75,,,,,,18.35,11.6,"""Correctly""",,"""5d6c2b96db963098bc69768bea504c…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6170707559""",2024-09-10 23:34:00,"""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Fries, 1 x Fried…",,"""Flat Rs.175 off""",1179.0,50.2,175.0,0.0,0.0,0.0,1054.2,,,,,,16.95,3.6,"""Correctly""",,"""0781815deb4a10a574e9fee4fa0b86…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6169375019""",2024-09-10 15:52:00,"""Delivered""","""Zomato Delivery""","""<1km""","""1 x Bone in Peri Peri Grilled …",,"""40% off upto Rs.80""",310.0,11.5,80.0,0.0,0.0,0.0,241.5,,,,,,14.05,12.2,"""Correctly""",,"""f93362f5ce5382657482d164e36818…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6151677434""",2024-09-10 15:45:00,"""Delivered""","""Zomato Delivery""","""2km""","""1 x Fried Chicken Ghostbuster …",,"""40% off upto Rs.80""",620.0,27.0,80.0,0.0,0.0,0.0,567.0,4.0,,,,,19.0,3.3,"""Correctly""",,"""1ed226d1b8a5f7acee12fc1d667655…"
"""20320607""","""Swaad""","""Sector 4""","""Delhi NCR""","""6167540897""",2024-09-10 15:04:00,"""Delivered""","""Zomato Delivery""","""2km""","""1 x Peri Peri Krispers, 1 x Fr…",,"""40% off upto Rs.80""",584.0,25.2,80.0,0.0,0.0,0.0,529.2,,,,,,15.97,1.0,"""Correctly""",,"""d21a2ac6ea06b31cc3288ab20c4ef2…"


In [137]:
# Assert re-casting of 'order_placed_at' field did not lead to any new nulls
assert df_recast_order_time.null_count()['order_placed_at'].item() == df_recast_ids.null_count()['order_placed_at'].item()

In [None]:
# Check if re-casting looks correct based on one example
print(df_recast_ids.filter(pl.col('order_id')=='6168884918')['order_placed_at'])

print(df_recast_order_time.filter(pl.col('order_id')=='6168884918')['order_placed_at'])

shape: (1,)
Series: 'order_placed_at' [str]
[
	"11:38 PM, September 10 2024"
]
shape: (1,)
Series: 'order_placed_at' [datetime[μs]]
[
	2024-09-10 23:38:00
]


In [151]:
set(df_recast_order_time['distance'])

{'10km',
 '11km',
 '12km',
 '13km',
 '14km',
 '15km',
 '16km',
 '17km',
 '18km',
 '19km',
 '1km',
 '20km',
 '21km',
 '2km',
 '3km',
 '4km',
 '5km',
 '6km',
 '7km',
 '8km',
 '9km',
 '<1km'}

In [159]:
# Remove the km from 'distance' field and re-cast as float
# '<1km' value will be changed to 0.5 (ask Steven his thoughts on this) 
df_recast = df_recast_order_time.with_columns(
    pl.when(
        pl.col('distance') == '<1km'
    ).then(
        pl.lit('0.5km')
    ).otherwise(
        pl.col('distance')
    ).str.replace('km', '').cast(pl.Float32).alias('distance_km')
)

assert len(df_recast) == len(df)
assert df_recast.null_count()['distance_km'].item() == df.null_count()['distance'].item()

print(len(df_recast))
df_recast.select(['distance', 'distance_km']).head()

21321


distance,distance_km
str,f32
"""3km""",3.0
"""2km""",2.0
"""<1km""",0.5
"""2km""",2.0
"""2km""",2.0


#### 4. Standardize the values in each column

In [214]:
# Find fields where only a few values exist (potentially to be one hot encoded) 

unique_vals_df = df_recast.select(
    pl.all().n_unique()
).unpivot(
    variable_name='column_name',
    value_name='unique_values'
)

null_vals_df = df_recast.select(
    pl.all().null_count()
).unpivot(
    variable_name='column_name',
    value_name='null_values'
).with_columns(
    pl.when(
        pl.col('null_values') == 0
    ).then(
        pl.col('null_values')
    ).otherwise(
        pl.lit(1)
    ).alias('unique_null_values')
)

assert len(null_vals_df) == len(unique_vals_df)

unique_vals_non_null_df = unique_vals_df.join(
    null_vals_df, 
    how='outer', 
    on='column_name', 
    coalesce=True
).with_columns(
    (pl.col('unique_values') - pl.col('unique_null_values')).alias('unique_values_non_null')
)

potential_encoded_fields = set(unique_vals_non_null_df.filter(pl.col('unique_values_non_null') <= 5)['column_name'])

unique_vals_non_null_df.filter(
    pl.col('unique_values_non_null') <= 5 # To discuss with Steven the most-optimal number here
)

# Notes:
# - Some fields need to be checked to see if they are ready to be one-hot encoded

  unique_vals_non_null_df = unique_vals_df.join(


column_name,unique_values,null_values,unique_null_values,unique_values_non_null
str,u32,u32,u32,u32
"""city""",1,0,0,1
"""delivery""",1,0,0,1
"""rating""",6,18830,1,5
"""cancellation_rejection_reason""",6,21135,1,5
"""restaurant_penalty_rejection""",2,21318,1,1
"""order_ready_marked""",3,0,0,3
"""customer_complaint_tag""",6,20852,1,5


In [213]:
for column in potential_encoded_fields:
    print(f'{column}: {list(df_recast.select(pl.col(column)).unique()[column])}')

# Notes: 
# - 'order_ready_marked' field looks best to be one-hot encoded as it contains fewest unique values and has no nulls

order_ready_marked: ['Incorrectly', 'Correctly', 'Missed']
cancellation_rejection_reason: [None, 'Cancelled by Customer', 'Cancelled by Zomato', 'Merchant device issue', 'Kitchen is full', 'Items out of stock']
customer_complaint_tag: ['Non-refunded complaint', 'Item(s) missing or not delivered', 'Poor packaging or spillage', 'Poor taste or quality', None, 'Wrong item(s) delivered']
city: ['Delhi NCR']
delivery: ['Zomato Delivery']
rating: [None, 3.0, 5.0, 2.0, 1.0, 4.0]
restaurant_penalty_rejection: [0.0, None]


In [208]:
# One-hot encode values from the 'order_ready_marked' field

df_encoded = df_recast.to_dummies('order_ready_marked')

assert len(df_encoded) == len(raw_df)

# Print newly generated columns
print(f'New columns generated: {set(df_encoded.columns) - set(df_recast.columns)}')

# Re-convert all columns to be lower-case after encoding
df_encoded.columns = [x.lower() for x in df_encoded.columns]

# Assert new encoded columns do not have any nulls (as base column did not contain nulls)
assert df_encoded.select(pl.col(column) for column in df_encoded.columns if 'order_ready_marked' in column).null_count().sum_horizontal().item() == 0

df_encoded.select(
    pl.col(column) for column in df_encoded.columns if 'order_ready_marked' in column
).head()

New columns generated: {'order_ready_marked_Correctly', 'order_ready_marked_Missed', 'order_ready_marked_Incorrectly'}


order_ready_marked_correctly,order_ready_marked_incorrectly,order_ready_marked_missed
u8,u8,u8
1,0,0
1,0,0
1,0,0
1,0,0
1,0,0
