In [25]:
!pip install pandas



In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl

Merges

In [27]:
# Function to assign tiers
def categorize_city(number_of_residents):
    """Map a city's population to a tier label.

    The thresholds are exclusive lower bounds: more than 1,000,000 residents
    is 'Tier 1', more than 400,000 is 'Tier 2', more than 100,000 is 'Tier 3'
    and everything else is 'Tier 4'.

    Args:
        number_of_residents: Population as a number. May be missing
            (None, NaN or pd.NA), e.g. when the source value could not be
            parsed.

    Returns:
        The tier label as a string, or None when the population is missing.
    """
    # NaN compares False against every threshold, so without this guard a city
    # with an unknown population would silently be reported as 'Tier 4'.
    if pd.isna(number_of_residents):
        return None

    if number_of_residents > 1000000:
        return 'Tier 1'
    if number_of_residents > 400000:
        return 'Tier 2'
    if number_of_residents > 100000:
        return 'Tier 3'
    return 'Tier 4'
    
# Read the population CSV (sourced from Wikipedia).
# Malformed rows are meant to be fixed in the data file itself rather than
# skipped while parsing. Only the first ten columns are read.
# '2022' is forced to str so the string clean-up below never depends on how
# pandas happens to infer the column's type.
df_city = pd.read_csv(
    './data/population.csv',
    delimiter=",",
    usecols=range(10),
    dtype={'2022': str},
)

# Keep only the columns needed downstream. The explicit copy makes df_city an
# independent frame before columns are assigned on it.
df_city = df_city[['Name', '2022', 'Bundesland']].copy()

# Strip the dots from the population figures (presumably thousands separators,
# e.g. "1.234.567" -- verify against the data file) and convert to numbers.
# regex=False states that '.' is a literal dot, not the regex wildcard; older
# pandas versions default to regex=True and warn about this exact pattern.
df_city['2022'] = df_city['2022'].str.replace('.', '', regex=False)
# Values that still cannot be parsed become NaN instead of raising.
df_city['2022'] = pd.to_numeric(df_city['2022'], errors='coerce')

# Assign tiers by calling the function
df_city['City Tier'] = df_city['2022'].apply(categorize_city)

# Load the raw vendor export.
df_vendor = pd.read_csv('./data/vendors.csv', sep=",", low_memory=False)

# Vendor attributes that are carried into the merged frames.
df_vendor_subset = df_vendor[[
    'Vendor ID',
    'Vendor Name Platform',
    'Brand',
    'City',
    'Region',
    'Legal Entity Name',
]]

# Give the city columns descriptive names: '2022' holds the population figure
# and 'Name' becomes the key used to match vendors to cities.
df_city = df_city.rename(columns={'Name': 'Vendor_City', '2022': 'Population'})

# Attach population and tier to every vendor by city name. This is pandas'
# default inner join, so vendors whose 'City' has no match in df_city are
# dropped, as are cities without vendors.
df_merged = df_city.merge(df_vendor_subset, left_on='Vendor_City', right_on='City')
# Both key columns ('Vendor_City' and 'City') stay in the result; dropping the
# duplicate is currently disabled:
# df_merged.drop('City', axis=1, inplace=True)

# Load the raw food-order export.
df_orders = pd.read_csv('./data/food_orders.csv', sep=",", low_memory=False)

# Order attributes that the metric cells rely on.
df_orders_subset = df_orders[[
    'Vendor ID',
    'Vendor Name',
    'Order ID',
    'Ordered At',
    'Vendor Region',
    'Brand',
    'Order Source Name',
    'Order Source Type',
    'Fulfilment Type',
    'Gmv',
    'Rating Food',
    'Rating Delivery',
    'Vouchers Total Value Gross',
]]

# One row per order, enriched with the vendor's city, population and tier
# (default inner join on 'Vendor ID'). 'Brand' exists on both sides, so the
# result carries it as 'Brand_x' (vendor side) and 'Brand_y' (order side).
df_orders_per_vendor = df_merged.merge(df_orders_subset, on='Vendor ID')


Calculations

In [28]:
# City Metrics
#
# Builds one row per city (order volume, vendor count, population, orders per
# resident, average ratings, tier) and writes it to city_metrics.csv.

orders_by_city = df_orders_per_vendor.groupby('Vendor_City')

# No. of orders per city
city_order_counts = orders_by_city['Order ID'].count().astype(float)

# No. of vendors per city
vendor_counts = orders_by_city['Vendor ID'].nunique()

# Population per City
population_per_city = df_city.set_index('Vendor_City')['Population']

# Avg. Order Volume per Resident per City.
# The division aligns on city name; cities missing on either side (or with an
# unknown population) yield NaN and are dropped here.
city_avg_order_volume_per_resident = (city_order_counts / population_per_city).dropna()

# Avg. Food Rating per City
avg_food_rating_per_city = orders_by_city['Rating Food'].mean()

# Avg. Delivery Rating per City
avg_delivery_rating_per_city = orders_by_city['Rating Delivery'].mean()


# Combine the metrics. The Series are aligned on city name; because
# population_per_city covers every city in df_city, cities without any orders
# appear as rows with NaN order metrics.
df_city_metrics = pd.DataFrame({
    'Total No. of orders': city_order_counts,
    'Total No. of vendors': vendor_counts,
    'Population': population_per_city,
    'Avg. Order Volume per Resident per City': city_avg_order_volume_per_resident,
    'Avg. Food Rating': avg_food_rating_per_city,
    'Avg. Delivery Rating': avg_delivery_rating_per_city
})

# Turn the city index into a 'Vendor_City' column. The axis is named
# explicitly so that the merge key below is guaranteed to exist, instead of
# depending on the index name surviving the alignment above.
df_city_metrics = df_city_metrics.rename_axis('Vendor_City').reset_index()

# Add the City Tier from df_city
df_city_metrics = df_city_metrics.merge(
    df_city[['Vendor_City', 'City Tier']],
    on='Vendor_City',
    how='left',
)

# Generate CSV
df_city_metrics.to_csv('city_metrics.csv', index=False)

In [29]:
# Vendor Metrics
#
# One row per vendor: order volume, average order value, average ratings and
# the span of days between first and last order. Exported to vendor_metrics.csv.

orders_by_vendor = df_orders_per_vendor.groupby('Vendor ID')

# No. of Orders per Vendor, largest first
total_orders_per_vendor = (
    orders_by_vendor['Order ID']
    .count()
    .sort_values(ascending=False)
)

# AOV per vendor (mean Gmv of the vendor's orders)
avg_order_value_per_vendor = orders_by_vendor['Gmv'].mean()

# Avg. food / delivery rating per vendor
avg_food_rating_per_vendor = orders_by_vendor['Rating Food'].mean()
avg_delivery_rating_per_vendor = orders_by_vendor['Rating Delivery'].mean()

# Reduce every order timestamp to its UTC calendar date. This overwrites the
# 'Ordered At' column of df_orders_per_vendor itself.
ordered_at_utc = pd.to_datetime(df_orders_per_vendor['Ordered At'], utc=True)
df_orders_per_vendor['Ordered At'] = ordered_at_utc.dt.date

# First and last order date per vendor, converted back to datetime64 so the
# difference can be expressed in days.
order_date_by_vendor = df_orders_per_vendor.groupby('Vendor ID')['Ordered At']
vendor_first_order_date = pd.to_datetime(order_date_by_vendor.min())
vendor_last_order_date = pd.to_datetime(order_date_by_vendor.max())

# Inclusive day count: a vendor whose orders all fall on one date operated 1 day.
vendor_operating_days = (vendor_last_order_date - vendor_first_order_date).dt.days + 1

# Avg. daily orders per vendor. The operands carry different Series names,
# so the resulting Series is unnamed.
avg_daily_orders_per_vendor = total_orders_per_vendor / vendor_operating_days


# Assemble the metrics (aligned on Vendor ID) and expose the ID as a column.
df_vendor_metrics = pd.DataFrame({
    'Total No. of Orders': total_orders_per_vendor,
    'AOV': avg_order_value_per_vendor,
    'Avg. Food Rating': avg_food_rating_per_vendor,
    'Avg. Delivery Rating': avg_delivery_rating_per_vendor,
    'Days operating': vendor_operating_days,
    'Avg. Daily orders': avg_daily_orders_per_vendor,
    'First order date': vendor_first_order_date,
    'Last order date': vendor_last_order_date
}).reset_index()

# Generate CSV
df_vendor_metrics.to_csv('vendor_metrics.csv', index=False)



                Vendor ID  Total No. of Orders        AOV  Avg. Food Rating  \
0        DE_Augsburg_0001                  127  25.946457          3.888889   
1        DE_Augsburg_0002                  523  28.798719          3.361702   
2    DE_BadSchwartau_0001                   28  20.403571          3.714286   
3          DE_Berlin_0001                 1626  25.398942          3.816216   
4          DE_Berlin_0002                 2102  26.205961          4.373950   
..                    ...                  ...        ...               ...   
185           DE_Ulm_0002                 2548  31.426805          3.968750   
186        DE_Werder_0001                  937  31.148933          4.285714   
187        DE_Werder_0002                  438  28.889840          2.333333   
188     DE_Wuppertal_0001                 2179  27.113525          2.860000   
189     DE_Wuppertal_0002                  154  32.300974          2.962963   

     Avg. Delivery Rating  Days operating  Avg. Dai

In [30]:
# Tier metrics
#
# Aggregates the order data per city tier. Besides df_orders_per_vendor and
# df_city this cell uses avg_daily_orders_per_vendor, which is computed in the
# vendor metrics cell -- run that cell first.

# Total No. of orders per Tier
tier_order_counts = df_orders_per_vendor.groupby('City Tier')['Order ID'].count().astype(float)

print(tier_order_counts)

# AOV per Tier
aov_per_tier = df_orders_per_vendor.groupby('City Tier')['Gmv'].mean()

# Total GMV per Tier
total_gmv_per_tier = df_orders_per_vendor.groupby('City Tier')['Gmv'].sum()

# Total Population per Tier.
# Only cities in which at least one order was placed count towards the
# population, so the per-resident figure below refers to active cities only.
unique_cities_with_orders = df_orders_per_vendor['Vendor_City'].unique()
df_city_with_orders = df_city[df_city['Vendor_City'].isin(unique_cities_with_orders)]
total_population_per_tier = df_city_with_orders.groupby('City Tier')['Population'].sum().astype(float)

# Avg. Order Volume per Resident Per Tier (Count of orders/Population in active cities)
tier_avg_order_volume_per_resident = tier_order_counts / total_population_per_tier

# Avg. Food Rating per Tier
avg_food_rating_per_tier = df_orders_per_vendor.groupby('City Tier')['Rating Food'].mean()

# Avg. Delivery Rating per Tier
avg_delivery_rating_per_tier = df_orders_per_vendor.groupby('City Tier')['Rating Delivery'].mean()

# No. of different fulfilment types per Tier
# total_no_own_delivery = df_orders_per_vendor.groupby('Fulfilment Type')['Order ID'].count()

# No. of unique Vendors per Tier
count_vendors_per_tier = df_orders_per_vendor.groupby('City Tier')['Vendor ID'].nunique()

# Avg. Daily Orders per Vendor per Tier:
# look up each vendor's tier, then average the per-vendor daily order rate
# within each tier. The rate column is named explicitly so the lookup does not
# depend on the upstream Series being unnamed (which would surface as column 0).
vendor_tiers = df_orders_per_vendor[['Vendor ID', 'City Tier']].drop_duplicates()
df_avg_daily_orders = pd.merge(
    avg_daily_orders_per_vendor.rename('Avg. Daily orders').reset_index(),
    vendor_tiers,
    on='Vendor ID',
)
avg_daily_orders_per_vendor_per_tier = (
    df_avg_daily_orders.groupby('City Tier')['Avg. Daily orders'].mean()
)

# Combine all metrics into a single DataFrame (aligned on City Tier)
df_tier_metrics = pd.DataFrame({
    'Total No. of orders': tier_order_counts,
    'AOV': aov_per_tier,
    'Total GMV': total_gmv_per_tier,
    'Total Population': total_population_per_tier,
    'Avg. Order Volume per Resident': tier_avg_order_volume_per_resident,
    # 'No. of different fulfilment types': total_no_own_delivery,
    'No. of unique Vendors': count_vendors_per_tier,
    'Avg. Daily Orders per Vendor': avg_daily_orders_per_vendor_per_tier,
    'Avg. food rating': avg_food_rating_per_tier,
    'Avg. delivery rating': avg_delivery_rating_per_tier
})

City Tier
Tier 1    115021.0
Tier 2     44180.0
Tier 3     73204.0
Tier 4     29603.0
Name: Order ID, dtype: float64


In [31]:
# Overall metrics
#
# Headline figures over all merged orders, exported to general_metrics.csv,
# plus the export of the per-tier table to tier_metrics.csv.

# Total No. of Orders
total_orders = df_orders_per_vendor['Order ID'].count()

# Avg. No. of Orders per Vendor
avg_number_of_orders_per_vendor = df_orders_per_vendor.groupby('Vendor ID')['Order ID'].count().mean()

# No. of unique Vendors
count_vendors = df_orders_per_vendor['Vendor ID'].nunique()

# Total GMV: one figure over all orders, computed from the same frame as the
# other overall metrics (not a per-tier breakdown).
total_gmv = df_orders_per_vendor['Gmv'].sum()

df_metrics_general = pd.Series({
    'Total No. of Orders': total_orders,
    'Avg. No. of Orders per Vendor': avg_number_of_orders_per_vendor,
    'No. of unique Vendors': count_vendors,
    'Total GMV': total_gmv,
})

# Expose the tier as a regular column for the export. Guarded so that running
# this cell a second time does not add a stray 'index' column.
if 'City Tier' not in df_tier_metrics.columns:
    df_tier_metrics = df_tier_metrics.rename_axis('City Tier').reset_index()

# Export to CSV.
# The index of df_metrics_general holds the metric names, so it must be
# written; without it the file is an unlabeled column of numbers.
df_metrics_general.to_csv('general_metrics.csv', index_label='Metric', header=['Value'])
df_tier_metrics.to_csv('tier_metrics.csv', index=False)