In [1]:
import pandas as pd
import numpy as np
import random
import os
import importlib

In [2]:
# Specify the path to the module
module_path = os.path.abspath("../../utils/dataset.py")

# Load the module
spec = importlib.util.spec_from_file_location("dataset", module_path)
dataset = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dataset)

In [3]:
# load dataset for products

product_df = pd.read_csv('../../data/synthetic/food_items.csv')
product_df.head()

Unnamed: 0,Updated At,Item Name,Category,Unit,Price (₦),Vendor Name,Vendor Id,Vendor Location,Vendor Rating,Vendor Delivery,Stock,Discount,Promotion,In Stock,Id
0,2022-05-24 22:23:49.757753504,Crackers,Beverages,1kg,74904,Vendor 1,1,"Lekki, Lagos",2,True,323,40,4.324381,False,0
1,2022-06-22 21:41:11.856453364,Sunflower Oil,Dairy,50kg bag,67944,Vendor 1,1,"Lekki, Lagos",2,True,323,40,11.734182,False,1
2,2022-06-07 21:17:50.198369628,Milk Powder,Cooking Oils,400g can,96708,Vendor 1,1,"Lekki, Lagos",2,True,323,40,4.475715,False,2
3,2022-03-24 11:39:47.889984528,Fruit Bar,Staples,100 bags,5520,Vendor 1,1,"Lekki, Lagos",2,True,323,40,5.292178,True,3
4,2022-01-05 21:32:01.169433988,Avocado Oil,Cooking Oils,100 bags,3900,Vendor 1,1,"Lekki, Lagos",2,True,323,40,10.995408,True,4


In [4]:
product_category_groups = product_df.groupby('Category')

In [5]:
product_df['Discount Rank'] = product_df['Discount'].rank(pct=True)

In [6]:
product_df['Discount Rank'] = product_df['Discount'].rank(pct=True)

In [7]:
# Generate users data
users_data = []

locations = [
    "Ikeja, Lagos", "Surulere, Lagos", "Yaba, Lagos", "Lekki, Lagos", "Ajah, Lagos", 
    "Abuja", "Port Harcourt", "Benin City", "Kano", "Ogun State", 
    "Ibadan", "Kaduna", "Asaba", "Onitsha", "Owerri", 
    "Enugu", "Makurdi", "Abeokuta", "Calabar", "Jos"
]

product_list = product_df["Id"].tolist()
vendor_list = product_df["Vendor Id"].unique().tolist()

cheap_products = product_df['Discount'].quantile()

for i in range(1, 101):
    vendor = {
        "User Name": f"User {i}",
        "Id": i,
        "Age": random.randint(18, 65),
        "User Location": random.choice(locations),
        "Saved Products": random.sample(product_list, random.randint(0, 1000)),
        "Saved Vendors": random.sample(vendor_list, random.randint(0, len(vendor_list))),
        "Searched Products": random.sample(product_list, random.randint(0, 1000)),
        "Searched Vendors": random.sample(vendor_list, random.randint(0, len(vendor_list))),
        "Proximity (KM)": random.randint(10, 100),
    }

    users_data.append(vendor)

In [8]:
def get_product_quality(id):
    product = product_df[product_df['Id'] == id].reset_index().iloc[0].to_dict()
    category_discounts = product_category_groups.get_group(product.get('Category'))['Discount']
    below = category_discounts
    print(category_discounts.quantile(.2))
    
    return product

In [9]:
get_product_quality(1)

8.0


{'index': 1,
 'Updated At': '2022-06-22 21:41:11.856453364',
 'Item Name': 'Sunflower Oil',
 'Category': 'Dairy',
 'Unit': '50kg bag',
 'Price (₦)': 67944,
 'Vendor Name': 'Vendor 1',
 'Vendor Id': 1,
 'Vendor Location': 'Lekki, Lagos',
 'Vendor Rating': 2,
 'Vendor Delivery': True,
 'Stock': 323,
 'Discount': 40,
 'Promotion': 11.73418177347797,
 'In Stock': False,
 'Id': 1,
 'Discount Rank': 0.8014767859410902}

In [10]:
users_df = pd.DataFrame(users_data)
users_df.head()

Unnamed: 0,User Name,Id,Age,User Location,Saved Products,Saved Vendors,Searched Products,Searched Vendors,Proximity (KM)
0,User 1,1,34,Benin City,"[28981, 7151, 31154, 66007, 3595, 72988, 3092,...","[68, 97, 52, 91, 38, 83, 62, 66, 43, 76, 35, 7...","[55531, 80325, 86481, 82261, 43551, 86271, 351...","[85, 78, 50, 81, 37, 42, 57, 59, 65, 51, 66, 4]",63
1,User 2,2,32,Abeokuta,"[1420, 69858, 30627, 589, 55436, 69085, 78572,...","[4, 33, 72, 3, 45, 62, 7, 29, 27]","[16538, 35820, 3299, 55503, 44904, 31035, 6228...","[24, 90, 65]",71
2,User 3,3,20,Makurdi,"[36847, 49440, 6928, 51990, 59417, 67975, 7713...","[73, 87, 33, 61, 100, 54, 58, 21, 2, 30, 88, 8...","[44491, 70472, 46283, 39063, 44602, 19966, 556...","[64, 46, 10, 18, 4, 15, 5, 61, 48, 11, 86, 55,...",44
3,User 4,4,64,"Surulere, Lagos","[42788, 80612, 71204, 59605, 85062, 56999, 467...","[52, 43, 33, 63, 24, 50, 61, 99, 74, 100, 48, ...","[13377, 73798, 44572, 42898, 20318, 8045, 3903...","[30, 90, 83, 1, 76, 39, 15, 7, 59, 89, 34, 48,...",65
4,User 5,5,63,Owerri,"[77934, 83001, 52070, 1106, 56838, 64654, 2686...","[100, 32, 57, 30, 52, 65, 73, 19, 58, 47, 95, ...","[9905, 21295, 19334, 34531, 29517, 82219, 3596...","[51, 98, 45, 65, 26, 93, 55, 90, 99, 96, 88, 5...",25


Budget
Product Quality

In [11]:
product_df['Category Codes'] = product_df['Category'].astype('category').cat.codes

In [12]:
def get_category_codes(x):
    return product_df[product_df['Id'].isin(x)]['Category Codes'].unique().tolist()

In [13]:
users_df['Saved Categories'] = users_df['Saved Products'].map(get_category_codes)

In [14]:
users_df['Searched Categories'] = users_df['Searched Products'].map(get_category_codes)

In [15]:
users_df['Interesting Products'] = users_df['Searched Products'] + users_df['Saved Products']

In [16]:
users_df['Interesting Vendors'] = users_df['Searched Vendors'] + users_df['Saved Vendors']

In [17]:
users_df['Interesting Categories'] = users_df['Searched Categories'] + users_df['Saved Categories']

In [18]:
def get_product_quality(x):
    return 100 -product_df[product_df['Id'].isin(x)]['Discount'].mean()

In [19]:
users_df['Product Qaulity'] = users_df['Interesting Products'].map(get_product_quality)

In [20]:
users_df.head()

Unnamed: 0,User Name,Id,Age,User Location,Saved Products,Saved Vendors,Searched Products,Searched Vendors,Proximity (KM),Saved Categories,Searched Categories,Interesting Products,Interesting Vendors,Interesting Categories,Product Qaulity
0,User 1,1,34,Benin City,"[28981, 7151, 31154, 66007, 3595, 72988, 3092,...","[68, 97, 52, 91, 38, 83, 62, 66, 43, 76, 35, 7...","[55531, 80325, 86481, 82261, 43551, 86271, 351...","[85, 78, 50, 81, 37, 42, 57, 59, 65, 51, 66, 4]",63,"[4, 5, 1, 3, 8, 0, 2, 7, 6]","[2, 0, 1, 8, 4, 3, 7, 5, 6]","[55531, 80325, 86481, 82261, 43551, 86271, 351...","[85, 78, 50, 81, 37, 42, 57, 59, 65, 51, 66, 4...","[2, 0, 1, 8, 4, 3, 7, 5, 6, 4, 5, 1, 3, 8, 0, ...",75.446835
1,User 2,2,32,Abeokuta,"[1420, 69858, 30627, 589, 55436, 69085, 78572,...","[4, 33, 72, 3, 45, 62, 7, 29, 27]","[16538, 35820, 3299, 55503, 44904, 31035, 6228...","[24, 90, 65]",71,"[8, 0, 3, 2, 5, 4, 6, 7, 1]","[7, 4, 1, 0, 2, 8, 6, 3, 5]","[16538, 35820, 3299, 55503, 44904, 31035, 6228...","[24, 90, 65, 4, 33, 72, 3, 45, 62, 7, 29, 27]","[7, 4, 1, 0, 2, 8, 6, 3, 5, 8, 0, 3, 2, 5, 4, ...",75.129799
2,User 3,3,20,Makurdi,"[36847, 49440, 6928, 51990, 59417, 67975, 7713...","[73, 87, 33, 61, 100, 54, 58, 21, 2, 30, 88, 8...","[44491, 70472, 46283, 39063, 44602, 19966, 556...","[64, 46, 10, 18, 4, 15, 5, 61, 48, 11, 86, 55,...",44,"[0, 4, 2, 7, 8, 6, 3, 5, 1]","[5, 3, 1, 0, 6, 7, 2, 4, 8]","[44491, 70472, 46283, 39063, 44602, 19966, 556...","[64, 46, 10, 18, 4, 15, 5, 61, 48, 11, 86, 55,...","[5, 3, 1, 0, 6, 7, 2, 4, 8, 0, 4, 2, 7, 8, 6, ...",76.004202
3,User 4,4,64,"Surulere, Lagos","[42788, 80612, 71204, 59605, 85062, 56999, 467...","[52, 43, 33, 63, 24, 50, 61, 99, 74, 100, 48, ...","[13377, 73798, 44572, 42898, 20318, 8045, 3903...","[30, 90, 83, 1, 76, 39, 15, 7, 59, 89, 34, 48,...",65,"[6, 5, 2, 3, 7, 0, 4, 8, 1]","[8, 4, 2, 6, 5, 3, 7, 1, 0]","[13377, 73798, 44572, 42898, 20318, 8045, 3903...","[30, 90, 83, 1, 76, 39, 15, 7, 59, 89, 34, 48,...","[8, 4, 2, 6, 5, 3, 7, 1, 0, 6, 5, 2, 3, 7, 0, ...",75.592541
4,User 5,5,63,Owerri,"[77934, 83001, 52070, 1106, 56838, 64654, 2686...","[100, 32, 57, 30, 52, 65, 73, 19, 58, 47, 95, ...","[9905, 21295, 19334, 34531, 29517, 82219, 3596...","[51, 98, 45, 65, 26, 93, 55, 90, 99, 96, 88, 5...",25,"[4, 3, 5, 2, 8, 1, 0, 6, 7]","[6, 0, 5, 4, 8, 7, 3, 2, 1]","[9905, 21295, 19334, 34531, 29517, 82219, 3596...","[51, 98, 45, 65, 26, 93, 55, 90, 99, 96, 88, 5...","[6, 0, 5, 4, 8, 7, 3, 2, 1, 4, 3, 5, 2, 8, 1, ...",75.717021


In [21]:
users_df['Product Qaulity'].unique()

array([75.44683467, 75.1297989 , 76.00420168, 75.59254144, 75.71702128,
       76.57397959, 75.27899687, 75.39649781, 75.57186544, 75.703125  ,
       75.82862524, 76.8190184 , 74.4906954 , 76.05761317, 76.36466165,
       75.70741097, 76.62729358, 75.41860465, 75.32613391, 76.09938734,
       75.92828036, 75.73208072, 75.40331492, 75.8605042 , 75.43878463,
       75.88342697, 76.25293427, 75.74943736, 75.57052897, 75.70760642,
       75.83820225, 76.53397028, 76.22615132, 74.75757576, 75.47450572,
       74.78549849, 75.1185006 , 76.26326964, 75.89478114, 75.60111111,
       75.07294118, 76.79379157, 75.1967033 , 73.99404762, 75.76185567,
       75.45196211, 74.88131313, 75.41346154, 75.45103858, 75.83989501,
       75.80885122, 76.13312203, 75.89980354, 75.9884696 , 75.05456702,
       76.27475248, 75.28669528, 75.96024465, 74.98603652, 75.58598726,
       76.03636364, 75.58700882, 75.72194638, 76.17584605, 74.08389262,
       76.26570916, 75.55714286, 75.57821782, 75.26177024, 77.53

In [22]:
product_df['Discount'].unique()

array([40, 25, 15, 12, 13, 23,  9,  1, 36, 24, 35, 49, 19,  8, 11,  6,  4,
       31,  7, 21, 37, 32, 14, 48, 41, 34, 17, 16, 38, 29, 20,  3, 18, 33,
       45, 50, 42, 39,  0, 27, 10, 43, 44,  5, 47,  2])

In [27]:
import scipy.stats as stats

In [25]:
scipy.stats.zscore(product_df['Discount'])

0        1.076425
1        1.076425
2        1.076425
3        1.076425
4        1.076425
           ...   
86602    0.528728
86603    0.528728
86604    0.528728
86605    0.528728
86606    0.528728
Name: Discount, Length: 86607, dtype: float64

In [None]:
product_df.ana()

TypeError: Must provide 'func' or tuples of '(column, aggfunc).