# NDSC Market Basket Challenge 2020
By: M Sammy Ivan Kurniawan

## 1. Initialization
This part will consist of initialization, importing libraries and datasets, and creating grouped as well as dummy datasets for the iterative processes that will be done to surpass the Challenge.

In [1]:
# Import needed libraries
import pandas as pd
import numpy as np
import re
import math
from time import sleep
from tqdm.notebook import tqdm

In [2]:
# Import the datasets
orderdf = pd.read_csv(r'D:\Sammy\Competitions\NDSC\Market Basket\association_order.csv')
rulesdf = pd.read_csv(r'D:\Sammy\Competitions\NDSC\Market Basket\rules.csv')

In [3]:
# Get a glimpse of the dataset
orderdf.head()

Unnamed: 0,orderid,itemid
0,31379820545759,719740607
1,31378575577269,1825360194
2,31369591568249,1108903291
3,31369836201769,4507360843
4,31372360246729,1821888475


In [4]:
rulesdf.head()

Unnamed: 0,rule
0,100242812>80361758
1,100242812>89031406
2,1003153762>1016449477
3,1006024995>2727415265
4,1006024995>866012366


In [5]:
# Create a dummy only for the submission
submitdf = rulesdf.copy()

In [6]:
# Group the item id based on the orders
groupeditems = orderdf.groupby('orderid')['itemid'].apply(list)
groupeddf = pd.DataFrame(groupeditems)
groupeddf

Unnamed: 0_level_0,itemid
orderid,Unnamed: 1_level_1
31338000509845,"[2662778119, 2255381856, 1825056968, 575945551..."
31338001158725,"[6112866826, 5013004529, 5712867361]"
31338002167561,"[5712106216, 3310344299, 6303964568, 331073883..."
31338004895579,"[911192064, 760247001, 5410312558, 1142881028]"
31338005995334,"[3700032699, 6900080063, 5305001639]"
...,...
31384797642506,"[1857408808, 2551150454, 1462329331]"
31384797750713,"[1149557119, 1427652419, 2147901507]"
31384798826576,"[3408618419, 1985704481, 3300302478, 1303266138]"
31384799224120,"[4002867093, 6410154560, 3510249471]"


In [7]:
# Split the rules so it can be understood easier in the algorithm
rulesdf['splitted'] = rulesdf['rule'].apply(lambda x : re.split('>', x))
rulesdf[['left_split','right_split']] = pd.DataFrame(rulesdf.splitted.tolist(), index= rulesdf.index)
rulesdf = rulesdf.drop(columns = 'splitted')
rulesdf['left_split'] = rulesdf['left_split'].apply(lambda x : re.split('&', x))
rulesdf['right_split'] = rulesdf['right_split'].apply(lambda x : re.split('&', x))
rulesdf

Unnamed: 0,rule,left_split,right_split
0,100242812>80361758,[100242812],[80361758]
1,100242812>89031406,[100242812],[89031406]
2,1003153762>1016449477,[1003153762],[1016449477]
3,1006024995>2727415265,[1006024995],[2727415265]
4,1006024995>866012366,[1006024995],[866012366]
...,...,...,...
14233,995073047>3202007524,[995073047],[3202007524]
14234,995073047>651958908,[995073047],[651958908]
14235,995073047>7902698606,[995073047],[7902698606]
14236,995073047>922394800,[995073047],[922394800]


## 2. Function Construction
These are functions that works with the grouped dataframe to acquire the frequency of said number of orders in the grouped dataframe.

In [8]:
# Specify Functions to get the total number of orders for each product composition
def get_total_one_p(df,product_a):
    """ A function to acquire the frequency of a certain product in a product composition"""
    return len(df[df['itemid'].apply(lambda x : product_a in x)])
    
def get_total_two_p(df,product_a,product_b):
    """ A function to acquire the frequency of a 2 products in a product composition"""
    return len(df[df['itemid'].apply(lambda x : product_a in x and product_b in x)])
    
def get_total_three_p(df,product_a,product_b,product_c):
    """ A function to acquire the frequency of a 3 products in a product composition"""
    return len(df[df['itemid'].apply(lambda x : product_a in x and product_b in x and product_c in x)])

def get_confidence(left_q, all_q):
    """A function to acquire the confidence using Association Rule Confidence formula"""
    return all_q/left_q

## 3. The Main Code
This main code will be the main process to iterate over the whole dataset, in order to find the frequency of each products related, and calculate the confidence for said product relationship.

In [None]:
# Main Code to iterate through everything
confidence = []
progress = 0
for i in tqdm(range(0, len(rulesdf)), desc ="Iteration Process"):
    
    A_LIST = rulesdf.loc[i]['left_split']
    B_LIST = rulesdf.loc[i]['right_split']
    
    # A&B > C
    if len(rulesdf.loc[i]['left_split']) > 1: 
        A = int(A_LIST[0])
        B = int(A_LIST[1])
        C = int(B_LIST[0])
        left_q = get_total_two_p(groupeddf, A, B)
        all_q = get_total_three_p(groupeddf, A, B, C)
    
    # A > B&C
    elif len(rulesdf.loc[i]['right_split']) > 1: 
        A = int(A_LIST[0])
        B = int(B_LIST[0])
        C = int(B_LIST[1])
        left_q = get_total_one_p(groupeddf, A)
        all_q = get_total_three_p(groupeddf, A, B, C)
    
    # A > B
    else: 
        A = int(A_LIST[0])
        B = int(B_LIST[0])
        left_q = get_total_one_p(groupeddf, A)
        all_q = get_total_two_p(groupeddf, A, B)
    
    confidence.append(get_confidence(left_q,all_q))
    sleep(0)
    
submitdf['confidence'] = confidence
submitdf['confidence'] = submitdf['confidence'].apply(lambda x : math.floor(x * 1000))
submitdf.to_csv(r'D:\Sammy\Competitions\NDSC\Market Basket\Iterative_V1.csv',index = False)
submitdf

HBox(children=(HTML(value='Iteration Process'), FloatProgress(value=0.0, max=14238.0), HTML(value='')))

It is found that it took around 35 minutes to complete this task.