# Dataset 4: Finding Association Rules on School Supplies Dataset 

*Notebook by: Allen Aboy, Franchezka Cruz, Christopher Pinpin - CSMODEL S12*

### Import Libaries 

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### The Dataset

For this notebook, we will be working on `Dataset 4`, whose context is ***school supplies*** bought in a shop.Each row in the dataset contains a list of items bought by a customer for a single transaction. 

### Reading the Dataset

To be able to explore our dataset, our first step is to load it using pandas. We will also be converting items represented as `float` to `int`.

In [41]:
supplies_df = pd.read_csv('Dataset4.csv', header = None)
supplies_df = supplies_df.astype(pd.Int32Dtype()) # convert float to int
supplies_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,3,5,,,,,,
1,1,2,,,,,,
2,5,9,10,11,12,14,,
3,2,5,6,13,,,,
4,3,,,,,,,
...,...,...,...,...,...,...,...,...
295,1,12,,,,,,
296,9,11,12,15,17,,,
297,1,6,7,8,10,15,18,
298,16,,,,,,,


In [42]:
supplies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       300 non-null    Int32
 1   1       257 non-null    Int32
 2   2       213 non-null    Int32
 3   3       183 non-null    Int32
 4   4       152 non-null    Int32
 5   5       115 non-null    Int32
 6   6       71 non-null     Int32
 7   7       33 non-null     Int32
dtypes: Int32(8)
memory usage: 11.8 KB


State the name of the dataset. Describe the structure of the dataset file. How many observations are there in the dataset? How many variables are there in the dataset?

THIS NEEDS TO BE SHOWN IN THE FORMAT 

BASKET 0 [3 5]

BASKET 1 [1 2]

PERO NDI KO MAGAWA NG WALANG NA T___T

In [43]:
supplies_df.values.tolist()

[[3, 5, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [1, 2, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [5, 9, 10, 11, 12, 14, <NA>, <NA>],
 [2, 5, 6, 13, <NA>, <NA>, <NA>, <NA>],
 [3, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [0, 2, 8, 9, 11, 13, <NA>, <NA>],
 [2, 9, 11, 13, 19, <NA>, <NA>, <NA>],
 [4, 5, 6, 7, 8, <NA>, <NA>, <NA>],
 [0, 8, 15, 16, <NA>, <NA>, <NA>, <NA>],
 [0, 5, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [4, 17, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [8, 10, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [2, 5, 9, 12, 17, 18, <NA>, <NA>],
 [6, 8, 9, 16, 17, 18, 19, <NA>],
 [7, 12, 15, 17, <NA>, <NA>, <NA>, <NA>],
 [9, 12, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [0, 3, 8, 10, 13, 16, 19, <NA>],
 [1, 8, 15, <NA>, <NA>, <NA>, <NA>, <NA>],
 [7, 9, 11, 13, 19, <NA>, <NA>, <NA>],
 [4, 10, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>],
 [3, 9, 16, <NA>, <NA>, <NA>, <NA>, <NA>],
 [0, 1, 3, 8, 9, 11, 15, 18],
 [0, 7, 8, 9, 17, 18, 19, <NA>],
 [1, 14, 15, 17, 18, <NA>, <NA>, <NA>],
 [17, <NA>, <NA>, <NA>, <NA>, <

In [44]:
values = supplies_df.values.ravel()
values = [value for value in pd.unique(values) if not pd.isnull(value)]

value_dict = {}
for i, value in enumerate(values):
    value_dict[value] = i
    
temp_df = supplies_df.stack().map(value_dict).unstack()

carts = []
for i in range(temp_df.shape[0]):
    cart = np.sort([int(x) for x in temp_df.iloc[i].values.tolist() if str(x) != 'nan'])
    carts.append(cart)
    
supplies_df = pd.DataFrame([[0 for _ in range(20)] for _ in range(300)], columns=values)

for i, cart in enumerate(carts):
    supplies_df.iloc[i, cart] = 1

    
supplies_df = supplies_df.reindex(sorted(supplies_df.columns), axis=1)

In [45]:
supplies_df.columns = ['Bond Paper','Index Card','Intermediate Pad Paper',
                        'Construction Paper','Pencil','Sharpener',
                        'Eraser','Marker','Yellow Pad',
                        'Crayon','Whiteboard','Ruler',
                        'Graph Paper','Protractor','Paint Brush',
                        'Poster Paint','Acrylic Paint','Watercolor',
                        'Canvas','Notebook']
supplies_df

Unnamed: 0,Bond Paper,Index Card,Intermediate Pad Paper,Construction Paper,Pencil,Sharpener,Eraser,Marker,Yellow Pad,Crayon,Whiteboard,Ruler,Graph Paper,Protractor,Paint Brush,Poster Paint,Acrylic Paint,Watercolor,Canvas,Notebook
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,0,0
3,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
296,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,1,0,0
297,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0
298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


# DI KO ALAM IF NECESSARY PA TO PERO FEEL KO NDI NA KASI NUMBERS NA SYA MINAMAP LANG TO INTO DICTIONARY

In [51]:
baskets = []
for i in range(temp_df.shape[0]):
    basket = np.sort([int(x) for x in temp_df.iloc[i].values.tolist() if str(x) != 'nan'])
    baskets.append(basket)

In [49]:
for i, basket in enumerate(baskets):
    print('Basket', i, basket)

Basket 0 [ 2 11]
Basket 1 [0 3]
Basket 2 [ 1  2  9 12 14 15]
Basket 3 [0 2 4 5]
Basket 4 [11]
Basket 5 [ 0  5  6  7  9 14]
Basket 6 [ 0  5  9 10 14]
Basket 7 [ 2  4  7  8 16]
Basket 8 [ 6  7 17 18]
Basket 9 [2 6]
Basket 10 [ 8 19]
Basket 11 [1 7]
Basket 12 [ 0  2 13 14 15 19]
Basket 13 [ 4  7 10 13 14 18 19]
Basket 14 [15 16 17 19]
Basket 15 [14 15]
Basket 16 [ 1  5  6  7 10 11 18]
Basket 17 [ 3  7 17]
Basket 18 [ 5  9 10 14 16]
Basket 19 [1 8]
Basket 20 [11 14 18]
Basket 21 [ 3  6  7  9 11 13 14 17]
Basket 22 [ 6  7 10 13 14 16 19]
Basket 23 [ 3 12 13 17 19]
Basket 24 [19]
Basket 25 [ 0  1  3  6  7 10 16 19]
Basket 26 [ 1  2  3  6 13 14]
Basket 27 [18]
Basket 28 [ 7 13 15]
Basket 29 [ 0  4 10 12 14 15]
Basket 30 [ 0  1  3  4  6 13]
Basket 31 [ 2  4  7  8  9 10 11 16]
Basket 32 [14]
Basket 33 [ 0  5  8 11 15]
Basket 34 [17]
Basket 35 [16]
Basket 36 [ 0  6 12 14 16]
Basket 37 [ 0  4  5  8 15 19]
Basket 38 [ 0  2  4  5  8  9 18]
Basket 39 [ 3  6 18]
Basket 40 [ 3  4  7  8 10 12]
Basket 4

In [133]:
values = supplies_df.values.ravel()
values = [value for value in pd.unique(values) if not pd.isnull(value)]

value_dict = {}
for i, value in enumerate(values):
    value_dict[value] = i
    
print(value_dict)
print("Unique values: " + str(len(values)))

{3: 0, 5: 1, 1: 2, 2: 3, 9: 4, 10: 5, 11: 6, 12: 7, 14: 8, 6: 9, 13: 10, 0: 11, 8: 12, 19: 13, 4: 14, 7: 15, 15: 16, 16: 17, 17: 18, 18: 19}
Unique values: 20


In [29]:
# mali thsis 

baskets = []

for i in range(supplies_df.shape[0]): 
    basket = [int(x) for x in supplies_df.values.tolist() if str(x) != '<NA>']

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'