# Data Preprocessing Tools

In [1]:
# Mount Google Drive at /content/drive so the dataset read later in this
# notebook is reachable. The google.colab module is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing the libraries

In [2]:
!pip install apyori



Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5974 sha256=fb73f10ec99c8a5cc7a51c9a706d6dc389717a8800b3eb8af600750da2b7f6c4
  Stored in directory: /root/.cache/pip/wheels/cb/f6/e1/57973c631d27efd1a2f375bd6a83b2a616c4021f24aab84080
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Preprocessing


In [4]:
# The CSV has no header row: every line is one basket. Baskets with fewer items
# than the widest one are padded with NaN by pandas.
dataset = pd.read_csv('/content/drive/MyDrive/Dataset/apriori/Market_Basket_Optimisation.csv', header = None)

# Build one list of item names per basket.
# - Iterating over the frame's own rows (instead of hard-coded 7501 x 20 bounds)
#   keeps the cell correct for a file of any size.
# - Missing cells are skipped so the literal string 'nan' is not mined as if it
#   were a product.
transactions = [
    [str(item) for item in basket if pd.notna(item)]
    for basket in dataset.values
]


## Training the Eclat model on the dataset (using apyori's `apriori`)

In [5]:
from apyori import apriori

# Thresholds passed to apriori():
#
# min_support    - smallest share of baskets in which the items of a rule must
#                  appear together.
#                  Worked example: the data set covers one week and we only
#                  care about products bought at least 3 times a day, i.e.
#                  3 * 7 = 21 times a week. 21 / 7501 baskets = 0.0028,
#                  rounded to 0.003.
# min_confidence - there is no formula for this one; the rule of thumb is to
#                  try values and look at the rules that come out (0.2 here).
# min_lift       - picking a good value comes with project experience
#                  (3 here).
# min_length,    - both set to 2 so every rule has the form M1 -> M2: one
# max_length       product on the left-hand side and one on the right-hand
#                  side ("buy product A, get product B").
#
# NOTE(review): apyori documents only min_support, min_confidence, min_lift
# and max_length as keyword arguments; min_length may be silently ignored -
# confirm against the installed version.
rules = apriori(
    transactions=transactions,
    min_support=0.003,
    min_confidence=0.2,
    min_lift=3,
    min_length=2,
    max_length=2,
)

## Visualizing the result

## Displaying the first results coming directly from the output of the apriori function

In [6]:
# Collect the records produced by apriori() into a list so they can be
# displayed here and reused by the cells below.
results = [record for record in rules]

# Layout of one element, using the first record as an example:
#
#   RelationRecord(
#     [0] items   = frozenset({'light cream', 'chicken'})
#                   the two products involved; a frozenset is unordered, so
#                   the direction of the rule is NOT encoded here
#     [1] support = 0.004532728969470737
#     [2] ordered_statistics = [OrderedStatistic(
#             items_base = frozenset({'light cream'}),   # left-hand side
#             items_add  = frozenset({'chicken'}),       # right-hand side
#             confidence = 0.29059829059829057,
#             lift       = 4.84395061728395)])
#
# Read as: light cream -> chicken, with a confidence of about 29%, a support
# of about 0.0045 and a lift of about 4.84.

results

[RelationRecord(items=frozenset({'chicken', 'light cream'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)]),
 RelationRecord(items=frozenset({'mushroom cream sauce', 'escalope'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mushroom cream sauce'}), items_add=frozenset({'escalope'}), confidence=0.3006993006993007, lift=3.790832696715049)]),
 RelationRecord(items=frozenset({'pasta', 'escalope'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pasta'}), items_add=frozenset({'escalope'}), confidence=0.3728813559322034, lift=4.700811850163794)]),
 RelationRecord(items=frozenset({'fromage blanc', 'honey'}), support=0.003332888948140248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'fromage blanc'}), items_add=frozenset({'honey'}), confidence=0

## Putting the results well organized into a Pandas DataFrame

In [19]:
# Presentation matters: a tidy table comes in handy
# when explaining the rules to someone else.


def inspect(results):
    """Flatten apriori records into ``(left item, right item, support)`` tuples.

    Parameters
    ----------
    results : iterable
        Records indexable as ``record[1]`` (support) and ``record[2]`` (list of
        ordered statistics), where each statistic is indexable as
        ``stat[0]`` (base item set) and ``stat[1]`` (added item set).
        Any iterable is accepted, including a one-shot iterator or generator,
        because the input is walked exactly once.

    Returns
    -------
    list of tuple
        One ``(base_item, added_item, support)`` tuple per record. Only the
        first ordered statistic of each record is read, and only the first
        element of its base set and of its added set is kept.

    Raises
    ------
    IndexError
        If a record has no ordered statistics, or if the base or added item
        set of its first statistic is empty.
    """
    rows = []
    # Single pass: the previous version looped over `results` three times, so
    # a generator was exhausted after the first loop and the zip came back empty.
    for record in results:
        first_stat = record[2][0]
        base_item = tuple(first_stat[0])[0]
        added_item = tuple(first_stat[1])[0]
        rows.append((base_item, added_item, record[1]))
    return rows
# Tabulate the tuples returned by inspect(): one row per rule.
resultsinDataFrame = pd.DataFrame(
    data=inspect(results),
    columns=['Product 1', 'Product 2', 'Support'],
)


## Displaying the results unsorted

In [20]:
# Show the table as built, with no sorting applied.
resultsinDataFrame

Unnamed: 0,Product 1,Product 2,Support
0,light cream,chicken,0.004533
1,mushroom cream sauce,escalope,0.005733
2,pasta,escalope,0.005866
3,fromage blanc,honey,0.003333
4,herb & pepper,ground beef,0.015998
5,tomato sauce,ground beef,0.005333
6,light cream,olive oil,0.0032
7,whole wheat pasta,olive oil,0.007999
8,pasta,shrimp,0.005066


## Displaying the results sorted by descending support


In [21]:
# nlargest(n, columns):
#   n       - maximum number of rows to return
#   columns - column to rank by; rows with the largest values come first
resultsinDataFrame.nlargest(n=10, columns='Support')

Unnamed: 0,Product 1,Product 2,Support
4,herb & pepper,ground beef,0.015998
7,whole wheat pasta,olive oil,0.007999
2,pasta,escalope,0.005866
1,mushroom cream sauce,escalope,0.005733
5,tomato sauce,ground beef,0.005333
8,pasta,shrimp,0.005066
0,light cream,chicken,0.004533
3,fromage blanc,honey,0.003333
6,light cream,olive oil,0.0032
