# Apriori algorithm

#### We used the FIM package. It can be installed with Anaconda from https://anaconda.org/conda-forge/pyfim or from PyPI at https://pypi.org/project/fim/

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fim import apriori

#### Pattern mining on relational data

In [10]:
# Read the first 2000 rows of the transactions file; each row pairs a basket ID
# (SCONTRINO_ID) with one product ID (COD_MKT_ID).
dataset = pd.read_csv(
    "../datasets/small_transactions.csv",
    sep=',',
    skipinitialspace=True,
    nrows=2000,
)
dataset.head()

Unnamed: 0,SCONTRINO_ID,COD_MKT_ID
0,2558064013053,1580
1,2558064013053,1661
2,2558064013053,2068
3,2558064013053,2556
4,2558064013053,2650


##### This is a relational representation: each row of the dataframe has a basket ID and one product bought. For our analysis, we need the transactions: the list of products bought in each basket.

In [11]:
# Collect, for every basket ID, the list of product IDs bought in that basket.
products_per_basket = dataset.groupby('SCONTRINO_ID')['COD_MKT_ID']
transactions = products_per_basket.apply(list)
transactions.head()

SCONTRINO_ID
2558064013053                 [1580, 1661, 2068, 2556, 2650, 4225]
2558064013054    [437, 1278, 1614, 2089, 2243, 2245, 2443, 2551...
2558064013055                         [151, 595, 2650, 4600, 4872]
2558064013056             [142, 437, 2499, 2515, 3458, 3675, 4044]
2558064013057                                          [437, 3087]
Name: COD_MKT_ID, dtype: object

In [12]:
# Keep only the product lists (an object array holding one list per basket);
# the basket IDs stored in the index are not needed for mining.
baskets = transactions.to_numpy()

In [13]:
# Preview the first five baskets.
baskets[:5]

array([list([1580, 1661, 2068, 2556, 2650, 4225]),
       list([437, 1278, 1614, 2089, 2243, 2245, 2443, 2551, 3448, 6172]),
       list([151, 595, 2650, 4600, 4872]),
       list([142, 437, 2499, 2515, 3458, 3675, 4044]), list([437, 3087])],
      dtype=object)

#### We can now run the apriori algorithm, as implemented in fim. To see the available options:

In [15]:
# Print the built-in documentation of fim.apriori to list its parameters.
help(apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i

#### First, we want to extract the itemsets: we do this by setting target='a'

In [16]:
# Mine the frequent itemsets (target='a') from the product baskets.
# NOTE(review): per the pyfim docs, a positive `supp` is a percentage of the
# transactions (here 1%) and zmin/zmax bound the itemset size (2 to 5 items)
# — confirm.
itemsets = apriori(
    baskets,
    target='a',
    supp=1,
    zmin=2,
    zmax=5,
)

#### We can now see the itemsets obtained and their support

In [21]:
# Report how many itemsets were mined.
n_itemsets = len(itemsets)
print('Number of itemsets:', n_itemsets)

Number of itemsets: 392


In [23]:
# Sample of the mined itemsets; each entry is (tuple of items, support).
itemsets[20:35]

[((4461, 920), 3),
 ((142, 393), 3),
 ((1258, 2650), 3),
 ((2518, 2499), 3),
 ((2518, 920), 3),
 ((599, 597), 3),
 ((2550, 2650), 3),
 ((4136, 6172), 3),
 ((3074, 437), 3),
 ((1658, 445), 4),
 ((1658, 445, 2650), 3),
 ((1658, 2650), 3),
 ((1429, 1428), 4),
 ((207, 2650), 3),
 ((563, 2194), 3)]

#### With apriori we can also extract the rules, by setting target='r'. Remember that in this case we need to set a confidence threshold as well as a support threshold

In [26]:
# Mine association rules (target='r') from the product baskets.
# NOTE(review): per the pyfim docs, supp and conf are percentages and
# report='ascl' appends absolute support, relative support, confidence and
# lift to every rule — confirm.
rules = apriori(
    baskets,
    target='r',
    supp=1,
    conf=60,
    zmin=2,
    report='ascl',
)

In [27]:
# Report how many rules were mined.
n_rules = len(rules)
print('Number of rule:', n_rules)

Number of rule: 1096


In [32]:
# Visualization of one rule. Index 1 of a rule holds the tuple of premise
# items; the trailing numbers follow report='ascl'.
# NOTE(review): presumably (consequent, premise, absolute support, relative
# support, confidence, lift) — confirm against the pyfim docs.
rules[100]

(2548,
 (2193, 2499, 920),
 2,
 0.00881057268722467,
 0.6666666666666666,
 75.66666666666667)

In [44]:
# Print the first 10 rules whose field 5 exceeds 2 and whose field 4 exceeds 0.7.
# NOTE(review): with report='ascl' these are presumably lift and confidence
# — confirm against the pyfim docs.
count = 0
for rule in rules:
    lift, confidence = rule[5], rule[4]
    if not (lift > 2 and confidence > 0.7):
        continue
    print(rule)
    count += 1
    if count == 10:
        break

(2443, (3086,), 3, 0.013215859030837005, 1.0, 9.869565217391305)
(2050, (441,), 3, 0.013215859030837005, 0.75, 13.096153846153847)
(2650, (441,), 3, 0.013215859030837005, 0.75, 2.541044776119403)
(2243, (1278,), 4, 0.01762114537444934, 1.0, 14.1875)
(437, (4805,), 3, 0.013215859030837005, 1.0, 9.458333333333334)
(2650, (396,), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(2650, (4163,), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(2729, (385,), 3, 0.013215859030837005, 0.75, 8.5125)
(2650, (4182, 445), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(445, (4182, 2650), 3, 0.013215859030837005, 1.0, 5.27906976744186)


#### Given a rule, we can print the baskets that contain the premises of the rule

In [48]:
# Premise items of the selected rule, built once instead of once per basket.
premise = set(rules[9][1])
for b in baskets:
    # Use issubset (<=) rather than the strict-subset operator (<): a basket
    # made up of exactly the premise items still contains the premise and
    # must be printed as well.
    if premise.issubset(b):
        print(b)

[505, 566, 588, 597, 607, 629, 632, 2058, 2727, 2729, 2731, 4003, 4126, 4228]
[588, 597, 598, 615, 620, 622, 623, 624, 1508, 4801]
[588, 2014, 4180]


# Pattern mining on tabular data

In [50]:
df = pd.read_csv("../datasets/titanic.csv", skipinitialspace=True, sep=',')

In [51]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Some pre-processing steps

In [52]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [53]:
# Fill the missing embarkation ports with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Fill missing ages with the median age of the passengers sharing the same
# Sex and Pclass. transform() returns a Series aligned with df's own index,
# whereas groupby(...).apply(...) prepends the group keys to the index on
# pandas >= 2.0, which makes the assignment back to df['Age'] fail.
group_median_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)

# Derived feature: SibSp + Parch + 1.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Check the remaining missing values per column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64

In [54]:
# Columns removed before mining (SibSp and Parch are already combined into
# FamilySize).
column2drop = ['PassengerId', 'Name', 'Cabin', 'SibSp', 'Parch', 'Ticket']
df.drop(columns=column2drop, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,2
1,1,1,female,38.0,71.2833,C,2
2,1,3,female,26.0,7.925,S,1
3,1,1,female,35.0,53.1,S,2
4,0,3,male,35.0,8.05,S,1


In [55]:
# Discretise Age and Fare into 10 equal-width, left-closed intervals each, then
# drop the original numeric columns.
# NOTE(review): astype(int) truncates the values before binning — confirm this
# is intended.
for source_col, bin_col in (('Age', 'AgeBin'), ('Fare', 'FareBin')):
    df[bin_col] = pd.cut(df[source_col].astype(int), bins=10, right=False)

df.drop(columns=['Age', 'Fare'], inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,0,3,male,S,2,"[16.0, 24.0)","[0.0, 51.2)"
1,1,1,female,C,2,"[32.0, 40.0)","[51.2, 102.4)"
2,1,3,female,S,1,"[24.0, 32.0)","[0.0, 51.2)"
3,1,1,female,S,2,"[32.0, 40.0)","[51.2, 102.4)"
4,0,3,male,S,1,"[32.0, 40.0)","[0.0, 51.2)"


In [56]:
# Turn every value into a self-describing string item, so that items coming
# from different columns stay distinguishable inside a basket.
survival_labels = {0: 'Not Survived', 1: 'Survived'}
class_labels = {1: '1st', 2: '2nd', 3: '3rd'}

df['Survived'] = df['Survived'].map(survival_labels).astype(str)
df['Pclass'] = df['Pclass'].map(class_labels).astype(str)

# Append a column-specific suffix to the remaining columns.
for col, suffix in (('FamilySize', '_Family'),
                    ('AgeBin', '_Age'),
                    ('FareBin', '_Fare')):
    df[col] = df[col].astype(str) + suffix

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,Not Survived,3rd,male,S,2_Family,"[16.0, 24.0)_Age","[0.0, 51.2)_Fare"
1,Survived,1st,female,C,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
2,Survived,3rd,female,S,1_Family,"[24.0, 32.0)_Age","[0.0, 51.2)_Fare"
3,Survived,1st,female,S,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
4,Not Survived,3rd,male,S,1_Family,"[32.0, 40.0)_Age","[0.0, 51.2)_Fare"


In [57]:
# One transaction per row of df: the list of that row's values.
# Note: this rebinds `baskets`, which previously held the product baskets.
baskets = df.to_numpy().tolist()

In [58]:
# Inspect the first transaction built from the table (one item per column).
baskets[0]

['Not Survived',
 '3rd',
 'male',
 'S',
 '2_Family',
 '[16.0, 24.0)_Age',
 '[0.0, 51.2)_Fare']

#### We apply apriori on the baskets

In [59]:
# Mine the frequent itemsets (target='a') from the tabular transactions.
# NOTE(review): per the pyfim docs, supp=1 is a 1% minimum support and zmin=2
# keeps only itemsets with at least two items — confirm.
itemsets = apriori(
    baskets,
    target='a',
    supp=1,
    zmin=2,
)

In [60]:
# Mine association rules (target='r') from the tabular transactions.
# NOTE(review): per the pyfim docs, supp and conf are percentages and
# report='ascl' appends absolute support, relative support, confidence and
# lift to every rule — confirm.
rule_params = dict(supp=10, zmin=2, target='r', conf=60, report='ascl')
rules = apriori(baskets, **rule_params)

In [61]:
# Print only the rules whose first field (the rule's target item) is 'Survived'.
for rule in (r for r in rules if r[0] == 'Survived'):
    print(rule)

('Survived', ('[51.2, 102.4)_Fare',), 68, 0.07631874298540965, 0.6601941747572816, 1.719979560551865)
('Survived', ('[16.0, 24.0)_Age', 'female'), 68, 0.07631874298540965, 0.68, 1.7715789473684211)
('Survived', ('1st', 'female'), 91, 0.10213243546576879, 0.9680851063829787, 2.5221164613661813)
('Survived', ('1st',), 136, 0.1526374859708193, 0.6296296296296297, 1.6403508771929824)
('Survived', ('female', '1_Family', '[0.0, 51.2)_Fare'), 73, 0.0819304152637486, 0.73, 1.901842105263158)
('Survived', ('female', '1_Family'), 99, 0.1111111111111111, 0.7857142857142857, 2.0469924812030076)
('Survived', ('female', 'S', '[0.0, 51.2)_Fare'), 100, 0.1122334455667789, 0.6329113924050633, 1.6489007328447702)
('Survived', ('female', 'S'), 142, 0.15937149270482603, 0.6926829268292682, 1.8046213093709884)
('Survived', ('female', '[0.0, 51.2)_Fare'), 153, 0.1717171717171717, 0.6681222707423581, 1.7406343369340382)
('Survived', ('female',), 233, 0.2615039281705948, 0.7420382165605095, 1.9332048273550118

In [63]:
# Print only the rules whose first field (the rule's target item) is
# 'Not Survived'.
for rule in filter(lambda r: r[0] == 'Not Survived', rules):
    print(rule)

('Not Survived', ('[40.0, 48.0)_Age',), 66, 0.07407407407407407, 0.6947368421052632, 1.127523727351165)
('Not Survived', ('[32.0, 40.0)_Age', '[0.0, 51.2)_Fare'), 64, 0.0718294051627385, 0.6881720430107527, 1.116869381279746)
('Not Survived', ('C', 'male'), 66, 0.07407407407407407, 0.6947368421052632, 1.127523727351165)
('Not Survived', ('2nd', '1_Family', 'S', '[0.0, 51.2)_Fare'), 62, 0.06958473625140292, 0.6666666666666666, 1.0819672131147542)
('Not Survived', ('2nd', '1_Family', 'S'), 64, 0.0718294051627385, 0.6736842105263158, 1.0933563416738568)
('Not Survived', ('2nd', '1_Family', '[0.0, 51.2)_Fare'), 66, 0.07407407407407407, 0.6470588235294118, 1.0501446480231438)
('Not Survived', ('2nd', '1_Family'), 68, 0.07631874298540965, 0.6538461538461539, 1.0611601513240858)
('Not Survived', ('2nd', 'male', 'S', '[0.0, 51.2)_Fare'), 77, 0.08641975308641975, 0.8369565217391305, 1.3583392729864576)
('Not Survived', ('2nd', 'male', 'S'), 82, 0.0920314253647587, 0.845360824742268, 1.371979043