# Project Market - Barein

Association analysis

Association analysis attempts to find common patterns of items in large data sets. 

Support is the relative frequency with which the items of a rule (X and Y together) appear in the transactions. 

<br>
<center> Support(X,Y) = transactions containing both X and Y / total number of transactions<br></center>

Confidence is a measure of the reliability of the rule. 

<br>
<center> Confidence(X,Y) = transactions containing both X and Y / transactions containing X<br></center>

Lift is the ratio of the observed support of X and Y together to the support that would be expected if X and Y were independent.

<br>
<center> Lift(X,Y) = (transactions containing both X and Y / transactions containing X) / fraction of transactions containing Y <br></center>


In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the market bills: one row per (invoice, product) line item.
df = pd.read_csv(
    'sample.csv',
    sep=',',
    decimal='.',
    header=0,
    encoding='utf-8',
)
# Give every line item a unit quantity, placed right after the first column,
# so the bills can later be pivoted into an invoice x product matrix.
df.insert(loc=1, column='count', value=1)
df.head()

Unnamed: 0,invoice,count,stockcode
0,1000,1,galletas
1,1000,1,azucar
2,1000,1,chocolate
3,1000,1,licor
4,1000,1,yogurt


In [3]:
# Pivot the market bills into a basket matrix: one row per invoice, one
# column per product, each cell holding the quantity bought.
# pivot_table with aggfunc='sum' is used instead of pivot so that an invoice
# listing the same product on several lines is added up into a single cell;
# plain pivot raises "Index contains duplicate entries" in that situation.
df_reset = df.pivot_table(index='invoice',
                          columns='stockcode',
                          values='count',
                          aggfunc='sum')
# Products that do not appear on an invoice come out as NaN: treat as 0 units.
df_reset = df_reset.fillna(0)
df_reset.head()

stockcode,aceite,arroz,aseo,azucar,cafe,carne,chocolate,cuidado personal,enlatados,frutas,...,pasta,pescado,pollo,postres,queso,sal,soda,vegetales,water,yogurt
invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1001,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1002,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1003,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1004,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [4]:
# We need just ones and zeros
def encode_units(x):
    """Collapse an item quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.
    """
    # A single comparison keeps the function total: NaN and fractional values
    # between 0 and 1 map to 0 rather than falling through and yielding None,
    # which would leave non-binary cells in the basket matrix.
    return 1 if x >= 1 else 0

# Run the encoder over every cell of the basket matrix so each entry becomes
# a plain 0/1 flag, then display the result.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; switch once the minimum supported pandas version allows it.
basket_sets = df_reset.applymap(func=encode_units)
basket_sets

stockcode,aceite,arroz,aseo,azucar,cafe,carne,chocolate,cuidado personal,enlatados,frutas,...,pasta,pescado,pollo,postres,queso,sal,soda,vegetales,water,yogurt
invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,1,0,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1001,0,1,0,1,0,1,0,0,1,1,...,0,0,1,0,0,0,1,0,0,1
1002,0,1,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,1,1
1003,0,1,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1004,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,0,1
9996,0,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,1,1
9997,0,1,0,1,1,1,1,0,0,0,...,0,1,1,0,0,0,1,0,1,1
9998,0,0,0,1,0,0,1,0,0,0,...,0,0,1,0,1,0,1,1,1,1


In [5]:
# With the baskets encoded, mine every itemset whose support is at least 7%.
# use_colnames=True makes the result list product names instead of column
# positions.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.07,
)
frequent_itemsets.head()



Unnamed: 0,support,itemsets
0,0.090556,(aceite)
1,0.497333,(arroz)
2,0.988333,(azucar)
3,0.216111,(carne)
4,0.638222,(chocolate)


In [6]:
# Derive association rules with lift >= 1 from the frequent itemsets and rank
# them from most to least confident.
rules = (
    association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    .sort_values(by=['confidence'], ascending=False)
)
rules.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
828,"(carne, helado)",(galletas),0.089222,0.996556,0.089222,1.0,1.003456,0.000307,inf
8838,"(yogurt, carne, azucar, helado)",(galletas),0.083333,0.996556,0.083333,1.0,1.003456,0.000287,inf
5312,"(yogurt, pollo, harina)",(galletas),0.070556,0.996556,0.070556,1.0,1.003456,0.000243,inf
4568,"(yogurt, carne, helado)",(galletas),0.084,0.996556,0.084,1.0,1.003456,0.000289,inf
1116,"(pollo, harina)",(galletas),0.075667,0.996556,0.075667,1.0,1.003456,0.000261,inf
2803,"(carne, azucar, helado)",(galletas),0.088556,0.996556,0.088556,1.0,1.003456,0.000305,inf
3598,"(pollo, azucar, harina)",(galletas),0.074444,0.996556,0.074444,1.0,1.003456,0.000256,inf
848,"(carne, queso)",(galletas),0.110556,0.996556,0.110444,0.998995,1.002448,0.00027,3.427222
2852,"(carne, azucar, queso)",(galletas),0.109222,0.996556,0.109111,0.998983,1.002436,0.000265,3.385889
214,"(carne, arroz)",(galletas),0.108889,0.996556,0.108778,0.99898,1.002432,0.000264,3.375556


In [7]:
# Look up one specific rule: galletas -> yogurt
galletas_is_antecedent = rules['antecedents'] == frozenset({'galletas'})
yogurt_is_consequent = rules['consequents'] == frozenset({'yogurt'})
rules.loc[galletas_is_antecedent & yogurt_is_consequent]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
92,(galletas),(yogurt),0.996556,0.941778,0.938556,0.9418,1.000023,2.2e-05,1.000374


In [8]:
# Same pair of products with the direction reversed: yogurt -> galletas
yogurt_is_antecedent = rules['antecedents'] == frozenset({'yogurt'})
galletas_is_consequent = rules['consequents'] == frozenset({'galletas'})
rules.loc[yogurt_is_antecedent & galletas_is_consequent]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
93,(yogurt),(galletas),0.941778,0.996556,0.938556,0.996579,1.000023,2.2e-05,1.006728


In [9]:
# Size of the rules table as (rows, columns): the first entry is the number
# of rules generated, the second the number of columns describing each rule.
rules.shape

(23148, 9)