First, install the Python package "mlxtend" by typing **pip install mlxtend** in cmd or a terminal.

Visit **http://rasbt.github.io/mlxtend/#examples** for more information

# Data Preparation

In [1]:
import pandas as pd
import matplotlib as plt
%matplotlib inline
import csv

from mlxtend.preprocessing import OnehotTransactions

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the association dataset from the relative data/ directory and preview
# its first rows to check the column layout.
df = pd.read_csv("data/association.csv")
df.head()

Unnamed: 0,Family,Hobbies,Social_Club,Political,Professional,Religious,Support_Group
0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,1
2,1,1,0,0,1,0,0
3,0,0,0,0,0,0,0
4,0,0,0,1,1,0,1


The data is in tabular, one-hot encoded format (zeros and ones). This is the input format that `apriori` expects, so the table is passed to it as-is in the cells below.

In [3]:
# Summarise column names, non-null counts and dtypes to confirm every
# column is a fully populated integer indicator.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3483 entries, 0 to 3482
Data columns (total 7 columns):
Family           3483 non-null int64
Hobbies          3483 non-null int64
Social_Club      3483 non-null int64
Political        3483 non-null int64
Professional     3483 non-null int64
Religious        3483 non-null int64
Support_Group    3483 non-null int64
dtypes: int64(7)
memory usage: 190.5 KB


# Frequent Item Set Mining

In [4]:
# Mine itemsets whose support is at least 0.1. use_colnames is not set here,
# so the itemsets in the output are reported as column positions, not names.
apriori(df, min_support=0.1)

Unnamed: 0,support,itemsets
0,0.389894,[0]
1,0.300029,[1]
2,0.188056,[2]
3,0.324433,[4]
4,0.418605,[5]
5,0.158771,[6]
6,0.186908,"[0, 1]"
7,0.128912,"[0, 2]"
8,0.130347,"[0, 4]"
9,0.224519,"[0, 5]"


In [5]:
# Same mining run, but itemsets are labelled with column names and the
# result is ranked from the most to the least frequent itemset.
(
    apriori(df, min_support=0.1, use_colnames=True)
    .sort_values(by='support', ascending=False)
)

Unnamed: 0,support,itemsets
4,0.418605,[Religious]
0,0.389894,[Family]
3,0.324433,[Professional]
1,0.300029,[Hobbies]
11,0.238875,"[Hobbies, Religious]"
9,0.224519,"[Family, Religious]"
2,0.188056,[Social_Club]
6,0.186908,"[Family, Hobbies]"
5,0.158771,[Support_Group]
14,0.154752,"[Family, Hobbies, Religious]"


Findings from frequent itemset mining:

- Religious is the most popular type of organization people participate in
- Religious and Hobbies are highly associated (832 people, or about 24% of the people, participate in both)

In [6]:
# Mine the named frequent itemsets and attach a 'length' column holding the
# number of items in each itemset, so itemsets can be filtered by size.
frequent_itemsets = (
    apriori(df, min_support=0.1, use_colnames=True)
    .assign(length=lambda found: found['itemsets'].apply(len))
)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.389894,[Family],1
1,0.300029,[Hobbies],1
2,0.188056,[Social_Club],1
3,0.324433,[Professional],1
4,0.418605,[Religious],1
5,0.158771,[Support_Group],1
6,0.186908,"[Family, Hobbies]",2
7,0.128912,"[Family, Social_Club]",2
8,0.130347,"[Family, Professional]",2
9,0.224519,"[Family, Religious]",2


In [7]:
# Keep only the itemsets that contain exactly three items and whose support
# is at least 0.1 (relies on the 'length' column being present).
frequent_itemsets.loc[
    lambda found: found['length'].eq(3) & found['support'].ge(0.1)
]

Unnamed: 0,support,itemsets,length
14,0.154752,"[Family, Hobbies, Religious]",3
15,0.102785,"[Family, Social_Club, Religious]",3
16,0.109676,"[Hobbies, Social_Club, Religious]",3


# Association Rules Mining

Support = Number of Rows having both A AND B / Total Number of Rows
<br>
<br>
Confidence = Number of Rows having both A AND B / Number of Rows with A
<br>
<br>
Expected Confidence = Number of rows with B / Total Number of Rows
<br>
<br>
Lift = Confidence / Expected Confidence.
- A lift value greater than 1: A and B appear together more often than expected; this means that the occurrence of A has a positive effect on the occurrence of B, or that A is positively correlated with B.
- A lift value smaller than 1: A and B appear together less often than expected; this means that the occurrence of A has a negative effect on the occurrence of B, or that A is negatively correlated with B.
- A lift value near 1: A and B appear together almost as often as expected; this means that the occurrence of A has almost no effect on the occurrence of B, or that A and B have zero correlation.
- Lift is a value between 0 and infinity.


In [8]:
# Recompute the named frequent itemsets as input for rule generation.
# This rebinds frequent_itemsets, so the result has only the columns
# produced by apriori (no 'length' column).
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.389894,[Family]
1,0.300029,[Hobbies]
2,0.188056,[Social_Club]
3,0.324433,[Professional]
4,0.418605,[Religious]
5,0.158771,[Support_Group]
6,0.186908,"[Family, Hobbies]"
7,0.128912,"[Family, Social_Club]"
8,0.130347,"[Family, Professional]"
9,0.224519,"[Family, Religious]"


In [9]:
# Derive association rules from the frequent itemsets, using confidence as
# the selection metric with a threshold of 0.5.
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,"(Social_Club, Religious)",(Hobbies),0.147287,0.744639,2.481894
1,"(Social_Club, Hobbies)",(Religious),0.123457,0.888372,2.122222
2,(Social_Club),"(Religious, Hobbies)",0.188056,0.583206,2.441475
3,"(Religious, Family)",(Hobbies),0.224519,0.689258,2.297308
4,"(Religious, Hobbies)",(Family),0.238875,0.647837,1.661572
5,"(Family, Hobbies)",(Religious),0.186908,0.827957,1.977897
6,(Hobbies),"(Religious, Family)",0.300029,0.515789,2.297308
7,(Social_Club),(Religious),0.188056,0.783206,1.870992
8,(Social_Club),(Hobbies),0.188056,0.656489,2.188086
9,(Religious),(Hobbies),0.418605,0.570645,1.901967


In [10]:
# Same confidence-filtered rules, ranked from the most to the least confident.
(
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
    .sort_values(by='confidence', ascending=False)
)

Unnamed: 0,antecedants,consequents,support,confidence,lift
1,"(Social_Club, Hobbies)",(Religious),0.123457,0.888372,2.122222
5,"(Family, Hobbies)",(Religious),0.186908,0.827957,1.977897
13,"(Social_Club, Family)",(Religious),0.128912,0.797327,1.904727
10,(Hobbies),(Religious),0.300029,0.796172,1.901967
7,(Social_Club),(Religious),0.188056,0.783206,1.870992
0,"(Social_Club, Religious)",(Hobbies),0.147287,0.744639,2.481894
12,"(Social_Club, Religious)",(Family),0.147287,0.697856,1.789861
3,"(Religious, Family)",(Hobbies),0.224519,0.689258,2.297308
11,(Social_Club),(Family),0.188056,0.685496,1.758161
8,(Social_Club),(Hobbies),0.188056,0.656489,2.188086


In [11]:
# Derive association rules using lift as the selection metric with a
# threshold of 1.5, and keep them in `rules` for inspection.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)
rules

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,"(Social_Club, Religious)",(Hobbies),0.147287,0.744639,2.481894
1,"(Social_Club, Hobbies)",(Religious),0.123457,0.888372,2.122222
2,"(Religious, Hobbies)",(Social_Club),0.238875,0.459135,2.441475
3,(Social_Club),"(Religious, Hobbies)",0.188056,0.583206,2.441475
4,(Religious),"(Social_Club, Hobbies)",0.418605,0.262003,2.122222
5,(Hobbies),"(Social_Club, Religious)",0.300029,0.36555,2.481894
6,"(Religious, Family)",(Hobbies),0.224519,0.689258,2.297308
7,"(Religious, Hobbies)",(Family),0.238875,0.647837,1.661572
8,"(Family, Hobbies)",(Religious),0.186908,0.827957,1.977897
9,(Religious),"(Family, Hobbies)",0.418605,0.369684,1.977897


In [12]:
# Lift-filtered rules (same call as the previous cell), ranked from the
# highest to the lowest lift.
(
    association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)
    .sort_values(by='lift', ascending=False)
)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,"(Social_Club, Religious)",(Hobbies),0.147287,0.744639,2.481894
5,(Hobbies),"(Social_Club, Religious)",0.300029,0.36555,2.481894
2,"(Religious, Hobbies)",(Social_Club),0.238875,0.459135,2.441475
3,(Social_Club),"(Religious, Hobbies)",0.188056,0.583206,2.441475
23,(Social_Club),"(Religious, Family)",0.188056,0.546565,2.43438
22,"(Religious, Family)",(Social_Club),0.224519,0.457801,2.43438
11,(Hobbies),"(Religious, Family)",0.300029,0.515789,2.297308
6,"(Religious, Family)",(Hobbies),0.224519,0.689258,2.297308
15,(Hobbies),(Social_Club),0.300029,0.411483,2.188086
14,(Social_Club),(Hobbies),0.188056,0.656489,2.188086
