In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interp
%matplotlib inline

# Seed value to pass as `random_state=` to stochastic calls so that results are reproducible
random_state = 42

In [102]:
# Column names, in file order, as listed in the dataset description file:
# ten measurement columns, four wilderness-area columns, forty soil-type
# columns and finally the target label.
cols = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *("Wilderness_Area_{}".format(area) for area in range(4)),
    *("Soil_Type {}".format(soil) for soil in range(40)),
    "Cover_Type",
]

In [123]:
# Load the raw table; the file carries no header row, so names come from `cols`.
data = pd.read_csv(
    'dataset/covtype.data',
    names=cols,
    header=None,
)

In [136]:
# Build a class-balanced subsample: at most `class_size` rows per cover type
# (classes with fewer rows are kept in full).
class_size = 10000

# Sample every class separately and concatenate the pieces. Iterating over the
# groups instead of using groupby(...).apply keeps the "Cover_Type" column
# intact regardless of how the installed pandas version treats grouping columns
# inside apply, and seeding each draw with `random_state` makes the subsample
# reproducible across runs.
sampled_parts = [
    group.sample(n=min(class_size, len(group)), random_state=random_state)
    for _, group in data.groupby("Cover_Type")
]
df = pd.concat(sampled_parts, ignore_index=True)

# Keep the label as the first column, followed by the features in file order.
df = df[["Cover_Type"] + [name for name in df.columns if name != "Cover_Type"]]

In [139]:
# Split the sampled frame: every column except the label goes to X,
# the label alone (kept as a one-column DataFrame) goes to y.
feature_mask = df.columns != 'Cover_Type'
X = df.loc[:, feature_mask]
y = df.loc[:, ["Cover_Type"]]

In [143]:
# Select the columns whose name mentions "Area" or "Soil".
name_matches = [("Area" in name) or ("Soil" in name) for name in df.columns]
binary_columns = df.columns[name_matches]

In [147]:
# Restrict the feature matrix to the columns listed in `binary_columns`.
binX = X[binary_columns]

In [161]:
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import LabelBinarizer

# Search for a minimum-support threshold that yields enough frequent itemsets
# (or enough class rules) by lowering the threshold geometrically.
frequent_itemsets = {}   # placeholder; replaced by a DataFrame on the first mining pass
rules = None             # stays None if no pass ever produces frequent itemsets
classrules = None
minpaterns = 50          # stop once at least this many frequent itemsets are found
min_class_rules = 100    # ...or once at least this many class rules are found
min_confidence = 0.7     # confidence threshold passed to association_rules
support_decay = 0.9      # factor applied to the threshold on every pass
minsup = 0.9

# A support below one row's share of the data cannot exclude any itemset that
# occurs at all, so lowering the threshold further is pointless. Using that
# share as a floor guarantees the loop ends even if no stop condition is met.
minsup_floor = 1.0 / max(len(binX), 1)

while minsup > minsup_floor:
    # The decay is applied before mining, so the first threshold tried is 0.9 * 0.9.
    minsup = minsup * support_decay
    frequent_itemsets = apriori(binX, min_support=minsup, use_colnames=True, verbose=0)
    if len(frequent_itemsets) < 1:
        continue

    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    # NOTE(review): binX is built from the "Area"/"Soil" columns only, so no item
    # named 'class' exists among the transactions and this filter selects nothing
    # (the recorded output reports 0 class rules on every pass). Mining class
    # rules would require adding the encoded label to the transactions first.
    classrules = rules[rules['consequents'] == {'class'}]

    print("Minsup {} found {} patterns out of which {} contain class consequent"
          .format(minsup, len(frequent_itemsets), len(classrules)))

    if len(classrules) >= min_class_rules:
        print("Found {} rules which have class as consequent.".format(len(classrules)))
        break

    if len(frequent_itemsets) >= minpaterns:
        print("Minimum support:", minsup)
        print("Number of found patterns:", len(frequent_itemsets))
        break
else:
    # Reached only when the threshold hit the floor without meeting a stop condition.
    print("Stopped at minimum support {} without reaching {} patterns or {} class rules."
          .format(minsup, minpaterns, min_class_rules))

Minsup 0.43046721000000016 found 1 patterns out of which 0 contain class consequent
Minsup 0.38742048900000015 found 1 patterns out of which 0 contain class consequent
Minsup 0.34867844010000015 found 1 patterns out of which 0 contain class consequent
Minsup 0.31381059609000017 found 1 patterns out of which 0 contain class consequent
Minsup 0.28242953648100017 found 1 patterns out of which 0 contain class consequent
Minsup 0.25418658283290013 found 2 patterns out of which 0 contain class consequent
Minsup 0.22876792454961012 found 3 patterns out of which 0 contain class consequent
Minsup 0.2058911320946491 found 3 patterns out of which 0 contain class consequent
Minsup 0.1853020188851842 found 3 patterns out of which 0 contain class consequent
Minsup 0.16677181699666577 found 3 patterns out of which 0 contain class consequent
Minsup 0.1500946352969992 found 3 patterns out of which 0 contain class consequent
Minsup 0.13508517176729928 found 4 patterns out of which 0 contain class conseq

In [164]:
# Show the mined rules ordered by support, then confidence (both descending).
rules.sort_values(
    by=["support", "confidence"],
    ascending=[False, False],
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(Soil_Type 28),(Wilderness_Area_0),0.1,0.26,0.1,1.0,3.8,0.07,4466.3
2,(Soil_Type 29),(Wilderness_Area_0),0.05,0.26,0.05,1.0,3.8,0.04,inf
3,(Soil_Type 3),(Wilderness_Area_2),0.05,0.47,0.05,0.86,1.83,0.02,3.68
8,(Soil_Type 31),(Wilderness_Area_2),0.05,0.47,0.05,0.95,2.03,0.02,9.93
9,(Soil_Type 32),(Wilderness_Area_2),0.05,0.47,0.04,0.96,2.05,0.02,12.41
5,(Soil_Type 12),(Wilderness_Area_2),0.04,0.47,0.04,1.0,2.13,0.02,150.46
12,(Soil_Type 5),(Wilderness_Area_3),0.04,0.23,0.04,1.0,4.31,0.03,inf
4,(Soil_Type 10),(Wilderness_Area_2),0.03,0.47,0.02,0.85,1.81,0.01,3.45
11,(Soil_Type 2),(Wilderness_Area_3),0.03,0.23,0.02,0.77,3.33,0.02,3.4
7,(Soil_Type 30),(Wilderness_Area_2),0.02,0.47,0.02,0.98,2.11,0.01,33.56
