In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interp
%matplotlib inline

# Set random seed to ensure reproducibility
random_state = 42

In [2]:
# Column names extracted from the dataset description file
# Names for the header-less data file, in file order: 10 terrain measurements,
# 4 wilderness-area columns, 40 soil-type columns and the class label last.
# Note that the soil columns are spelled with a space ("Soil_Type 0"), not an
# underscore.
cols = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *map("Wilderness_Area_{}".format, range(4)),
    *map("Soil_Type {}".format, range(40)),
    "Cover_Type",
]

In [3]:
# The file has no header row, so the column names are supplied explicitly.
data = pd.read_csv("dataset/covtype.data", names=cols, header=None)

# Data preparation
- **subsample the data to at most 10k samples per cover type**

In [4]:
# Per-class down-sampling: keep at most `class_size` rows of every cover type;
# classes with fewer rows are kept whole.
class_size = 10000
df = (
    data.groupby("Cover_Type")
    # Seed the draw with the notebook-wide seed so that the subsample (and
    # everything computed from it) is identical on every run.
    .apply(lambda x: x.sample(min(class_size, len(x)), random_state=random_state))
    .reset_index(1)
)
# After the grouped apply the label exists both as the index and as a column:
# drop the column copy, restore the label from the index, and discard the
# original row numbers that reset_index(1) exposed as "level_1".
df = df.drop(columns=["Cover_Type"]).reset_index().drop(columns=["level_1"])

In [20]:
# Split the subsample into the class label and the predictor columns.
y = df[["Cover_Type"]]  # double brackets: y stays a one-column DataFrame
X = df.drop(columns="Cover_Type")

### Get the binary columns

In [8]:
# A column is treated as binary when its name mentions "Area" or "Soil".
is_indicator = df.columns.str.contains("Area") | df.columns.str.contains("Soil")
binary_columns = df.columns[is_indicator]
binX = X[binary_columns]

### Get the numeric columns

In [26]:
# The ten continuous measurements that are discretised further below.
numeric_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
# The class label is appended as an extra column next to the numeric features.
numX = pd.concat([X[numeric_columns], y], axis=1)

In [36]:
# Discretise every column of numX into three equal-width bins and one-hot
# encode the bins as <column>_low / <column>_mid / <column>_high.
# NOTE(review): numX also contains Cover_Type, so the class label is binned
# into low/mid/high like a numeric feature -- confirm that this is intended.
bin_suffixes = ["_low", "_mid", "_high"]
dummy_frames = []

for col in numX:
    cutdf = pd.cut(numX[col], 3, labels=[0, 1, 2])
    dummydf = pd.get_dummies(cutdf)
    # Name the indicator columns after the loop variable instead of relying on
    # get_dummies propagating the series name into `columns.name`.
    dummydf.columns = [col + suffix for suffix in bin_suffixes]
    dummy_frames.append(dummydf)

# Concatenate once instead of growing the frame inside the loop.
dumX = pd.concat(dummy_frames, axis=1) if dummy_frames else pd.DataFrame()

In [48]:
# This cell reads allX, which was otherwise only assigned in a later cell, so a
# fresh top-to-bottom run failed here with a NameError. Build the combined
# matrix (binned numeric features + original binary columns) before reporting.
allX = pd.concat([dumX, binX], axis=1)

print("Shape of the dummyfied dataset:", dumX.shape)
print("Shape of the binary dataset: ", allX.shape)

Shape of the dummyfied dataset: (62240, 33)
Shape of the binary dataset:  (62240, 77)


In [37]:
# Full transaction matrix: binned numeric indicators next to the binary columns.
allX = pd.concat((dumX, binX), axis="columns")

# Unsupervised learning

## Association rules

In [45]:
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import LabelBinarizer


def _is_class_consequent(itemset):
    """Return True when a rule consequent is one single item derived from Cover_Type.

    The columns of allX that stem from the class label are named with a
    "Cover_Type" prefix (e.g. "Cover_Type_low"), so an exact comparison with
    {'Cover_Type'} never matches any rule.
    """
    return len(itemset) == 1 and all(str(item).startswith("Cover_Type") for item in itemset)


frequent_itemsets = {}
minpaterns = 1000       # stop once this many frequent itemsets have been found
min_confidence = 0.7    # confidence threshold for rule generation
min_classrules = 100    # stop once this many class rules have been found
minsup = 0.9
classrules = None
rules = None
# A support below one transaction cannot reveal additional itemsets, so it is
# the natural lower bound for the search (the former `minsup > 0` condition
# practically never became false).
minsup_floor = 1.0 / max(len(allX), 1)

# Lower the minimum support by 10% per round until enough patterns or enough
# class rules are found.
while minsup > minsup_floor:
    minsup = minsup * 0.9
    frequent_itemsets = apriori(allX, min_support=minsup, use_colnames=True, verbose=3)

    if len(frequent_itemsets) < 1:
        continue

    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    classrules = rules[rules['consequents'].apply(_is_class_consequent)]

    print("Minsup {} found {} patterns out of which {} contain class consequent"
          .format(minsup, len(frequent_itemsets), len(classrules)))

    if len(classrules) >= min_classrules:
        print("Found {} rules which have Cover_Type as consequent.".format(len(classrules)))
        break

    if len(frequent_itemsets) >= minpaterns:
        print("Minimum support:", minsup)
        print("Number of found patterns:", len(frequent_itemsets))
        break

Processing 6 combinations | Sampling itemset size 2
Minsup 0.81 found 3 patterns out of which 0 contain class consequent
Processing 15 combinations | Sampling itemset size 3
Minsup 0.7290000000000001 found 14 patterns out of which 0 contain class consequent
Processing 4 combinations | Sampling itemset size 43
Minsup 0.6561000000000001 found 23 patterns out of which 0 contain class consequent
Processing 48 combinations | Sampling itemset size 43
Minsup 0.5904900000000002 found 44 patterns out of which 0 contain class consequent
Processing 60 combinations | Sampling itemset size 54
Minsup 0.5314410000000002 found 67 patterns out of which 0 contain class consequent
Processing 105 combinations | Sampling itemset size 5
Minsup 0.47829690000000014 found 100 patterns out of which 0 contain class consequent
Processing 36 combinations | Sampling itemset size 65
Minsup 0.43046721000000016 found 146 patterns out of which 0 contain class consequent
Processing 7 combinations | Sampling itemset size

$$\text{Workflow of getting the rules into }\LaTeX$$
1. Generate the CSV as shown below.
2. Open https://www.tablesgenerator.com/latex_tables and paste the CSV via File -> Paste table data.
3. Copy the generated table into the LaTeX document.
4. Replace each "-" with `\\` to insert the line breaks.
5. Style the table and resize it as needed.

In [46]:
# Take the five highest-ranked rules (by confidence, then support, then
# leverage), shorten the metric names and print them as ';'-separated text.
report_cols = ["antecedents", "consequents", "confidence", "support", "leverage"]
top_rules = rules.sort_values(by=["confidence", "support", "leverage"], ascending=False)
top_rules = top_rules[report_cols].head(5).round(3)
select = top_rules.rename(columns={"confidence": "conf", "support": "supp", "leverage": "lever"})

# Strip the frozenset wrapper and quotes, then join the items of a set with "-".
csv_text = select.to_csv(index=False, sep=";")
for old, new in (("frozenset({", ""), ("})", ""), ("'", ""), (", ", "-")):
    csv_text = csv_text.replace(old, new)
print(csv_text)

antecedents;consequents;conf;supp;lever
Aspect_low-Hillshade_Noon_high;Hillshade_9am_high;1.0;0.379;0.041
Aspect_low-Slope_low;Hillshade_9am_high;1.0;0.368;0.04
Aspect_low-Slope_low-Hillshade_Noon_high;Hillshade_9am_high;1.0;0.358;0.039
Aspect_low-Hillshade_3pm_mid-Slope_low;Hillshade_9am_high;1.0;0.34;0.037
Aspect_low-Hillshade_3pm_mid-Hillshade_Noon_high;Hillshade_9am_high;1.0;0.333;0.036



## Clustering
- include the silhouette analysis
- a clustering quality score (metric still to be decided)
- contingency matrix