In [None]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt 
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, AgglomerativeClustering, DBSCAN, Birch
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.compose import ColumnTransformer
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import f1_score
import scipy.cluster.hierarchy as sch
from itertools import permutations

In [None]:
# Load the raw table from covtype_train.csv (path is relative to the notebook's working directory).
df = pd.read_csv("covtype_train.csv")

# Preprocessing

## Converting columns having categorical values into numeric form

In [None]:
# Ordered category labels for every text-valued column. The position of a
# label in its list is the numeric code it receives (first label -> 0.0,
# second -> 1.0, ...).
column_to_cat = {
    "Elevation": ["elevation_low", "elevation_medium", "elevation_high", "elevation_ultra"],
    "Aspect": ["aspect_low", "aspect_medium", "aspect_high", "aspect_ultra"],
    "Slope": ["slope_low", "slope_medium", "slope_high", "slope_ultra"],
    "Hillshade_9am": ["hillshade_9am_min", "hillshade_9am_max"],
    "Hillshade_Noon": ["hillnoon_min", "hillnoon_max"],
    "Horizontal_Distance_To_Fire_Points": ["low", "mid", "high"],
}

# Replace each listed column with its ordinal codes, using one encoder per
# column so that every column keeps its own label ordering.
for column_name, ordered_labels in column_to_cat.items():
    encoder = OrdinalEncoder(categories=[ordered_labels])
    df[column_name] = encoder.fit_transform(df[[column_name]])

df.head()

Unnamed: 0,Elevation,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness,target
0,1.0,1.0,0.0,1.0,1.0,0,1,0.0,22,0,2
1,2.0,1.0,0.0,1.0,1.0,1,1,1.0,32,2,1
2,1.0,0.0,0.0,1.0,1.0,1,1,0.0,10,2,2
3,2.0,3.0,1.0,1.0,1.0,2,1,0.0,23,2,1
4,2.0,2.0,0.0,1.0,1.0,2,1,1.0,28,0,2


## Reducing dimensionality of the dataset by combining two columns

In [None]:
# Collapse the horizontal and vertical hydrology distances into a single
# Euclidean distance column, then drop the two source columns.
hydrology_columns = ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology']
horizontal_squared = np.square(df[hydrology_columns[0]])
vertical_squared = np.square(df[hydrology_columns[1]])
df['Distance_To_Hydrology'] = np.sqrt(horizontal_squared + vertical_squared)
df = df.drop(columns=hydrology_columns)

## Basic Analysis

In [None]:
# Number of rows in df.
len(df)

406708

In [None]:
# Show every row that has at least one missing value (an empty result means
# there is nothing to impute or drop).
rows_with_missing = df.isnull().any(axis=1)
df[rows_with_missing]

Unnamed: 0,Elevation,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness,target,Distance_To_Hydrology


In [None]:
# Missing-value count for each column.
df.isnull().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Hillshade_9am                         0
Hillshade_Noon                        0
Horizontal_Distance_To_Fire_Points    0
Soil_Type                             0
Wilderness                            0
target                                0
Distance_To_Hydrology                 0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,Elevation,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness,target,Distance_To_Hydrology
count,406708.0,406708.0,406708.0,406708.0,406708.0,406708.0,406708.0,406708.0,406708.0,406708.0
mean,1.700788,1.243135,0.344242,0.990138,0.999294,0.348181,23.358461,1.114679,2.051472,1.660496
std,0.60693,1.185529,0.518016,0.098817,0.026555,0.581352,9.483622,1.061301,1.396507,0.911737
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,1.0,0.0,0.0,1.0,1.0,0.0,19.0,0.0,1.0,1.0
50%,2.0,1.0,0.0,1.0,1.0,0.0,28.0,1.0,2.0,1.414214
75%,2.0,2.0,1.0,1.0,1.0,1.0,30.0,2.0,2.0,2.236068
max,3.0,3.0,3.0,1.0,1.0,2.0,39.0,3.0,7.0,7.211103


In [None]:
# Separate the ground-truth label from the features. The label is copied
# first, because `df` is stripped of it right afterwards.
y = df['target'].copy()
X = df.drop(columns=['target'])

# `df` is reduced to the feature columns as well, and `Xdf` holds a DataFrame
# of the features whose column names are read later via `Xdf.columns`.
df = df.drop(columns=['target'])
Xdf = pd.DataFrame(X, columns=df.columns)

In [None]:
# Balance the classes by random under-sampling (default sampling strategy).
# The sampler draws rows at random, so a fixed seed is required for the
# resampled data, and every clustering result and score derived from it,
# to be the same after Restart & Run All.
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

## Scaling

In [None]:
# Rescale every feature with a min-max scaler fitted on X itself.
# After this cell X is a NumPy array, no longer a DataFrame.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Question 1

## Part 1

### K Means

In [None]:
# K-means with one cluster per cover type. Centroid initialisation is random,
# so the seed is fixed to keep the cluster assignments (and every score
# computed from them) reproducible across runs.
kmeans = KMeans(n_clusters=7, random_state=42)
# `fit` returns the estimator itself, so `y_kmeans` is the fitted model, not a
# label vector. The name is kept as an alias for any cell that reads
# `y_kmeans.labels_`.
y_kmeans = kmeans.fit(X)
labels_km = kmeans.labels_

In [None]:
# Centroids of the fitted K-means model: one row per cluster.
cluster_centers = kmeans.cluster_centers_ 
cluster_centers

array([[ 1.84590690e-01,  8.88576779e-01,  2.34082397e-01,
         9.42616372e-01,  1.00000000e+00,  8.82825040e-03,
         2.02689632e-01,  8.87506688e-01,  2.03386023e-01],
       [ 5.47660648e-01,  1.68316832e-01,  1.50067948e-01,
         1.00000000e+00,  9.96505533e-01, -1.15185639e-15,
         6.93562116e-01,  9.70685304e-03,  2.10885339e-01],
       [ 6.96285387e-01,  9.03943708e-01,  9.70381280e-02,
         9.90181640e-01,  9.99509082e-01,  1.14383898e-01,
         7.66801354e-01,  4.94190803e-01,  2.69637306e-01],
       [ 3.63073749e-01,  1.66408802e-01,  1.27557160e-01,
         1.00000000e+00,  1.00000000e+00,  3.55853533e-02,
         2.00023803e-01,  7.15660994e-01,  2.05286355e-01],
       [ 7.18482029e-01,  1.38295931e-01,  1.19503740e-01,
         1.00000000e+00,  1.00000000e+00,  1.13847838e-01,
         8.36343733e-01,  6.37839810e-01,  2.70838016e-01],
       [ 6.16406701e-01,  2.44945118e-01,  8.23223570e-02,
         9.99133449e-01,  1.00000000e+00,  6.239168

### Birch Clustering

In [None]:
# Birch clustering into 7 clusters.
# `fit_predict` already fits the model, so the estimator is fitted exactly
# once here. The previous `.fit(X)` followed by `.fit_predict(X)` built the
# whole tree twice for the same result.
model_br = Birch(n_clusters=7)
y_predict_br = model_br.fit_predict(X)
labels_br = model_br.labels_

In [None]:
# Derive one centroid per Birch cluster by fitting a nearest-centroid model
# on the predicted cluster labels, then print the centroid matrix.
clf_br = NearestCentroid().fit(X, y_predict_br)
birch_centroids = clf_br.centroids_
print(birch_centroids)

[[0.24720975 0.46341672 0.19593757 1.         0.99948685 0.01205901
  0.25761747 0.83946975 0.20469488]
 [0.57647776 0.31007516 0.11974406 1.         0.99817185 0.23826935
  0.68885451 0.01401584 0.20119827]
 [0.89767932 0.89820675 0.07700422 1.         1.         0.21281646
  0.93443687 0.62552743 0.36108859]
 [0.76784215 0.15491184 0.12111671 1.         1.         0.15018892
  0.85845766 0.6125105  0.29145033]
 [0.08277405 0.9753915  0.54362416 0.         1.         0.
  0.24987093 0.93512304 0.21722573]
 [0.66666667 1.         0.66666667 0.         0.         0.5
  0.71794872 0.         0.13867505]
 [0.73809524 0.97619048 0.52380952 0.         1.         0.17857143
  0.83699634 0.28571429 0.31357456]]


### Agglomerative Clustering

In [None]:
# Ward-linkage agglomerative clustering into 7 clusters.
# The distance is left at its default (Euclidean, the only one Ward supports).
# The old `affinity=` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4, so passing it raises a TypeError on current releases. Omitting it
# gives the same model on every version.
model_ac = AgglomerativeClustering(n_clusters=7, linkage='ward')
y_predict_ac = model_ac.fit_predict(X)
labels_ac = model_ac.labels_

In [None]:
# Derive one centroid per agglomerative cluster by fitting a nearest-centroid
# model on the predicted cluster labels, then print the centroid matrix.
clf_ac = NearestCentroid().fit(X, y_predict_ac)
agglomerative_centroids = clf_ac.centroids_
print(agglomerative_centroids)

[[0.1704162  0.97000375 0.21991001 0.90776153 0.99943757 0.00168729
  0.22658706 0.90438695 0.20234008]
 [0.0711202  0.22937969 0.24801182 1.         1.         0.
  0.13489941 1.         0.19739042]
 [0.68500363 0.93990559 0.1040305  1.         1.         0.16285403
  0.74741635 0.41394336 0.2608123 ]
 [0.61374637 0.19619232 0.07776702 1.         1.         0.60648596
  0.68615186 0.00709906 0.19963133]
 [0.71216456 0.22435635 0.11006675 1.         1.         0.09521864
  0.80931124 0.61163329 0.27413963]
 [0.36498973 0.27344285 0.1866872  1.         0.99486653 0.04645791
  0.19798084 0.66478439 0.20135995]
 [0.51986532 0.13198653 0.15308642 1.         1.         0.
  0.69134076 0.         0.19987352]]


### Gaussian Mixture Model

In [None]:
# Gaussian mixture with 7 components.
# `fit_predict` fits the model itself, so the earlier extra `.fit(X)` only
# added a second, independently initialised EM run whose result was thrown
# away. The initialisation is random, so the seed is fixed for reproducible
# assignments.
model_gm = GaussianMixture(n_components=7, random_state=42)
y_predict_gm = model_gm.fit_predict(X)
# Hard assignments from the fitted model, used by the plotting and scoring cells.
labels_gm = model_gm.predict(X)


In [None]:
# Derive one centroid per mixture component by fitting a nearest-centroid
# model on the predicted component labels, then print the centroid matrix.
clf_gm = NearestCentroid().fit(X, y_predict_gm)
mixture_centroids = clf_gm.centroids_
print(mixture_centroids)

[[0.84170854 0.40954774 0.11809045 1.         1.         0.12437186
  0.83371988 0.33333333 0.26824569]
 [0.18399327 0.43229742 0.21638534 1.         1.         0.
  0.17163995 0.88189515 0.20138005]
 [0.69444444 0.09722222 0.45833333 1.         0.58333333 0.
  0.80128205 0.15277778 0.50145963]
 [0.66666667 1.         0.07971014 0.84782609 0.97826087 0.06521739
  0.59476031 0.2826087  0.20391286]
 [0.57707676 0.3086225  0.11735016 1.         1.         0.23580442
  0.69220254 0.         0.20381301]
 [0.11889597 0.97452229 0.54352442 0.         1.         0.
  0.28107137 0.91295117 0.22772168]
 [0.67097485 0.45706843 0.1081054  1.         1.         0.13585813
  0.77173553 0.66666667 0.26969981]]


## Part 2

### K Means

In [None]:
# Radar chart of the K-means clusters: average every (scaled) feature within
# each cluster, reshape to long form, and draw one closed polar line per cluster.
clusters = pd.DataFrame(X, columns=Xdf.columns).assign(predicted_target=labels_km)
group = clusters.groupby("predicted_target").mean().reset_index()
cluster_groups = group.melt(id_vars=["predicted_target"])
fig = px.line_polar(
    cluster_groups,
    r="value",
    theta="variable",
    color="predicted_target",
    line_close=True,
    title="Visualisation of k-means clustering",
)
fig.show()

### Birch Clustering

In [None]:
# Radar chart of the Birch clusters: average every (scaled) feature within
# each cluster, reshape to long form, and draw one closed polar line per cluster.
clusters_bc = pd.DataFrame(X, columns=Xdf.columns).assign(predicted_target=labels_br)
group = clusters_bc.groupby("predicted_target").mean().reset_index()
cluster_groups = group.melt(id_vars=["predicted_target"])
fig = px.line_polar(
    cluster_groups,
    r="value",
    theta="variable",
    color="predicted_target",
    line_close=True,
    title="Visualisation of Birch Clustering",
)
fig.show()

### Agglomerative Clustering

In [None]:
# Radar chart of the agglomerative clusters: average every (scaled) feature
# within each cluster, reshape to long form, and draw one closed polar line
# per cluster.
clusters_ac = pd.DataFrame(X, columns=Xdf.columns)
clusters_ac['predicted_target'] = labels_ac
group = clusters_ac.groupby("predicted_target")
group = group.mean().reset_index()
cluster_groups = pd.melt(group, id_vars=["predicted_target"])
# Title fixed: it previously read "Agglomerative Clustering clustering".
fig = px.line_polar(cluster_groups, r="value", theta="variable", color="predicted_target", line_close=True, title="Visualisation of Agglomerative Clustering")
fig.show()

### Gaussian Mixture Model

In [None]:
# Radar chart of the Gaussian-mixture clusters: average every (scaled) feature
# within each cluster, reshape to long form, and draw one closed polar line
# per cluster.
clusters_gm = pd.DataFrame(X, columns=Xdf.columns).assign(predicted_target=labels_gm)
group = clusters_gm.groupby("predicted_target").mean().reset_index()
cluster_groups = group.melt(id_vars=["predicted_target"])
fig = px.line_polar(
    cluster_groups,
    r="value",
    theta="variable",
    color="predicted_target",
    line_close=True,
    title="Visualisation of Gaussian Mixture Model clustering",
)
fig.show()

### 

## Part 3

In [None]:
def mapping(test_labels, y_test, classes=(1, 2, 3, 4, 5, 6, 7)):
    """Best weighted F1 obtainable by relabelling clusters as classes one-to-one.

    Cluster ids are arbitrary, so each cluster must be matched with a distinct
    class before the clustering can be scored against the ground truth. For a
    one-to-one matching the weighted F1 is a sum of independent
    (class ``c``, cluster ``k``) terms::

        support[c] / N  *  2 * overlap[c, k] / (support[c] + size[k])

    so the best matching is a linear assignment problem. Solving it directly
    yields the same optimum as scoring every one of the ``len(classes)!``
    permutations (5040 ``f1_score`` calls for 7 classes), up to floating-point
    rounding, in a fraction of the time.

    Parameters
    ----------
    test_labels : array-like of int
        Cluster id of every sample; each id must lie in
        ``0 .. len(classes) - 1``.
    y_test : array-like
        True class of every sample, same length as ``test_labels``.
    classes : sequence, optional
        Class labels the clusters may be mapped to. Defaults to ``1..7``,
        the set that used to be hard-coded. True labels outside this set can
        never be predicted: they contribute an F1 of 0 but still count
        towards ``N``.

    Returns
    -------
    float
        Highest weighted F1 over all one-to-one cluster-to-class assignments
        (``0.0`` when the input is empty).

    Raises
    ------
    ValueError
        If the two inputs differ in length or a cluster id is outside
        ``0 .. len(classes) - 1``.
    """
    from scipy.optimize import linear_sum_assignment

    class_ids = np.asarray(classes)
    n_classes = class_ids.size
    cluster_ids = np.asarray(test_labels).astype(np.int64).ravel()
    y_true = np.asarray(y_test).ravel()

    if cluster_ids.size != y_true.size:
        raise ValueError("test_labels and y_test must have the same length")
    if y_true.size == 0:
        return 0.0
    if cluster_ids.min() < 0 or cluster_ids.max() >= n_classes:
        raise ValueError("cluster ids must lie in 0 .. len(classes) - 1")

    # overlap[c, k] = number of samples of class c that fell into cluster k.
    overlap = np.zeros((n_classes, n_classes), dtype=np.int64)
    for row, class_id in enumerate(class_ids):
        overlap[row] = np.bincount(cluster_ids[y_true == class_id], minlength=n_classes)

    support = overlap.sum(axis=1)                                # true samples per class
    cluster_size = np.bincount(cluster_ids, minlength=n_classes)  # predicted samples per cluster

    # F1 of class c if cluster k is relabelled as c: 2*TP / (support + predicted).
    # A zero denominator (empty class and empty cluster) scores 0.
    denominator = support[:, None] + cluster_size[None, :]
    pair_f1 = np.divide(
        2.0 * overlap,
        denominator,
        out=np.zeros(overlap.shape, dtype=float),
        where=denominator > 0,
    )
    # Weight each class by its share of the samples (the "weighted" average).
    gain = pair_f1 * (support[:, None] / y_true.size)

    rows, cols = linear_sum_assignment(gain, maximize=True)
    return float(gain[rows, cols].sum())

### K means

In [None]:
# Score the K-means partition against the true labels: AMI compares the two
# labelings directly, while the F1 value is the best one that `mapping` finds
# over one-to-one cluster-to-class relabellings.
ami_km = adjusted_mutual_info_score(y, labels_km)
f1_km = mapping(labels_km, y)
print("AMI score for K-means Clustering Model and target label:")
print(ami_km)
print("F1 score for  K-means model:")
print(f1_km)

AMI score for K-means Clustering Model and target label:
0.3271954656782891
F1 score for  K-means model:
0.39383861774656026


### Birch Clustering

In [None]:
# Score the Birch partition against the true labels: AMI compares the two
# labelings directly, while the F1 value is the best one that `mapping` finds
# over one-to-one cluster-to-class relabellings.
ami_br = adjusted_mutual_info_score(y, labels_br)
f1_br = mapping(labels_br, y)
print("AMI score for Birch Clustering and target label:")
print(ami_br)
print("F1 score for Birch Clustering Model:")
print(f1_br)

AMI score for Birch Clustering and target label:
0.29693917836667943
F1 score for Birch Clustering Model:
0.21335063105938967


### Agglomerative Clustering

In [None]:
# Score the agglomerative partition against the true labels: AMI compares the
# two labelings directly, while the F1 value is the best one that `mapping`
# finds over one-to-one cluster-to-class relabellings.
ami_ac = adjusted_mutual_info_score(y, labels_ac)
f1_ac = mapping(labels_ac, y)
print("AMI score for Agglomerative Clustering and target label:")
print(ami_ac)
print("F1 score for Agglomerative Clustering Model:")
print(f1_ac)

AMI score for Agglomerative Clustering and target label:
0.34335008076215795
F1 score for Agglomerative Clustering Model:
0.3956528769418069


### Gaussian Mixture Model

In [None]:
# Score the Gaussian-mixture partition against the true labels: AMI compares
# the two labelings directly, while the F1 value is the best one that
# `mapping` finds over one-to-one cluster-to-class relabellings.
ami_gm = adjusted_mutual_info_score(y, labels_gm)
f1_gm = mapping(labels_gm, y)
print("AMI score for Gaussian Mixture and target label:")
print(ami_gm)
print("F1 score for Gaussian Mixture Model:")
print(f1_gm)

AMI score for Gaussian Mixture and target label:
0.3289733493692009
F1 score for Gaussian Mixture Model:
0.2206982189791166


## Part 4 

In [None]:
# Agreement between the Gaussian-mixture and agglomerative partitions,
# measured as the adjusted mutual information of the two labelings.
ami_gm_vs_ac = adjusted_mutual_info_score(labels_ac, labels_gm)
print(ami_gm_vs_ac)

0.5574392687684919


In [None]:
# Agreement between the Gaussian-mixture and Birch partitions, measured as
# the adjusted mutual information of the two labelings.
ami_gm_vs_br = adjusted_mutual_info_score(labels_br, labels_gm)
print(ami_gm_vs_br)

0.46241293720836163


In [None]:
# Agreement between the Gaussian-mixture and K-means partitions, measured as
# the adjusted mutual information of the two labelings.
ami_gm_vs_km = adjusted_mutual_info_score(labels_km, labels_gm)
print(ami_gm_vs_km)

0.5700402507139478


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7f51241e-cd4a-4305-b6ca-3ee7cda73020' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>