### Installing required packages

In [None]:
!pip install -r requirements.txt

### Importing Libraries

In [4]:
import pandas as pd
import numpy as np
np.random.seed(seed=0)
import requests
import json
import csv
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score

#### Importing training and testing files

In [7]:
%time
train_df = pd.read_csv('Data/data_tr.txt',delimiter="\t", header=None)
test_df = pd.read_csv('Data/data_ts.txt',delimiter="\t", header=None)


Wall time: 0 ns


#### Defining true labels

In [61]:
# Read the ground-truth labels used to score the test-set clustering.
# The file is parsed as tab-separated; every field of every row becomes one label.
# newline='' is what the csv module documentation asks for when a file object
# is handed to csv.reader, so the reader does its own line-ending handling.
with open('Data/labels_ts.txt', 'r', newline='') as f:
    reader = csv.reader(f, dialect='excel', delimiter='\t')
    labs = list(reader)

# Flatten the list of rows into one flat list. Values stay as strings, exactly
# as read from the file (presumably one label per test sample -- TODO confirm).
true_labels = [val for sublist in labs for val in sublist]

#### Finding highly correlated features

In [69]:
%%time

# This cell takes a lot of time as it calculates correlation for 13k samples.
# I have divided the dataframe into 3 parts and took correlation individually to sped up the execution.

df_1 = train_df.iloc[:,0:5000]
df_2 = train_df.iloc[:,5000:10000]
df_3 = train_df.iloc[:,10000:]


corr_1 = df_1.corr()
upper_triangle_1 = corr_1.where(np.triu(np.ones(corr_1.shape),k=1).astype(np.bool))
to_drop_1 = [column for column in upper_triangle_1.columns if any(upper_triangle_1[column] > 0.8)]

corr_2 = df_2.corr()
upper_triangle_2 = corr_2.where(np.triu(np.ones(corr_2.shape),k=1).astype(np.bool))
to_drop_2 = [column for column in upper_triangle_2.columns if any(upper_triangle_2[column] > 0.8)]

corr_3 = df_3.corr()
upper_triangle_3 = corr_3.where(np.triu(np.ones(corr_3.shape),k=1).astype(np.bool))
to_drop_3 = [column for column in upper_triangle_3.columns if any(upper_triangle_3[column] > 0.8)]

cols_to_drop = to_drop_1 + to_drop_2 + to_drop_3
print("Features with more than or equal to 0.8 correlation: ",cols_to_drop)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Features with more than or equal to 0.8 correlation:  [1099, 1113, 1875, 2093, 2446, 2620, 2937, 3141, 4431, 9586, 10778]
Wall time: 1h 14min 4s


#### Dropping columns having high correlation

In [70]:
%%time
# From the above cell, we got the [1099, 1113, 1875, 2093, 2446, 2620, 2937, 3141, 4431, 9586, 10778] columns as highly correlated

# cols_to_drop = [1099, 1113, 1875, 2093, 2446, 2620, 2937, 3141, 4431, 9586, 10778]

new_train_df = train_df.drop(cols_to_drop,axis = 1)
new_test_df = test_df.drop(cols_to_drop,axis = 1)

Wall time: 3.01 s


#### Data preprocessing

In [71]:
%%time
np.random.seed(0)
# fit scaler on training data
norm = MinMaxScaler(feature_range=(0, 10))
norm_fit = norm.fit(new_train_df)

# transform training data
X_train_norm = norm_fit.transform(new_train_df)

X_test_norm = norm.transform(new_test_df)

Wall time: 4.1 s


#### Dimensionality reduction using PCA

In [72]:
%%time
pca_100 = PCA(n_components=100, random_state=0)
pca_100_fit = pca_100.fit(X_train_norm)
X_train_norm_100 = pca_100_fit.transform(X_train_norm)
X_test_norm_100 = pca_100_fit.transform(X_test_norm)

Wall time: 23.8 s


#### Model training and evaluation

In [73]:
np.random.seed(seed=0)

# Fit BIRCH on the PCA-reduced training data, asking for 16 final clusters.
model_birch = Birch(n_clusters=16,threshold=3.5,branching_factor=400)
model_birch_fit = model_birch.fit(X_train_norm_100)

# Assign each test sample to a cluster and compare the assignment with the
# true labels using the adjusted Rand index.
prediction = model_birch_fit.predict(X_test_norm_100)
ari = adjusted_rand_score(true_labels, prediction.tolist())
print("The adjusted rand score is: ", ari)

The adjusted rand score is:  0.9034293886247964
