In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Read the competition data set into a DataFrame and preview the first rows.
data_path = '../input/tabular-playground-series-jul-2022/data.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Remove the row identifier so it is not used as a feature.
# `errors='ignore'` keeps this cell re-runnable: the original in-place drop
# raised KeyError on a second execution because 'id' was already gone.
df = df.drop(columns='id', errors='ignore')

# Split the remaining columns by storage dtype: int64 columns are treated as
# categorical/count features, float64 columns as continuous features.
categorical = df.select_dtypes(include='int64')
numerical = df.select_dtypes(include='float64')

In [None]:
# Frequency of each value of the integer feature f_08.
sns.countplot(x=df['f_08'])
plt.show()

# f_00 vs f_01 coloured by f_08. The original coloured by positional column 9,
# which did not match the title or the f_08 hue used by the plots below.
plt.scatter(df['f_00'], df['f_01'], c=df['f_08'], cmap='rocket')
plt.title('clustering of f_00 and f_01 with f_08 labels')
plt.show()

# Same view through seaborn. x/y must be passed by keyword and the figure
# size argument is `height` (the old `size` keyword is no longer accepted).
sns.set_style('whitegrid')
sns.lmplot(data=df, x='f_00', y='f_01', hue='f_08', palette='coolwarm',
           height=6, aspect=2, fit_reg=False)
plt.show()

# Overlaid histograms of f_00, one per f_08 value.
sns.set_style('darkgrid')
g = sns.FacetGrid(df, hue='f_08', palette='cubehelix', height=6, aspect=2)
g.map(plt.hist, 'f_00', bins=50, alpha=0.7)

# Clustering using KMeans

In [None]:
from sklearn.cluster import KMeans 
# Partition the rows into seven groups with k-means (fixed seed) and keep the
# cluster index assigned to each row.
kmeans = KMeans(n_clusters=7, random_state=0)
kmeans.fit(df)
kmeans_labels = kmeans.labels_

In [None]:
kmeans.cluster_centers_

In [None]:
sns.countplot(x=kmeans_labels)

# Clustering Using GMM 

In [None]:
from sklearn import mixture
# Fit a 7-component Gaussian mixture with full covariance matrices.
# A fixed random_state makes the fit reproducible under Restart & Run All
# (the original was unseeded, so component numbering and sizes changed per run).
gmm = mixture.GaussianMixture(n_components=7, covariance_type='full', random_state=0).fit(df)
labels = gmm.predict(df)

# Cluster sizes. The vector is passed as `x=` because the first positional
# parameter of countplot is `data`, not the variable to count.
sns.countplot(x=labels)

# Clustering Using BGM

In [None]:
# Two Bayesian Gaussian mixtures that differ only in covariance structure.
# Both are seeded: every later step (pseudo-labels, LightGBM models, ensemble)
# is derived from these assignments, so an unseeded fit made the whole
# notebook non-reproducible.
bgm_common = dict(
    n_components=7,
    weight_concentration_prior_type='dirichlet_distribution',
    random_state=0,
)
bgm1 = mixture.BayesianGaussianMixture(covariance_type='full', **bgm_common).fit(df)
bgm2 = mixture.BayesianGaussianMixture(covariance_type='diag', **bgm_common).fit(df)
labels_trial1, labels_trial2 = bgm1.predict(df), bgm2.predict(df)

In [None]:
# Cluster sizes of the two Bayesian mixtures. The label vector is passed as
# `x=` because countplot's first positional parameter is `data`.
sns.countplot(x=labels_trial1)
plt.show()
sns.countplot(x=labels_trial2)

In [None]:
from sklearn.metrics import adjusted_rand_score, mean_squared_error
# Agreement between the two mixtures. Cluster ids are arbitrary (the same
# partition can be numbered differently by each model), so an RMSE over the
# ids is not meaningful; the adjusted Rand index is invariant to relabelling
# (1.0 = identical partitions, ~0.0 = chance agreement).
adjusted_rand_score(labels_trial1, labels_trial2)

In [None]:
predict_proba = bgm1.predict_proba(df)
predict_proba

In [None]:
# Plot the first two feature columns once per mixture, coloured by its labels.
for assignment, colormap, caption in (
    (labels_trial1, 'rocket', 'clustering of f_00 and f_01 with labels_trial1'),
    (labels_trial2, 'cubehelix', 'clustering of f_00 and f_01 with labels_trial2'),
):
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=assignment, cmap=colormap)
    plt.title(caption)
    plt.show()


## Clustering using LightGBM

In [None]:
training = df.copy()

In [None]:
predict_proba

In [None]:
# Attach the mixture assignment and its membership probabilities.
n_components = predict_proba.shape[1]  # one probability column per component
training['predict'] = labels_trial1
# Probability of the component each row was assigned to, picked in one
# vectorised lookup. The original initialised this column with the integer 0
# and then wrote floats into it cluster by cluster, which triggers pandas'
# incompatible-dtype warning.
training['predict_proba'] = predict_proba[np.arange(len(training)), labels_trial1]
for i in range(n_components):
    training[f'predict_proba_{i}'] = predict_proba[:, i]

In [None]:
# Index labels of the rows whose assignment probability exceeds 0.7, grouped
# cluster by cluster (0..6) as before.
# The original seeded the concatenation with `np.array([])`, a float64 array,
# which turned every index label into a float; the pieces are now
# concatenated directly so the integer dtype of the index is kept.
confident = training.predict_proba > 0.7
train_ind = np.concatenate([
    training.index[confident & (training.predict == i)].to_numpy()
    for i in range(7)
])

In [None]:
X, y = df.loc[train_ind], training.loc[train_ind,'predict']

In [None]:
from lightgbm import LGBMClassifier 
# Train one multiclass gradient-boosted classifier on the confident rows.
# Note: `lgblabels` holds a fitted model (not labels) and is rebound by the
# cross-validation loop in the next cell.
lgb_settings = dict(
    boosting_type='gbdt',
    learning_rate=0.07,
    random_state=42,
    objective='multiclass',
    num_classes=7,
    n_estimators=5000,
)
lgblabels = LGBMClassifier(**lgb_settings)
lgblabels.fit(X, y)

In [None]:
from sklearn.model_selection import StratifiedKFold 
# Fit one classifier per stratified fold (10 unshuffled folds); each model sees
# only the training part of its fold. The fitted models are collected in
# `label_list`, and `lgblabels` ends up bound to the last one.
skf = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)
label_list = []
for fit_rows, _held_out_rows in skf.split(X, y):
    lgblabels = LGBMClassifier(
        boosting_type='gbdt',
        learning_rate=0.07,
        objective='multiclass',
        num_classes=7,
    )
    lgblabels.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    label_list.append(lgblabels)

In [None]:
# Ensemble the fold models by soft voting: add up the class-probability
# matrices of every model in `label_list` and pick the most probable class.
#
# The original loop iterated over `label_list` but always called
# `lgblabels.predict(df)` (the last fitted model), so no ensembling took
# place; it also summed class *ids* and divided by 10, which is only valid
# when all models agree (the mean of ids 0 and 6 is 3, an unrelated class).
proba_sum = sum(fold_model.predict_proba(df) for fold_model in label_list)

# Map the winning column back to the class id through `classes_`, then keep
# the int32 dtype the later cells received before.
lgb_labels2 = label_list[0].classes_[proba_sum.argmax(axis=1)].astype('int32')

In [None]:
# Size of each predicted class; the vector must go in as `x=` because the
# first positional parameter of countplot is `data`.
sns.countplot(x=lgb_labels2)

In [None]:
import lightgbm as lgb 
# Train the same kind of multiclass model through LightGBM's native API.
params_dic = {
    'learning_rate': 0.07,
    'objective': 'multiclass',
    'boosting': 'gbdt',
    'n_jobs': -1,
    'num_classes': 7,
    'verbosity': -1,
}
tr_dataset = lgb.Dataset(X, y)
model = lgb.train(params_dic, tr_dataset, num_boost_round=5000)

In [None]:
from sklearn.metrics import adjusted_rand_score

# For a multiclass objective the native Booster returns one probability per
# class for every row, not class ids. Take the arg-max to get labels; the
# original kept the 2-D probability matrix, which breaks the count plot, the
# comparison below and the tuple zipping in the ensembling section.
lgb_labels = model.predict(df).argmax(axis=1)

# Size of each predicted class.
sns.countplot(x=lgb_labels)
plt.show()

# Agreement between the fold ensemble and the single booster, measured with
# the adjusted Rand index (1.0 = identical partitions) instead of an RMSE over
# class ids.
adjusted_rand_score(lgb_labels2, lgb_labels)

# Ensembling Clustering Algorithms

In [None]:
from collections import defaultdict
from scipy.sparse import csr_matrix 
from sklearn.metrics import adjusted_rand_score
from tqdm import trange 

In [None]:
# The clusterings to be combined, one label per row each.
clusters = [lgb_labels2, lgb_labels]

# One tuple per row holding that row's label in every clustering.
clust_tup_list = list(zip(*clusters))

# Give each distinct label combination a compact id (in sorted order) and
# translate every row to the id of its combination.
zipper = {combo: rank for rank, combo in enumerate(sorted(set(clust_tup_list)))}
zipper_lst = [zipper[combo] for combo in clust_tup_list]

# Reverse lookup: compact id -> set of row positions carrying that combination.
unzipped = defaultdict(set)
for position, zipped_id in enumerate(zipper_lst):
    unzipped[zipped_id].add(position)

In [None]:
[[-1]*3 for _ in range(5)] # To make a (5,3) array

In [None]:
# For every clustering build a list indexed by compact combination id that
# holds the label this clustering gives to rows of that combination
# (-1 marks a slot that was never written).
comp_clusters_list = []
for clust in clusters:
    comp_clusters = [-1] * len(zipper)
    for sample_idx, label in enumerate(clust):
        comp_clusters[zipper_lst[sample_idx]] = label
    comp_clusters_list.append(comp_clusters)

In [None]:
# Toy example: mark every pair (i, j), i < j, whose elements share a label and
# store the marks as the upper triangle of a sparse co-membership matrix.
cl1 = [1, 1, 1, 2, 2, 3]
same_label_pairs = [
    (first, second)
    for first in range(len(cl1))
    for second in range(first + 1, len(cl1))
    if cl1[first] == cl1[second]
]
row = [first for first, _ in same_label_pairs]
col = [second for _, second in same_label_pairs]
data = [1] * len(same_label_pairs)
print(row, col, data)
csr_matrix((data, (row, col)), shape=(6, 6)).toarray()  # Sample sparse matrix

In [None]:
def create_sparse_mat(clust):
    """Build the upper-triangular co-membership matrix of a labelling.

    Parameters
    ----------
    clust : sequence
        One label per element.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (len(clust), len(clust)) with a 1 at (i, j), i < j,
        for every pair of elements that carry equal labels.
    """
    size = len(clust)
    # trange drives the outer index so a progress bar is shown while scanning.
    matching_pairs = [
        (first, second)
        for first in trange(size)
        for second in range(first + 1, size)
        if clust[first] == clust[second]
    ]
    rows = [first for first, _ in matching_pairs]
    cols = [second for _, second in matching_pairs]
    ones = [1] * len(matching_pairs)
    return csr_matrix((ones, (rows, cols)), shape=(size, size))

# One co-membership matrix per clustering, over the compact combination ids.
comp_sparse_matrix = [create_sparse_mat(comp_clust) for comp_clust in comp_clusters_list]

In [None]:
# Blend the co-membership matrices, giving every clustering a fixed weight of 0.5.
comp_sparse_mat = sum(pairwise * 0.5 for pairwise in comp_sparse_matrix)

comp_sparse_mat.toarray()

In [None]:
# Minimum blended weight a pair needs to stay connected.
thrs_val = 0.5

# Densify first, then zero the weak entries. The original applied the boolean
# mask to the sparse matrix itself (`sparse < 0.5` is true for every implicit
# zero, which defeats the sparse format), and a second run of the cell failed
# because the name already held an ndarray without `.toarray()`.
if not isinstance(comp_sparse_mat, np.ndarray):
    comp_sparse_mat = comp_sparse_mat.toarray()
comp_sparse_mat[comp_sparse_mat < thrs_val] = 0
comp_sparse_mat

In [None]:
# One output slot per data row; filled once the combination ids are merged.
cluster_final = [0] * df.shape[0]
# Number of compact combination ids (nodes of the merge graph).
node_end = len(comp_clusters_list[0])
cluter_final_id_no = 0  # not referenced by any other visible cell

# Collect [weight, from, to] for every non-zero entry on or above the diagonal
# and order the edges from heaviest to lightest.
edge_list = sorted(
    (
        [comp_sparse_mat[fr][to], fr, to]
        for fr in range(node_end)
        for to in range(fr, node_end)
        if comp_sparse_mat[fr][to] != 0
    ),
    reverse=True,
)

# Limits used by the merge step: largest allowed merged group (in rows) and
# the number of groups at which merging stops.
siz_max = 18000
clu_max_num = 7

## Using Disjoint Set Union (DSU)
1. To combine two sets (operation `union(a, b)`), we first find the representative of the set that contains a and the representative of the set that contains b. If the representatives are identical, there is nothing to do: the sets are already merged. Otherwise, we make one of the representatives the parent of the other, thereby combining the two sets.
2. union(a, b) - merges the two specified sets (the set in which the element a is located, and the set in which the element b is located)

*Path compression optimization*
This optimization is designed for speeding up find_val.

If we call find_val(v) for some vertex v, we actually find the representative p for all vertices that we visit on the path between v and the actual representative p. The trick is to make the paths for all those nodes shorter, by setting the parent of each visited vertex directly to p.

**NOTE**: Here the union operation merges by size: the root of the smaller set is attached to the root of the larger set.

In [None]:
class DSU:
    """Disjoint-set union over nodes 0..node_end-1 with size tracking.

    Attributes are kept public because the surrounding cells read them:
    indval   -- parent pointer of every node (a root points to itself)
    siz      -- size stored at each root; initialised to len(unzip[node])
    cls_size -- number of disjoint sets currently alive
    """

    def __init__(self, node_end, unzip):
        self.indval = list(range(node_end))
        self.siz = [len(unzip[node]) for node in range(node_end)]
        self.cls_size = node_end

    def find_val(self, x):
        """Return the root of x's set and point every visited node at it."""
        # First pass: climb to the root.
        root = x
        while self.indval[root] != root:
            root = self.indval[root]
        # Second pass: path compression.
        while self.indval[x] != root:
            parent = self.indval[x]
            self.indval[x] = root
            x = parent
        return root

    def get_siz(self, x):
        """Return the size recorded for the set that contains x."""
        return self.siz[self.find_val(x)]

    def union(self, x, y):
        """Merge the sets of x and y, hanging the smaller root under the larger."""
        small, large = self.find_val(x), self.find_val(y)
        if small == large:
            return
        if self.siz[small] > self.siz[large]:
            small, large = large, small
        self.indval[small] = large
        self.siz[large] += self.siz[small]
        self.cls_size -= 1
                   

In [None]:
# Merge combination ids along the heaviest edges first, skipping any merge
# whose two groups together would exceed siz_max rows, and stop as soon as the
# number of groups has dropped to clu_max_num.
dsu = DSU(node_end, unzipped)
for _weight, fr, to in edge_list:
    combined_rows = dsu.get_siz(fr) + dsu.get_siz(to)
    if combined_rows <= siz_max:
        dsu.union(fr, to)
        if dsu.cls_size <= clu_max_num:
            print("Maximum number of clusters reached:{}".format(clu_max_num))
            break

# Write the representative of each combination id to every row that carries it.
for node in trange(node_end):
    cluster_id = dsu.find_val(node)
    for ind_val in unzipped[node]:
        cluster_final[ind_val] = cluster_id


In [None]:
# Renumber the surviving representatives to consecutive ids 0..k-1.
zipper_final = {x: i for i, x in enumerate(sorted(set(cluster_final)))}
clusters_final = [zipper_final[i] for i in cluster_final]

# Size of each final cluster; the vector is passed as `x=` because the first
# positional parameter of countplot is `data`.
sns.countplot(x=clusters_final)

In [None]:
def compare_clusterings(y1, y2, title=''):
    """Show the adjusted Rand score and plot the two clusterings in color.

    Parameters
    ----------
    y1, y2 : array-like of non-negative int
        Cluster label of every sample in the two clusterings (same length).
    title : str, optional
        Figure title. When non-empty the figure is also saved as
        ``<title>.png``; with an empty title nothing is written (the original
        saved to a nameless '.png' path that every call overwrote).
    """
    y1, y2 = np.asarray(y1), np.asarray(y2)
    ars = adjusted_rand_score(y1, y2)
    n1 = y1.max() + 1
    n2 = y2.max() + 1
    # Order samples by the clustering with more clusters first, then by the
    # other one. The secondary cluster count is used as the radix instead of
    # the former hard-coded 100, so the key stays unambiguous for any number
    # of clusters.
    argsort = np.argsort(y1 * n2 + y2) if n1 >= n2 else np.argsort(y2 * n1 + y1)
    plt.figure(figsize=(16, 0.5))
    # Upper band (rows 6..10): y1; lower band (rows 0..4): y2.
    for i in range(6, 11):
        plt.scatter(np.arange(len(y1)), np.full_like(y1, i), c=y1[argsort], s=1, cmap='tab10')
    for i in range(5):
        plt.scatter(np.arange(len(y2)), np.full_like(y2, i), c=y2[argsort], s=1, cmap='tab10')
    plt.gca().axis('off')
    plt.title(f'{title}\nAdjusted Rand score: {ars:.5f}')
    if title:
        plt.savefig(title + '.png', bbox_inches='tight')
    plt.show()

# Compare every input clustering with the ensemble; a distinct title per call
# labels the figure and gives each saved image its own file name.
for position, clust in enumerate(clusters):
    compare_clusterings(np.array(clust), np.array(clusters_final),
                        title=f'clustering {position} vs ensemble')

In [None]:
# Agreement between the ensemble and the single LightGBM booster. The ensemble
# ids were renumbered arbitrarily, so an RMSE over ids is not meaningful; the
# adjusted Rand index is invariant to how the clusters are numbered.
adjusted_rand_score(clusters_final, lgb_labels)

In [None]:
res = pd.read_csv('../input/tabular-playground-series-jul-2022/sample_submission.csv')
res.head()

In [None]:
# Remove the placeholder predictions. `errors='ignore'` keeps the cell
# re-runnable; the in-place drop raised KeyError on a second execution.
res = res.drop(columns='Predicted', errors='ignore')

In [None]:
res['Predicted'] = clusters_final

In [None]:
res['Predicted'] = lgb_labels2

In [None]:
res

In [None]:
res.to_csv('MyOutputbgm_lgbm2',index=False)