## Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dirty_completeness

## Useful Functions

In [None]:
def hists_plot(df):
    """Draw one count plot per column of ``df``, side by side in a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Each column gets its own subplot,
        with bars ordered by the sorted distinct values of that column.
    """
    # squeeze=False always returns a 2-D array of Axes, so a frame with a
    # single column works too (otherwise plt.subplots returns a bare Axes).
    _, axes = plt.subplots(1, len(df.columns), figsize=(20, 5), squeeze=False)
    for ax, col in zip(axes.flatten(), df.columns):
        # Pass the frame by keyword: the first positional argument of
        # countplot has not meant the same thing in every seaborn release.
        sns.countplot(data=df, x=col, ax=ax, order=sorted(df[col].value_counts().index))

In [None]:
def accuracy(df, df_imputed):
    """Return, per column, the fraction of cells where ``df_imputed`` equals ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference data; its columns define which columns are scored.
    df_imputed : pandas.DataFrame
        Data to score. It must contain every column of ``df`` with the same
        index, since the columns are compared element-wise.

    Returns
    -------
    dict
        Maps each column name of ``df`` to the share of matching cells
        (a value between 0 and 1; NaN for an empty column).
    """
    # The mean of the boolean match mask is exactly matches / total.
    return {col: (df[col] == df_imputed[col]).mean() for col in df.columns}

In [None]:
def jaccard_dist(a, b):
    """Return the Jaccard distance between the distinct values of ``a`` and ``b``.

    The distance is ``1 - |A & B| / |A | B|`` where ``A`` and ``B`` are the
    sets of values found in ``a`` and ``b``. Positions are ignored: only
    which values occur matters.

    Parameters
    ----------
    a, b : iterable of hashable values

    Returns
    -------
    float
        0.0 when both inputs hold the same values (or are both empty),
        1.0 when they share none.
    """
    set_a, set_b = set(a), set(b)
    # Size the union from the sets themselves: len(a) + len(b) would count
    # repeated values more than once and inflate the distance.
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Two empty collections are identical; avoid dividing by zero.
        return 0.0
    return 1 - len(set_a & set_b) / union_size

## Read data

In [None]:
# Load the users table from 'users.csv' (path relative to the working directory).
users = pd.read_csv('users.csv')

In [None]:
# Plot the value counts of every column of the loaded data.
hists_plot(users)

## Shuffle data

In [None]:
# Shuffle the rows (frac=1 keeps every row) and renumber the index from 0.
# A fixed random_state makes the row order, and everything derived from it,
# reproducible from one run of the notebook to the next.
users = users.sample(frac=1, axis=0, random_state=1234).reset_index(drop=True)

## Inject null values

In [None]:
# Inject missing values into `users` with the project-local dirty_completeness
# module (fixed seed) and wrap whatever it returns in a NumPy array.
# NOTE(review): presumably injection() returns one dirtied copy of the data
# per completeness level -- confirm against dirty_completeness.
users_dirty = np.array(dirty_completeness.injection(df_pandas=users, seed=1234, name = 'users', name_class="none"))

In [None]:
# Keep only the first injected dataset (index 0 along the first axis).
users_dirty = users_dirty[0,:,:] #50% completeness

In [None]:
# Turn the selected array back into a DataFrame with the original column names.
users_dirty = pd.DataFrame(users_dirty,columns=users.columns)

In [None]:
# Display the data with the injected nulls.
users_dirty

## Visualizing null values

In [None]:
# Heatmap of the boolean null mask: one cell per value, marking where data is missing.
sns.heatmap(users_dirty.isna())

## Imputation

### Simple Imputation FFILL&BFILL

In [None]:
# Forward-fill first. DataFrame.ffill()/bfill() are used because
# fillna(method=...) is deprecated in recent pandas releases.
users_simple_imp = users_dirty.ffill()
# Nulls still present after the forward pass (leading rows have no previous
# value to copy). sum() counts them per column; count() on the boolean mask
# would only report the number of rows.
print(users_simple_imp.isna().sum())
# Back-fill whatever the forward pass could not reach.
users_simple_imp = users_simple_imp.bfill()

In [None]:
# Compare the column distributions: original data, data with injected nulls,
# and data after ffill/bfill imputation.
hists_plot(users)
hists_plot(users_dirty)
hists_plot(users_simple_imp)

In [None]:
# Per-column share of cells that match the original data after ffill/bfill imputation.
accuracy(users, users_simple_imp)

### Advanced Imputation: KNN

In [None]:
# One-hot encode the dirty data. dtype=float lets the indicator columns hold
# NaN directly instead of forcing boolean columns to be upcast on assignment.
users_dirty_one_hot = pd.get_dummies(users_dirty, dtype=float)
for col in users_dirty.columns:
    # get_dummies encodes a missing value as all-zero indicators; put NaN back
    # so the imputer treats those cells as missing. Indicators are named
    # "<col>_<value>", so match on "<col>_" to avoid also hitting a column
    # whose name merely starts with the same characters.
    users_dirty_one_hot.loc[users_dirty[col].isnull(), users_dirty_one_hot.columns.str.startswith(f"{col}_")] = np.nan

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Impute the missing indicator values with a 4-neighbour KNN imputer and
# rebuild a DataFrame that carries the one-hot column names.
knn_imputer = KNNImputer(n_neighbors=4)
users_knn_imp_one_hot = pd.DataFrame(
    knn_imputer.fit_transform(users_dirty_one_hot),
    columns=users_dirty_one_hot.columns,
)

In [None]:
# Collapse the imputed indicators back to a single label per original column.
# NOTE(review): the decoded labels are strings taken from the indicator column
# names; if `users` holds non-string values, cast before comparing -- confirm.
users_knn_imp = pd.DataFrame()
for col in users_dirty.columns:
    # Indicators of this column are named "<col>_<value>".
    is_indicator = users_knn_imp_one_hot.columns.str.startswith(f"{col}_")
    # idxmax picks the indicator with the highest imputed score; drop the
    # "<col>_" prefix by its real length rather than assuming 2-character
    # column names.
    users_knn_imp[col] = users_knn_imp_one_hot.loc[:, is_indicator].idxmax(axis=1).str[len(col) + 1:]

In [None]:
# Compare the column distributions of the original and the KNN-imputed data.
hists_plot(users)
hists_plot(users_knn_imp)

In [None]:
# Per-column share of cells that match the original data after KNN imputation.
accuracy(users,users_knn_imp)

## Clustering

### 1. K-Modes

In [None]:
%pip install kmodes

In [None]:
from kmodes.kmodes import KModes

In [None]:
# Elbow curve to find optimal K
def elbow(df, k_max=16, random_state=None):
    """Plot the K-Modes clustering cost for every k from 1 to ``k_max - 1``.

    Parameters
    ----------
    df : array-like
        Categorical data handed to ``KModes.fit``.
    k_max : int, default 16
        Exclusive upper bound on the number of clusters tried.
    random_state : int or None, default None
        Passed to ``KModes`` so the random initialisation, and therefore the
        curve, can be reproduced. ``None`` keeps the unseeded behaviour.
    """
    ks = range(1, k_max)
    costs = []
    for num_clusters in ks:
        kmode = KModes(n_clusters=num_clusters, init="random", n_init=5,
                       verbose=0, random_state=random_state)
        # Only the fitted cost is needed, so skip the extra predict pass.
        kmode.fit(df)
        costs.append(kmode.cost_)

    plt.plot(ks, costs, 'bx-')
    plt.xlabel('No. of clusters')
    plt.ylabel('Cost')
    plt.title('Elbow Method For Optimal k')


In [None]:
# Draw the elbow curve for the shuffled original data.
elbow(users)

According to the elbow plot, the optimal number of clusters is k=5.

In [None]:
# Fit K-Modes with 5 clusters and a fixed random_state on the KNN-imputed
# data; fit_predict returns the cluster label of every row.
# NOTE(review): the elbow curve was drawn on `users`, while the model is
# fitted on `users_knn_imp` -- confirm this is intended.
kmodes = KModes(n_jobs = -1, n_clusters = 5, init = 'random', random_state = 0)
kmodes.fit_predict(users_knn_imp)

#### Cluster Personas

In [None]:
# Print the centroids of the fitted clusters.
print(kmodes.cluster_centroids_)

In [None]:
# Copy of the original data with the K-Modes label of each row in a 'Cluster' column.
users_cluster_kmodes = users.assign(Cluster=kmodes.labels_)

In [None]:
# Display the data together with its K-Modes cluster labels.
users_cluster_kmodes

### 2. Agglomerative Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

#### Compute the distance matrix using Jaccard Distance

In [None]:
# Materialise every row once as a plain tuple. Calling iterrows() inside the
# comprehension would rebuild a Series for each of the n*n row pairs.
user_rows = list(users.itertuples(index=False, name=None))
# Pairwise Jaccard distance between all rows (n x n matrix).
dist_matrix = np.asarray([[jaccard_dist(a, b) for b in user_rows] for a in user_rows])

In [None]:
# Display the pairwise distance matrix as a table.
pd.DataFrame(dist_matrix)

#### Perform the Agglomerative Clustering

In [None]:
def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted agglomerative clustering model.

    A linkage matrix is assembled from ``model.children_``,
    ``model.distances_`` and the number of samples under each merge, then
    drawn on a new 20x5 figure and shown.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of samples under each merged node: ids below leaf_total are
    # leaves (one sample each); larger ids point at the node produced by an
    # earlier merge, whose size has already been stored.
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if node < leaf_total else node_sizes[node - leaf_total]
            for node in pair
        )

    # Columns: the two merged node ids, the merge distance, the node size.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, node_sizes]
    ).astype(float)

    plt.figure(figsize=(20, 5))
    plt.title("Hierarchical Clustering Dendrogram")
    dendrogram(linkage_matrix, **kwargs)
    plt.xlabel("Number of points in node (or index of point if no parenthesis).")
    plt.show()

In [None]:
import inspect

# scikit-learn 1.2 renamed the `affinity` argument to `metric`, and 1.4
# removed `affinity`; use whichever keyword the installed version accepts.
metric_kw = 'metric' if 'metric' in inspect.signature(AgglomerativeClustering).parameters else 'affinity'
# distance_threshold=0 with n_clusters=None builds the full merge tree, so the
# fitted model exposes the distances_ that plot_dendrogram reads.
clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete', **{metric_kw: 'precomputed'}).fit(dist_matrix)

In [None]:
# plot the top three levels of the dendrogram
# (truncate_mode and p are forwarded to SciPy's dendrogram by plot_dendrogram)
plot_dendrogram(clustering, truncate_mode="level", p=3)

In [None]:
import inspect

# scikit-learn 1.2 renamed the `affinity` argument to `metric`, and 1.4
# removed `affinity`; use whichever keyword the installed version accepts.
metric_kw = 'metric' if 'metric' in inspect.signature(AgglomerativeClustering).parameters else 'affinity'
# Complete-linkage clustering into 5 clusters on the precomputed distance matrix.
agglomerative = AgglomerativeClustering(n_clusters=5, linkage='complete', **{metric_kw: 'precomputed'}).fit(dist_matrix)


In [None]:
# Copy of the original data with the agglomerative label of each row in a
# 'Cluster' column, displayed as the cell output.
users_cluster_agglomerative = users.assign(Cluster=agglomerative.labels_)
users_cluster_agglomerative

In [None]:
# Count how many rows share each distinct (CT, CU, LT, TC) combination;
# the result has one row per combination plus a 'Count' column.
clusterss = users.groupby(['CT','CU','LT','TC']).size().reset_index(name='Count')#.iloc[:,:-1]
clusterss

In [None]:
# Materialise every row once as a plain tuple. Calling iterrows() inside the
# comprehension would rebuild a Series for each of the n*n row pairs.
# NOTE(review): the rows still include the 'Count' column, so group sizes take
# part in the Jaccard distance -- confirm whether it should be dropped first.
cluster_rows = list(clusterss.itertuples(index=False, name=None))
# Pairwise Jaccard distance between the distinct value combinations.
dist_matrix2 = np.asarray([[jaccard_dist(a, b) for b in cluster_rows] for a in cluster_rows])

In [None]:
# Display the distance matrix between value combinations as a table.
pd.DataFrame(dist_matrix2)