## Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dirty_completeness

## Useful Functions

In [None]:
def hists_plot(df):
    """Draw one count plot per column of ``df``, side by side in a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted as categorical variables. Each column
        gets its own subplot, with bars ordered by sorted category value.

    Returns
    -------
    None
        The figure is created as a side effect; nothing is returned.
    """
    n_cols = len(df.columns)
    if n_cols == 0:
        # plt.subplots rejects a zero-column grid; there is nothing to draw.
        return
    # squeeze=False always yields a 2-D array of Axes, so a single-column frame
    # (for which plt.subplots would otherwise return a bare Axes) also works.
    _, axes = plt.subplots(1, n_cols, figsize=(20, 5), squeeze=False)
    for ax, col in zip(axes.ravel(), df.columns):
        # value_counts() drops NaN by default, so missing values never reach sorted().
        categories = sorted(df[col].value_counts().index)
        # Pass the frame by keyword: seaborn versions differ on positional arguments.
        sns.countplot(data=df, x=col, ax=ax, order=categories)

In [None]:
def accuracy(df, df_imputed):
    """Return, for each column of ``df``, the fraction of rows whose value in
    ``df_imputed`` equals the value in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference frame; its columns drive the comparison.
    df_imputed : pandas.DataFrame
        Frame to score; must provide every column of ``df``.

    Returns
    -------
    dict
        Maps column label -> share of matching rows (between 0 and 1).
    """
    scores = {}
    for name in df.columns:
        # Element-wise equality, materialised as a plain boolean array.
        matches = np.asarray(df[name] == df_imputed[name], dtype=bool)
        # Number of True entries divided by the number of rows compared.
        scores[name] = matches.sum() / len(matches)
    return scores

## Read data

In [None]:
# Load the dataset from users.csv (path is relative to the working directory).
users = pd.read_csv('users.csv')

In [None]:
# Plot the per-column value counts of the data as loaded.
hists_plot(users)

## Shuffle data

In [None]:
# Shuffle all rows (frac=1) and renumber them 0..n-1.
# random_state pins the permutation so that a fresh Restart & Run All reproduces
# the same row order, and therefore the same downstream results.
# Note: the cell overwrites `users`, so running it twice permutes the rows twice.
users = users.sample(frac=1, axis=0, random_state=1234).reset_index(drop=True)

## Inject null values

In [None]:
# Inject missing values into `users` with the project-local dirty_completeness helper.
# NOTE(review): the result is wrapped in np.array and indexed as [0, :, :] in the
# next cell, so injection() presumably returns a sequence of equally shaped tables
# (one per completeness level) — confirm against dirty_completeness.
users_dirty = np.array(dirty_completeness.injection(df_pandas=users, seed=1234, name = 'users', name_class="none"))

In [None]:
# Keep only the first table of the stacked array.
# NOTE(review): per the inline note this is the 50%-completeness variant — confirm.
users_dirty = users_dirty[0,:,:] #50% completeness

In [None]:
# Wrap the selected array back into a DataFrame, reusing the column labels of `users`.
users_dirty = pd.DataFrame(users_dirty,columns=users.columns)

In [None]:
# Display the frame after null injection.
users_dirty

## Visualizing null values

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature.
sns.heatmap(users_dirty.isna())

## Simple Imputation

In [None]:
# Simple imputation: carry the previous row's value forward, then back-fill any
# nulls left at the top of a column (which have no previous row to copy from).
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
users_simple_imp = users_dirty.ffill().bfill()
# Sanity check shown as the cell output: remaining nulls per column.
users_simple_imp.isna().sum()

In [None]:
# Compare the value distributions before injection and after simple imputation.
hists_plot(users)
# Uncomment to also plot the distribution of the null-injected data:
#hists_plot(users_dirty)
hists_plot(users_simple_imp)

In [None]:
# Per-column share of rows where the simple imputation matches the original value.
accuracy(users, users_simple_imp)

## Advanced Imputation

In [None]:
# One-hot encode the categorical columns. dtype=float keeps the dummies numeric,
# so NaN can be written into them below without changing the column dtype.
users_dirty_one_hot = pd.get_dummies(users_dirty, dtype=float)
# get_dummies encodes a missing value as an all-zero row. Restore it as NaN across
# that column's whole dummy group so the imputer treats it as missing.
for col in users_dirty.columns:
    # Match on "<col>_" (get_dummies' default prefix separator) rather than on
    # "<col>" alone, so a column whose name merely begins with `col` is not hit.
    dummy_cols = users_dirty_one_hot.columns.str.startswith(f"{col}_")
    users_dirty_one_hot.loc[users_dirty[col].isnull(), dummy_cols] = np.nan

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Fill the NaN entries of the one-hot matrix with a 4-nearest-neighbour imputer,
# then wrap the resulting array in a DataFrame carrying the one-hot column labels.
knn_imputer = KNNImputer(n_neighbors=4)
users_knn_imp_one_hot = pd.DataFrame(
    knn_imputer.fit_transform(users_dirty_one_hot),
    columns=users_dirty_one_hot.columns,
)

In [None]:
# Collapse the imputed one-hot matrix back to one categorical value per column:
# within each column's dummy group, pick the dummy with the highest score and
# recover the category from its label.
decoded = {}
for col in users_dirty.columns:
    # Dummy labels have the form "<col>_<category>". Matching on "<col>_" avoids
    # picking up dummies of another column whose name merely begins with `col`.
    prefix = f"{col}_"
    group = users_knn_imp_one_hot.loc[:, users_knn_imp_one_hot.columns.str.startswith(prefix)]
    # Strip exactly this column's prefix instead of a fixed number of characters,
    # so column names of any length decode correctly.
    decoded[col] = group.idxmax(axis=1).str[len(prefix):]
users_knn_imp = pd.DataFrame(decoded)

In [None]:
# Compare the value distributions before injection and after KNN imputation.
hists_plot(users)
hists_plot(users_knn_imp)

In [None]:
# Per-column share of rows where the KNN imputation matches the original value.
# NOTE(review): users_knn_imp holds strings recovered from dummy column labels; if
# any column of `users` was parsed as a non-string dtype, the equality check would
# never match for that column — confirm the dtypes agree.
accuracy(users,users_knn_imp)

## Clustering

Because the dataset is categorical, the clustering is performed with K-Modes.

In [None]:
# Install the kmodes package into the kernel's environment.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install kmodes

In [None]:
from kmodes.kmodes import KModes

In [None]:
# Elbow curve to find optimal K: fit K-Modes for k = 1..14 and plot each fit's cost.
# Exclude a 'Cluster' column if a later cell has already added one, so re-running
# this cell never clusters on previously assigned labels.
elbow_features = users_knn_imp.drop(columns='Cluster', errors='ignore')
cost = []
K = range(1,15)
for num_clusters in K:
    # random_state makes the random initialisation repeatable between runs;
    # verbose=0 keeps the per-iteration log out of the notebook output.
    kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=0, random_state=0)
    kmode.fit(elbow_features)
    cost.append(kmode.cost_)

plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()


According to the elbow plot, the optimal number of clusters is k = 6.

In [None]:
# Final K-Modes model with the number of clusters chosen from the elbow plot.
kmodes = KModes(n_jobs = -1, n_clusters = 6, init = 'random', random_state = 0)
# Fit on the feature columns only: a 'Cluster' column left over from a previous
# run of the notebook is excluded so that re-running this cell gives the same fit.
# The returned label array is shown as the cell output.
kmodes.fit_predict(users_knn_imp.drop(columns='Cluster', errors='ignore'))

In [None]:
# Attach the fitted cluster label of each row as a new 'Cluster' column.
# Note: this mutates users_knn_imp in place.
users_knn_imp['Cluster'] = kmodes.labels_

In [None]:
# Display the imputed frame together with its cluster assignments.
users_knn_imp