In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used in later cells, so API removals will surface as hard errors instead.
warnings.filterwarnings('ignore')

In [None]:
# Load the bank-marketing CSV; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('../input/bank-marketing-dataset/bank.csv')
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Keep the first six columns, discard "default", and keep only the rows with a
# strictly positive balance.
# Written as a single chain that returns a new frame (no inplace mutation of an
# intermediate slice). errors='ignore' lets the cell be re-run after "default"
# has already been removed instead of failing with a KeyError.
df = (
    df.iloc[:, :6]
      .drop(columns="default", errors="ignore")
      .loc[lambda frame: frame["balance"] > 0]
)
df

In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the repeated rows; the original index labels of the kept rows are retained.
df = df.drop_duplicates()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Report the number of rows currently held in df.
print("The total number of data-points after removing the rows with duplicated values are:", len(df))

In [None]:
# Frequency of each level in the two categorical columns of interest.
job_counts = df["job"].value_counts()
education_counts = df["education"].value_counts()

print("Total categories in the feature Job:\n", job_counts, "\n")
print("Total categories in the feature Education:\n", education_counts)

In [None]:
# Boolean mask over the dtype Series: True where a column is stored as object.
is_object = df.dtypes == 'object'
# Index the mask with itself to keep only the True entries, then take their labels.
object_cols = is_object[is_object].index.tolist()

print("Categorical variables in the dataset:", object_cols)

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
df.describe().T

In [None]:
# Split the columns into continuous and categorical feature lists.
# Every numeric dtype counts as continuous (not just int64), so a float or
# int32 column is not mistaken for a category and one-hot encoded. This also
# keeps the split stable if the cell is re-run after the numeric columns have
# been replaced by float values. Boolean columns are not matched by 'number'
# and therefore stay categorical. Column order is preserved in both lists.
cont_features = df.select_dtypes(include='number').columns.tolist()
cat_features = [c for c in df.columns if c not in cont_features]

In [None]:
# Number of distinct values in each categorical column.
df[cat_features].nunique()

In [None]:
# One-hot encode the categorical columns into a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on installs that do not know it yet.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

encoded = OH_encoder.fit_transform(df[cat_features])

# Give the dummy columns string labels derived from the encoder. Leaving the
# default integer labels would mix int and str column names in df_OH, which
# newer scikit-learn estimators refuse to fit on.
if hasattr(OH_encoder, 'get_feature_names_out'):
    encoded_names = OH_encoder.get_feature_names_out(cat_features)
else:  # scikit-learn < 1.0
    encoded_names = OH_encoder.get_feature_names(cat_features)

# Reuse df's index so the concat below aligns row by row.
OH_cols = pd.DataFrame(encoded, columns=encoded_names, index=df.index)

# Untouched copy of the non-categorical columns.
df_cont = df.drop(columns=cat_features)

# Continuous columns first, followed by the one-hot indicator columns.
df_OH = pd.concat([df_cont, OH_cols], axis=1)

In [None]:
# Two stacked panels showing the distribution of each continuous feature as it
# currently stands in df_OH.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 8))
age_values, balance_values = df_OH['age'], df_OH['balance']
sns.distplot(age_values, ax=ax1)
sns.distplot(balance_values, ax=ax2)

In [None]:
# Derive both scaled feature sets from the untouched values in `df_cont`
# rather than from the frames being overwritten. Re-running the cell then
# reproduces the same result instead of taking the log of already standardised
# (partly negative) values, which yields NaN.
raw_cont = df_cont[cont_features]

# df_OH: log-transform, then standardise to zero mean / unit variance.
df_OH[cont_features] = StandardScaler().fit_transform(np.log(raw_cont))

# df: standardise only (no log). `scaler` is left fitted on these raw values.
scaler = StandardScaler()
df[cont_features] = scaler.fit_transform(raw_cont)

# Distributions of the transformed features in df_OH.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8,8))
sns.distplot(df_OH['age'], ax=ax1)
sns.distplot(df_OH['balance'], ax=ax2)

In [None]:
# Elbow plot over candidate cluster counts for the continuous features only.
# The estimator is seeded with the same random_state as the KMeans fits in the
# other cells so the curve is reproducible between runs.
Elbow_M = KElbowVisualizer(KMeans(random_state=0), k=10)
Elbow_M.fit(df[cont_features])
Elbow_M.show()

In [None]:
# Four-cluster KMeans on the continuous features held in df.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=30,
    max_iter=100,
    random_state=0,
)
clusters = kmeans.fit_predict(df[cont_features])

# Attach the labels to df and colour the age/balance scatter by cluster.
df['cluster'] = clusters
sns.relplot(data=df, x='age', y='balance', hue='cluster')

In [None]:
# Reload the data and repeat the same column/row filtering as before, so the
# summary is computed on unscaled values and the rows line up positionally
# with the labels in `clusters`.
df = (
    pd.read_csv('../input/bank-marketing-dataset/bank.csv')
      .iloc[:, :6]
      .drop(columns='default')
      .loc[lambda frame: frame['balance'] > 0]
      .drop_duplicates()
)
df['cluster'] = clusters

# Aggregate the numeric columns only. median/mean are undefined for the text
# columns: pandas >= 2.0 raises a TypeError for them, while older versions
# silently dropped those columns from the result.
numeric_cols = df.select_dtypes(include='number').columns.drop('cluster').tolist()
groups = (
    df.groupby('cluster')[numeric_cols]
      .agg(['min', 'median', 'mean', 'max', 'sum', 'count'])
      .round()
)

# Each cluster's share of the summed balance, as a percentage.
groups['pct_total'] = (groups['balance']['sum'] / groups['balance']['sum'].sum()).round(3)*100
groups

In [None]:
# Elbow plot over candidate cluster counts for the full encoded feature set.
# A 'cluster' column is excluded if present, so re-running this cell after
# labels have been written into df_OH does not feed them back in as a feature.
elbow_features = df_OH.drop(columns='cluster', errors='ignore')

# Seeded like the other KMeans fits in the notebook so the curve is reproducible.
Elbow_M = KElbowVisualizer(KMeans(random_state=0), k=10)
# Pass a plain array: the values are unchanged, and the fit no longer depends
# on the column labels all being of one type.
Elbow_M.fit(elbow_features.to_numpy())
Elbow_M.show()

In [None]:
# Five-cluster KMeans on the full encoded feature set.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=30, max_iter=100, random_state=0)

# Leave out any 'cluster' column written by a previous run of this cell;
# otherwise a re-run would cluster on its own earlier labels.
cluster_inputs = df_OH.drop(columns='cluster', errors='ignore')
# Pass a plain array: the values are unchanged, and the fit no longer depends
# on the column labels all being of one type.
clusters = kmeans.fit_predict(cluster_inputs.to_numpy())

# Attach the labels and colour the age/balance scatter by cluster.
df_OH['cluster'] = clusters
sns.relplot(x='age', y='balance', hue='cluster', data=df_OH)

In [None]:
# Reload the data, repeat the same column/row filtering, and attach the latest
# cluster labels positionally.
df = (
    pd.read_csv('../input/bank-marketing-dataset/bank.csv')
    .iloc[:, :6]
    .drop(columns='default')
    .loc[lambda frame: frame['balance'] > 0]
    .drop_duplicates()
    .assign(cluster=clusters)
)

# One row per (cluster, job, marital, education) combination.
segment_keys = ['cluster', 'job', 'marital', 'education']
groups = df.groupby(segment_keys).agg(['median', 'sum', 'count']).round()

# Each segment's share of the summed balance, as a percentage.
balance_totals = groups['balance']['sum']
groups['pct_total'] = (balance_totals / balance_totals.sum()).round(3)*100

In [None]:
# Order the grouped rows by their pct_total value, largest first.
top = groups.sort_values(by='pct_total', ascending=False)
# Show the 60 highest-ranked rows.
top.head(60)