Useful websites for a Data Scientist:

* Towards Data Science: https://towardsdatascience.com/
* Medium: https://medium.com/
* Stack Overflow: https://stackoverflow.com/
* Stack Exchange: https://stackexchange.com/
* Kaggle: https://www.kaggle.com/

### Import libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime

# Show every column when a dataframe is displayed instead of truncating wide frames
pd.options.display.max_columns = None
# Timestamp captured once when this cell runs; later cells read `now.year`
now = datetime.datetime.now()

from pandas_profiling import ProfileReport      # Install pandas_profiling and ipywidgets
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans

### Read dataset

In [None]:
# Load the tab-separated campaign export; `df_initial` is kept as the untouched raw frame
df_initial = pd.read_csv('marketing_campaign.csv', sep='\t')
# Every following cell operates on `df`, which was never defined (NameError on first use).
# Work on a copy so the preprocessing below cannot alter the raw data.
df = df_initial.copy()

In [None]:
# Preview the first five rows of the working dataframe
df.head(n=5)

In [None]:
# Build a profiling report for the whole dataset in one go, so the notebook is not
# cluttered with exploratory cells. The HTML file is written to the working folder.
report_name = "Report.html"
report = ProfileReport(df, title=report_name)
report.to_file(report_name)

### Data preprocessing

##### Datetime Columns

In [None]:
# Reference year for both derived columns
this_year = now.year

# Age: reference year minus year of birth
df['Year_Birth'] = this_year - df['Year_Birth']

# Relationship length: reference year minus the year the customer enrolled.
# NOTE(review): the day/month order of the raw 'Dt_Customer' strings is not visible
# here; only the year is kept, but confirm the parser handles the file's format.
enrolment_dates = pd.to_datetime(df['Dt_Customer'])
df['Dt_Customer'] = this_year - enrolment_dates.dt.year

# Give both derived columns names that describe their new content
df.rename(
    columns={'Year_Birth': 'Age', 'Dt_Customer': 'Relationship years'},
    inplace=True,
)

##### Categorical Columns

In [None]:
# 'Education' is dropped: most customers have some degree, so it was judged uninformative
df.drop(columns=['Education'], inplace=True)

# Collapse 'Marital_Status' into a binary flag:
#   1 -> in a relationship ('Married' or 'Together'), 0 -> every other status.
# The previous version called `.replace(..., inplace=True)` on the column selection.
# That is chained in-place mutation: pandas warns about it and, with Copy-on-Write
# enabled, it no longer updates `df`, so 'Married' rows would silently be encoded
# as 0. Testing membership directly avoids mutating an intermediate object.
in_relationship = df['Marital_Status'].isin(['Married', 'Together'])
df['Marital_Status'] = np.where(in_relationship, 1, 0)

The numerical variables need no further treatment; they look fine in my opinion.

##### Drop categorical columns

We drop them because they must not be included in a clustering model, at least not in K-Means.

In [None]:
# All column names currently present in the dataframe
cols_list = df.columns.tolist()

# A column counts as categorical when it holds at most two distinct non-null values;
# in this dataset those are the binary (or constant) columns
categorical_cols = [name for name in cols_list if df[name].nunique() <= 2]

# Remove the categorical columns
df.drop(columns=categorical_cols, inplace=True)

# Use the customer ID as the row index
df.set_index('ID', inplace=True)

I also decided to drop the columns 'Kidhome' and 'Teenhome' since I consider them as being more categorical than numerical.

In [None]:
# Household counts are treated as categorical here, so they are removed as well
df.drop(columns=['Kidhome', 'Teenhome'], inplace=True)

##### Check for missing values

In [None]:
# Flag every row that has at least one missing value
has_missing = df.isna().any(axis=1)
# Keep those rows in their own dataframe for inspection
df_nulls = df.loc[has_missing]
df_nulls.head()

In [None]:
# Rows with gaps are removed outright rather than imputed with the column average
df.dropna(inplace=True)
# Sanity check: count the rows that still hold a missing value after the drop
rows_with_nan = df[df.isna().any(axis=1)].shape[0]
print(f"Number of rows containing NaN's: {rows_with_nan}")

##### Correlation Matrix after we've cleaned our dataframe

In [None]:
# Pairwise correlations of the remaining columns
corr = df.corr()

# Draw the annotated heatmap on a dedicated wide figure
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr, annot=True, cmap="Greens", ax=ax)
plt.show()

### Data scaling

In [None]:
# Scaler fitted on the cleaned dataframe
scaler = StandardScaler()
# Fit and transform in one step; the result is a plain array
scaled_values = scaler.fit_transform(df)
# Wrap the array back into a dataframe carrying the original column names
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)

### Elbow method

In [None]:
# Candidate cluster counts: 1 through 10
max_clusters = 10
number_of_clusters = list(range(1, max_clusters + 1))

# Inertia (within-cluster sum of squared distances) for each candidate
sum_of_squared_distance = []
for k in number_of_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(df_scaled)
    sum_of_squared_distance.append(kmeans.inertia_)

In [None]:
# Configure the style and figure size in a single call. `sns.set` is an alias of
# `sns.set_theme`, so a separate `sns.set(rc=...)` call re-applies the default
# "darkgrid" style and undoes the requested "whitegrid".
sns.set_theme(style='whitegrid', rc={'figure.figsize': (13, 7)})

# Inertia against the number of clusters; the "elbow" suggests a reasonable K
plt.plot(number_of_clusters, sum_of_squared_distance, 'go--')
plt.xlabel('Number of Clusters', fontsize=13)
plt.ylabel('Within Cluster Sum of squares', fontsize=13)
plt.title('Elbow Curve to find optimum K', fontsize=15)
plt.grid(True)
plt.show()

### Silhouette Score

In [None]:
# 3 x 2 grid: one silhouette plot per cluster count from 2 to 7, filled row by row
fig, ax = plt.subplots(3, 2, figsize=(15, 8))
for n_clusters, panel in zip(range(2, 8), ax.flatten()):
    # Unfitted KMeans model for this cluster count
    km = KMeans(n_clusters=n_clusters, random_state=42)
    # The visualizer fits the model and draws onto the given panel
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
    visualizer.fit(df_scaled)

### Davies-Bouldin Score

In [None]:
# Candidate cluster counts: 2 through 10
number_of_clusters = list(range(2, 11))
# Maps each cluster count to its Davies-Bouldin index
results = {}

for k in number_of_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(df_scaled)
    results[k] = davies_bouldin_score(df_scaled, labels)

In [None]:
# Configure the style and figure size in a single call. `sns.set` is an alias of
# `sns.set_theme`, so a separate `sns.set(rc=...)` call re-applies the default
# "darkgrid" style and undoes the requested "whitegrid".
sns.set_theme(style='whitegrid', rc={'figure.figsize': (13, 7)})

# Davies-Bouldin index for each candidate number of clusters
plt.plot(list(results.keys()), list(results.values()), 'go--')
plt.xlabel('Number of Clusters', fontsize=13)
plt.ylabel('Davies-Bouldin Index', fontsize=13)
plt.title('Davies-Bouldin Score', fontsize=15)
plt.grid(True)
plt.show()

### Apply the algorithm

In [None]:
# Fit a 3-cluster KMeans model on the scaled data. Note that `kmeans` ends up
# holding the predicted cluster label of every row, not the model itself.
final_model = KMeans(n_clusters=3, random_state=42)
kmeans = final_model.fit_predict(df_scaled)

# For inspection only: how many rows landed in each cluster.
# zip() pairs each label with its count so the two arrays become one dictionary.
unique, counts = np.unique(kmeans, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print(f"Created clusters - {cluster_sizes}")

# Attach the labels to the unscaled data, aligned by row position.
# NOTE(review): reset_index(drop=True) discards the ID index, so the resulting
# frame no longer says which customer each row belongs to.
clusters = pd.Series(kmeans, name='Clusters').to_frame()
df_clusters = pd.concat([df.reset_index(drop=True), clusters], axis=1)

In [None]:
# Profile each cluster by the mean of every column
df_clusters.groupby(by='Clusters').mean()