# Projekt 2 - EDA
**Mikołaj Spytek, Artur Żółkowski**

W tym projekcie zajmujemy się klasteryzacją danych dotyczących aktywności użytkowników sklepu internetowego.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.manifold import TSNE
import sklearn
import seaborn as sns
from sklearn.cluster import KMeans

import random

# Seed both generators: `random.seed` only affects the stdlib module, while
# NumPy's global generator is seeded separately.
random.seed(42)
np.random.seed(42)

In [None]:
# Read the online shoppers CSV into the working DataFrame `data`.
data = pd.read_csv(filepath_or_buffer="data/online_shoppers_intention.csv")

### Przygotowanie danych

In [None]:
def encode(data, col, max_val):
    """Add sine/cosine encodings of a cyclic column to ``data`` in place.

    Args:
        data: DataFrame holding the column; it is modified in place.
        col: Name of the cyclic column.
        max_val: Length of one full cycle, in the units of ``col``.

    Returns:
        The same ``data`` object, now with ``<col>_sin`` and ``<col>_cos``.
    """
    angle = 2 * np.pi * data[col] / max_val
    data[f"{col}_sin"] = np.sin(angle)
    data[f"{col}_cos"] = np.cos(angle)
    return data

In [None]:
# Translate the textual month labels into their calendar numbers (1-12).
month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "June",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months = {label: number for number, label in enumerate(month_labels, start=1)}
data["Month"] = data["Month"].map(months)

In [None]:
# Add Month_sin / Month_cos so the month is represented on a 12-step cycle.
data = encode(data, col='Month', max_val=12)

In [None]:
# Keep the Axes returned by `scatter` before adjusting it: chaining
# `.set_aspect(...)` onto the call would bind `ax` to that method's return
# value (None) instead of the Axes.
ax = data.plot.scatter('Month_sin', 'Month_cos')
ax.set_aspect('equal')

In [None]:
# Count, duration, rate and page-value features.
log_vars = [
    "Administrative",
    "Administrative_Duration",
    "Informational",
    "Informational_Duration",
    "ProductRelated",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
]
# All numeric features: the ones above plus the special-day and month columns.
num_vars = log_vars + ["SpecialDay", "Month_sin", "Month_cos"]
# Categorical features.
cat_vars = [
    "OperatingSystems",
    "Browser",
    "Region",
    "VisitorType",
    "Weekend",
    "TrafficType",
]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OrdinalEncoder

scaler = StandardScaler()

# log1p for the columns in `log_vars`, ordinal codes for `cat_vars`;
# every other column is passed through untouched.
log_step = ('log', FunctionTransformer(np.log1p), log_vars)
cat_step = ('cat', OrdinalEncoder(), cat_vars)
preprocessor = ColumnTransformer(
    transformers=[log_step, cat_step],
    remainder='passthrough',
)

# The raw month and the Revenue column are excluded from the model input.
transformed_data = preprocessor.fit_transform(data.drop(columns=['Month', 'Revenue']))

In [None]:
transformed_data = scaler.fit_transform(transformed_data)

# The ColumnTransformer does not keep the input column order: it emits the
# `log` columns, then the `cat` columns, then the passthrough remainder in
# its original order. Label the frame in that order so that each name sits
# on the column it actually describes.
input_columns = data.drop(['Month', 'Revenue'], axis=1).columns
passthrough_columns = [
    column for column in input_columns
    if column not in log_vars and column not in cat_vars
]
transformed_data = pd.DataFrame(
    transformed_data,
    columns=log_vars + cat_vars + passthrough_columns,
)

### Klastrowania

In [None]:
def count_clustering_scores(X, cluster_num, model, score_fun, **model_kwargs):
    """Score clusterings of ``X`` for one or several cluster counts.

    Args:
        X: Data handed unchanged to ``fit_predict`` and to ``score_fun``.
        cluster_num: A single cluster count (Python or NumPy integer) or an
            iterable of counts.
        model: Callable, typically an estimator class, invoked as
            ``model(n_clusters=k, **model_kwargs)``. The object it returns
            must provide ``fit_predict(X)``.
        score_fun: Callable ``score_fun(X, labels)`` returning the score.
        **model_kwargs: Extra keyword arguments forwarded to ``model``
            (for example ``random_state``).

    Returns:
        One score when ``cluster_num`` is a single integer, otherwise a list
        with one score per requested count, in iteration order.
    """
    # NumPy integer scalars are not instances of `int`, so check both.
    single = isinstance(cluster_num, (int, np.integer))
    counts = [cluster_num] if single else cluster_num

    scores = [
        score_fun(X, model(n_clusters=k, **model_kwargs).fit_predict(X))
        for k in counts
    ]
    return scores[0] if single else scores

In [None]:
# Davies-Bouldin score of KMeans for k = 2..19.
cluster_num_seq = range(2, 20)
davies_vec = count_clustering_scores(
    X=transformed_data,
    cluster_num=cluster_num_seq,
    model=KMeans,
    score_fun=davies_bouldin_score,
)

plt.figure(figsize=(12, 8))
plt.plot(cluster_num_seq, davies_vec, 'bx-')
plt.ylabel('davies-bouldin score')
plt.xlabel('k')
plt.show()

In [None]:
# Silhouette score of KMeans for k = 2..19.
cluster_num_seq = range(2, 20)
silhouette_vec = count_clustering_scores(
    X=transformed_data,
    cluster_num=cluster_num_seq,
    model=KMeans,
    score_fun=silhouette_score,
)

plt.figure(figsize=(12, 8))
plt.plot(cluster_num_seq, silhouette_vec, 'bx-')
plt.ylabel('Silhouette score')
plt.xlabel('k')
plt.show()

#### Pierwszy przykładowy model

In [None]:
# First example model: KMeans with 12 clusters and a fixed seed.
model_km = KMeans(random_state=42, n_clusters=12)
labels_km = model_km.fit(transformed_data).labels_

In [None]:
# Attach the 12-cluster KMeans labels to both the scaled and the raw frame.
# NOTE(review): this makes `cluster` a column of `transformed_data`; any later
# fit on that frame has to exclude it, otherwise the labels act as a feature.
for frame in (transformed_data, data):
    frame["cluster"] = labels_km

In [None]:
# t-SNE projector used for the cluster plots; seeded so reruns match.
tSNE = TSNE(verbose=1, random_state=42, learning_rate=300)

In [None]:
# Embed the features only. A "cluster" column stored on `transformed_data`
# by an earlier cell would otherwise be embedded as if it were a feature and
# pull same-label points together in the picture.
tSNE_td = tSNE.fit_transform(
    transformed_data.drop(columns="cluster", errors="ignore")
)

In [None]:
# t-SNE view of the 12-cluster KMeans result; marker style encodes Revenue.
tsne_x, tsne_y = tSNE_td[:, 0], tSNE_td[:, 1]
km12_palette = sns.color_palette("hls", 12)

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=tsne_x,
    y=tsne_y,
    hue=labels_km,
    style=data["Revenue"],
    palette=km12_palette,
    alpha=0.5,
    legend=True,
)
plt.show()

In [None]:
# One boxplot per numeric feature, split by cluster, on a 4x3 grid that is
# filled row by row.
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(14, 14))
for position, feature in enumerate(num_vars):
    sns.boxplot(data=data, x="cluster", y=feature, ax=ax.flat[position])
plt.tight_layout()
plt.show()

In [None]:
# Per-cluster sum and row count of every column; only Revenue is shown.
results = data.groupby("cluster").aggregate(['sum', 'count'])
results["Revenue"]

#### Porównanie wyników różnych modeli

In [None]:
# Candidate models; the number of clusters/components is set inside the loop.
algorithms = {
    "KMeans": KMeans(random_state=42),
    "Agglomerative - ward linkage": AgglomerativeClustering(linkage="ward"),
    "Agglomerative - single linkage": AgglomerativeClustering(linkage="single"),
    "GMM - spherical covariance": GaussianMixture(covariance_type="spherical", random_state=42),
}

# One table per metric: rows are algorithm names, columns are cluster counts.
silhouette_scores = pd.DataFrame()
calinski_harabasz_scores = pd.DataFrame()
davies_bouldin_scores = pd.DataFrame()
stability_scores = pd.DataFrame()

# Fit on the features only: an earlier cell may have stored labels in a
# "cluster" column, which must not be fed back in as an input feature.
features = transformed_data.drop(columns="cluster", errors="ignore")
indices = list(range(len(features)))

for i in range(2, 13):
    for name, model in algorithms.items():
        print("Doing {} with {} clusters".format(name, i))

        # GaussianMixture names its size parameter differently from the rest.
        if isinstance(model, GaussianMixture):
            model.set_params(n_components=i)
        else:
            model.set_params(n_clusters=i)

        labels = model.fit_predict(features)
        silhouette_scores.loc[name, i] = silhouette_score(features, labels)
        calinski_harabasz_scores.loc[name, i] = calinski_harabasz_score(features, labels)
        davies_bouldin_scores.loc[name, i] = davies_bouldin_score(features, labels)

        # Stability: refit on bootstrap samples and compare the result with
        # the full-data labels of the same rows.
        stability = []
        for j in range(5):
            # Seeding with j keeps the run reproducible and gives every
            # model and every cluster count the same five samples.
            resampled = sklearn.utils.resample(indices, random_state=j)
            # `resampled` holds row positions, hence iloc rather than loc.
            resampled_pred = model.fit_predict(features.iloc[resampled])
            stability.append(normalized_mutual_info_score(labels[resampled], resampled_pred))
        stability_scores.loc[name, i] = np.mean(stability)
        

In [None]:
# Silhouette scores: rows are algorithms, columns are cluster counts.
silhouette_scores

In [None]:
# Same table drawn as a heatmap.
sns.heatmap(silhouette_scores)

In [None]:
# Calinski-Harabasz scores, same layout.
calinski_harabasz_scores

In [None]:
# Same table drawn as a heatmap.
sns.heatmap(calinski_harabasz_scores)

In [None]:
# Davies-Bouldin scores, same layout.
davies_bouldin_scores

In [None]:
# Same table drawn as a heatmap.
sns.heatmap(davies_bouldin_scores)

In [None]:
# Mean normalized mutual information over the bootstrap refits.
stability_scores

In [None]:
# Same table drawn as a heatmap.
sns.heatmap(stability_scores)

In [None]:
# (rows, columns) of the frame used for clustering.
transformed_data.shape

In [None]:
# k-distance plot used to choose DBSCAN's eps.
# NOTE(review): `transformed_data` may already carry the "cluster" column
# added after the first KMeans fit - confirm that is intended here.
minPts = 38
nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=minPts).fit(transformed_data)
distances, indices = nbrs.kneighbors(transformed_data)
# Last returned neighbour distance of every point, largest first.
distanceDec = sorted(distances[:, -1], reverse=True)

fig = plt.figure(figsize=(9, 6))
ax1 = fig.add_subplot()
ax1.plot(list(range(1, len(transformed_data) + 1)), distanceDec)
ax1.set_xlabel('Indeks punktu po sortowaniu')
ax1.set_ylabel('Dystans od 37 najbliższego sąsiada')
ax1.set_xscale('log')
ax1.grid(axis='y')

plt.show()

In [None]:
db = DBSCAN(eps=4.2, min_samples=38)

db_labels = db.fit_predict(transformed_data)

In [None]:
set(db_labels)

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x = tSNE_td[:,0],
                y = tSNE_td[:,1], 
                hue = db_labels,
                alpha=0.5,
                palette=sns.color_palette("Set2", 3), 
                legend=True)
plt.show()

In [None]:
plt.hist(db_labels)

In [None]:
db = DBSCAN(eps=2.2, min_samples=38)

db_labels = db.fit_predict(transformed_data)

In [None]:
set(db_labels)

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x = tSNE_td[:,0],
                y = tSNE_td[:,1], 
                hue = db_labels,
                alpha=0.5,
                palette=sns.color_palette("hls", 14), 
                legend=True)
plt.show()

In [None]:
plt.hist(db_labels)

#### Analiza wybranego modelu

In [None]:
km = KMeans(n_clusters=5, random_state=42)

# Remove any "cluster" column left on the scaled frame by an earlier model
# before fitting; otherwise those old labels would be used as a feature.
transformed_data = transformed_data.drop(columns="cluster", errors="ignore")
labels = km.fit_predict(transformed_data)

# The labels are stored on the raw frame only, which is what the plots read;
# the scaled frame stays feature-only for later fits and distance metrics.
data["cluster"] = labels

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x = tSNE_td[:,0],
                y = tSNE_td[:,1], 
                hue = labels,
                style = data["Revenue"],
                alpha=0.5,
                palette=sns.color_palette("hls", 5), 
                legend=True)
plt.show()

In [None]:
plt.hist(labels)

In [None]:
# Per-cluster sum and row count of every column; only Revenue is shown.
results = data.groupby("cluster").aggregate(['sum', 'count'])
results["Revenue"]

In [None]:
# One boxplot per numeric feature, split by cluster, on a 4x3 grid that is
# filled row by row.
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(14, 14))
for position, feature in enumerate(num_vars):
    sns.boxplot(
        data=data,
        x="cluster",
        y=feature,
        ax=ax.flat[position],
        palette=sns.color_palette("hls", 5),
    )
plt.tight_layout()
plt.show()

In [None]:
sns.countplot(x="VisitorType", hue="cluster", data=data, palette=sns.color_palette("hls", 5))
plt.show()

In [None]:
sns.countplot(x="Revenue", hue="cluster", data=data, palette=sns.color_palette("hls", 5))
plt.show()

In [None]:
from scipy.spatial import distance

def min_interclust_dist(X, label):
    """Return the smallest distance between two points of different clusters.

    Args:
        X: Array-like of points, one row per point.
        label: Array-like with one cluster label per row of ``X``.

    Returns:
        The minimum distance (default metric of
        ``scipy.spatial.distance.cdist``) over all pairs of points that lie
        in different clusters, or ``np.inf`` when fewer than two clusters
        are present.
    """
    X = np.asarray(X)
    # Convert up front: comparing a plain list with ``==`` yields one bool
    # instead of an element-wise mask.
    label = np.asarray(label)
    clusters = np.unique(label)

    global_min_dist = np.inf
    # The minimum over a pair of clusters does not depend on their order, so
    # every unordered pair is evaluated once.
    for position, cluster_i in enumerate(clusters):
        points_i = X[label == cluster_i]
        for cluster_j in clusters[position + 1:]:
            pair_min = distance.cdist(points_i, X[label == cluster_j]).min()
            global_min_dist = min(global_min_dist, pair_min)
    return global_min_dist

def _inclust_mean_dists(X, label):
    """Return the mean pairwise intra-cluster distance of every cluster.

    Args:
        X: Array-like of points, one row per point.
        label: Array-like with one cluster label per row of ``X``.

    Returns:
        A list with one value per distinct label, in sorted label order.
        A cluster with a single point contributes ``0.0``.
    """
    X = np.asarray(X)
    # Convert up front: comparing a plain list with ``==`` yields one bool
    # instead of an element-wise mask.
    label = np.asarray(label)

    inclust_dist_list = []
    for cluster_i in np.unique(label):
        pair_dists = distance.pdist(X[label == cluster_i])
        # A single-point cluster has no pairs; averaging the empty result
        # would give NaN and poison the aggregates built on this list.
        inclust_dist_list.append(pair_dists.mean() if pair_dists.size else 0.0)
    return inclust_dist_list


def mean_inclust_dist(X, label):
    """Return the mean, over clusters, of the mean intra-cluster distance."""
    return np.mean(_inclust_mean_dists(X, label))


def std_dev_of_inclust_dist(X, label):
    """Return the standard deviation, over clusters, of the mean intra-cluster distance."""
    return np.std(_inclust_mean_dists(X, label))

def mean_dist_to_center(X, label):
    """Return the mean, over clusters, of the mean distance to the centroid.

    Args:
        X: Array-like of points, one row per point.
        label: Array-like with one cluster label per row of ``X``.

    Returns:
        For every cluster the distances of its points to the cluster mean
        are averaged; the result is the average of those per-cluster values.
    """
    X = np.asarray(X)
    # Convert up front: comparing a plain list with ``==`` yields one bool
    # instead of an element-wise mask.
    label = np.asarray(label)

    inclust_dist_list = []
    for cluster_i in np.unique(label):
        members = X[label == cluster_i]
        # keepdims keeps the centroid 2-D, as cdist expects.
        center = members.mean(axis=0, keepdims=True)
        inclust_dist_list.append(distance.cdist(members, center).mean())
    return np.mean(inclust_dist_list)

In [None]:
min_interclust_dist(transformed_data.to_numpy(), labels)

In [None]:
mean_inclust_dist(transformed_data.to_numpy(), labels)

In [None]:
std_dev_of_inclust_dist(transformed_data.to_numpy(), labels)

In [None]:
mean_dist_to_center(transformed_data.to_numpy(), labels)

### 9 klastrów - bonus

In [None]:
km = KMeans(n_clusters=9, random_state=42)

# Remove any "cluster" column left on the scaled frame by an earlier model
# before fitting; otherwise those old labels would be used as a feature.
transformed_data = transformed_data.drop(columns="cluster", errors="ignore")
labels = km.fit_predict(transformed_data)

# The labels are stored on the raw frame only, which is what the plots read;
# the scaled frame stays feature-only.
data["cluster"] = labels

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x = tSNE_td[:,0],
                y = tSNE_td[:,1], 
                hue = labels,
                style = data["Revenue"],
                alpha=0.5,
                palette=sns.color_palette("hls", 9), 
                legend=True)
plt.show()

In [None]:
plt.hist(labels)

In [None]:
# Per-cluster sum and row count of every column; only Revenue is shown.
results = data.groupby("cluster").aggregate(['sum', 'count'])
results["Revenue"]

In [None]:
# One boxplot per numeric feature, split by cluster, on a 4x3 grid that is
# filled row by row.
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(14, 14))
for position, feature in enumerate(num_vars):
    sns.boxplot(
        data=data,
        x="cluster",
        y=feature,
        ax=ax.flat[position],
        palette=sns.color_palette("hls", 9),
    )
plt.tight_layout()
plt.show()

In [None]:
sns.countplot(x="VisitorType", hue="cluster", data=data, palette=sns.color_palette("hls", 9))
plt.show()

In [None]:
sns.countplot(x="Revenue", hue="cluster", data=data, palette=sns.color_palette("hls", 9))
plt.show()