In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the CSV into `data`, the frame every later cell reads from.
# The path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data/online_shoppers_intention.csv")

Ramka danych składa się z następujących kolumn:

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

Zmienne "Administrative", "Administrative_Duration", "Informational", "Informational_Duration", "ProductRelated", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues" oraz "SpecialDay" to zmienne numeryczne.

Natomiast "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend" oraz "Revenue" to zmienne kategoryczne.

In [None]:
# Columns treated as numeric in the plots and correlations below.
num_vars = [
    "Administrative",
    "Administrative_Duration",
    "Informational",
    "Informational_Duration",
    "ProductRelated",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
    "SpecialDay",
]

# Columns treated as categorical (used for the mutual-information heatmap).
cat_vars = [
    "OperatingSystems",
    "Browser",
    "Region",
    "TrafficType",
    "VisitorType",
    "Weekend",
    "Revenue",
]

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Stateless element-wise log(1 + x); log1p is defined at zero, so zero
# entries stay at zero. Later cells reuse `transformer`.
transformer = FunctionTransformer(func=np.log1p)

# One histogram per numeric column, drawn on the log1p scale.
transformer.transform(data[num_vars]).hist(figsize=(21, 14), bins=30)
plt.show()

In [None]:
# Histogram of log1p-transformed PageValues; 'auto' lets numpy pick the bin edges.
n, bins, patches = plt.hist(
    transformer.transform(data[num_vars])["PageValues"],
    bins="auto",
    rwidth=0.85,
    alpha=0.7,
)
plt.title("Page Values")
plt.grid(axis="y", alpha=0.75)
# Cap the y-axis at 1000: taller bars are clipped so the smaller ones stay readable.
plt.ylim(top=1000)
plt.show()

In [None]:
# Histogram of log1p-transformed BounceRates with 30 equal-width bins.
n, bins, patches = plt.hist(
    transformer.transform(data[num_vars])["BounceRates"],
    bins=30,
    rwidth=0.85,
    alpha=0.7,
)
plt.title("Bounce Rates")
plt.grid(axis="y", alpha=0.75)
# Cap the y-axis at 1000: taller bars are clipped so the smaller ones stay readable.
plt.ylim(top=1000)
plt.show()

In [None]:
# Histogram of log1p-transformed Informational; 'auto' lets numpy pick the bin edges.
n, bins, patches = plt.hist(
    transformer.transform(data[num_vars])["Informational"],
    bins="auto",
    rwidth=0.85,
    alpha=0.7,
)
plt.title("Informational")
plt.grid(axis="y", alpha=0.75)
# Cap the y-axis at 1500: taller bars are clipped so the smaller ones stay readable.
plt.ylim(top=1500)
plt.show()

In [None]:
# Histogram of log1p-transformed Informational_Duration; 'auto' lets numpy pick the bin edges.
n, bins, patches = plt.hist(
    transformer.transform(data[num_vars])["Informational_Duration"],
    bins="auto",
    rwidth=0.85,
    alpha=0.7,
)
plt.title("Informational Duration")
plt.grid(axis="y", alpha=0.75)
# Cap the y-axis at 1000: taller bars are clipped so the smaller ones stay readable.
plt.ylim(top=1000)
plt.show()

In [None]:
# Numeric columns plus the Revenue column, which is used only for colouring.
num_vars_with_rev = [*num_vars, "Revenue"]

# Pairwise scatter plots of every numeric column except SpecialDay, coloured by Revenue.
sns.pairplot(data[num_vars_with_rev].drop(columns="SpecialDay"), hue="Revenue")
plt.show()

In [None]:
# Pairwise correlation (pandas default method) between the numeric columns.
corr = data[num_vars].corr()

f, ax = plt.subplots(figsize=(15, 15))
# Diverging palette centred on 0 over the full correlation range [-1, 1].
ax = sns.heatmap(
    corr,
    ax=ax,
    square=True,
    cmap=sns.diverging_palette(20, 220, n=200),
    center=0,
    vmin=-1,
    vmax=1,
)
# Tilt the column labels so long names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.show()

In [None]:
from sklearn.metrics import normalized_mutual_info_score

cat_data = data[cat_vars]

# Normalized mutual information for every ordered pair of categorical columns
# (rows and columns both follow the order of `cat_vars`).
mi = [
    [normalized_mutual_info_score(cat_data[row], cat_data[col]) for col in cat_vars]
    for row in cat_vars
]

midf = pd.DataFrame(mi, columns=cat_vars, index=cat_vars)

f, ax = plt.subplots(figsize=(15, 15))
# NMI lies in [0, 1], so use a sequential colour map over exactly that range;
# a diverging palette on [-1, 1] would leave half of the colour scale unused.
ax = sns.heatmap(
    midf,
    vmin=0, vmax=1,
    cmap="Blues",
    square=True,
    ax=ax,
)
# Tilt the column labels so long names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
plt.show()

In [None]:
# Number of rows per visitor type.
plt.figure(figsize=(10, 7))
sns.countplot(x="VisitorType", data=data)
plt.gca().set(xlabel="Visitor type", ylabel="Number")
plt.show()

In [None]:
# Counts of Revenue == False / True per operating system, computed once and
# reused for both the chart and the table. fill_value=0 keeps a group that
# lacks one of the two outcomes at 0 instead of NaN, so its share below is a
# real number.
percentage = data.groupby("OperatingSystems")["Revenue"].value_counts().unstack(fill_value=0)

plt.figure(figsize=(20,20))
# DataFrame.plot opens its own figure unless it is handed an axes; pass the
# current one so the figsize above applies and no blank figure is left behind.
percentage.plot(kind="bar", ax=plt.gca())
plt.title("Revenue by OS")
plt.show()

# Share of rows with Revenue == True within each operating system.
percentage["percentage"] = percentage[True] / (percentage[True] + percentage[False])
percentage

In [None]:
# Counts of Revenue == False / True per browser, computed once and reused for
# both the chart and the table. fill_value=0 keeps a browser that lacks one of
# the two outcomes at 0 instead of NaN, so its share below is a real number.
percentage = data.groupby("Browser")["Revenue"].value_counts().unstack(fill_value=0)

plt.figure(figsize = (10,10))
# DataFrame.plot opens its own figure unless it is handed an axes; pass the
# current one so the figsize above applies and no blank figure is left behind.
percentage.plot(kind="bar", ax=plt.gca())
plt.title("Revenue by browser")
plt.show()

# Share of rows with Revenue == True within each browser.
percentage["percentage"] = percentage[True] / (percentage[True] + percentage[False])
percentage

In [None]:
# Boolean mask: rows whose Revenue equals True.
istrue = data["Revenue"].eq(True)

# Side-by-side BounceRates histograms sharing one y-axis so counts are comparable.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 9), sharey=True)

ax1.set_title("Users generating revenue")
ax1.hist(data.loc[istrue, "BounceRates"])

ax2.set_title("Users not generating revenue")
ax2.hist(data.loc[~istrue, "BounceRates"])

plt.show()

In [None]:
# BounceRates distribution split by Revenue.
plt.figure(figsize=(12,9))
sns.boxplot(x=data["Revenue"], y=data["BounceRates"])
# Zoom in on the box bodies; points above 0.06 fall outside the visible range.
plt.ylim([-0.001,0.06])
# Render explicitly so the cell does not echo the ylim tuple as its output.
plt.show()

In [None]:
# ProductRelated_Duration distribution split by Revenue.
plt.figure(figsize=(12, 9))
sns.boxplot(data=data, x="Revenue", y="ProductRelated_Duration")
# Zoom in on the box bodies; points above 6000 fall outside the visible range.
plt.ylim(0, 6000)
plt.show()

In [None]:
def count_clustering_scores(X, cluster_num, model, score_fun, **model_kwargs):
    """Score a clustering model for one or several cluster counts.

    Parameters
    ----------
    X
        Data passed unchanged to ``fit_predict`` and to ``score_fun``.
    cluster_num : int or iterable of int
        A single cluster count, or an iterable of counts to evaluate in order.
        Any integral type is accepted as a single count, including NumPy
        integers.
    model : callable
        Called as ``model(n_clusters=k, **model_kwargs)``; the returned object
        must provide ``fit_predict(X)``.
    score_fun : callable
        Called as ``score_fun(X, labels)`` with the labels from ``fit_predict``.
    **model_kwargs
        Extra keyword arguments forwarded to ``model`` (for example a seed).

    Returns
    -------
    The single score when ``cluster_num`` is an integer, otherwise a list of
    scores in the iteration order of ``cluster_num``.
    """
    from numbers import Integral

    # numbers.Integral also matches NumPy integer scalars, which are not `int`.
    single = isinstance(cluster_num, Integral)
    cluster_num_iter = [cluster_num] if single else cluster_num

    scores = [
        score_fun(X, model(n_clusters=k, **model_kwargs).fit_predict(X))
        for k in cluster_num_iter
    ]

    return scores[0] if single else scores

In [None]:
from functools import partial

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

cluster_num_seq = range(2, 20)
# KMeans initialisation is random; binding a fixed random_state makes the
# curve identical on every run. The non-numeric Month and VisitorType columns
# are dropped before clustering.
silhouette_vec = count_clustering_scores(
    data.drop(["Month", "VisitorType"], axis=1),
    cluster_num_seq,
    partial(KMeans, random_state=42),
    silhouette_score,
)
plt.plot(cluster_num_seq, silhouette_vec, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.show()

In [None]:
from functools import partial

from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans

cluster_num_seq = range(2, 20)
# KMeans initialisation is random; binding a fixed random_state makes the
# curve identical on every run. The non-numeric Month and VisitorType columns
# are dropped before clustering.
davies_vec = count_clustering_scores(
    data.drop(["Month", "VisitorType"], axis=1),
    cluster_num_seq,
    partial(KMeans, random_state=42),
    davies_bouldin_score,
)
plt.plot(cluster_num_seq, davies_vec, 'bx-')
plt.xlabel('k')
plt.ylabel('davies-bouldin score')
plt.show()

In [None]:
from sklearn.manifold import TSNE


# t-SNE is stochastic; a fixed random_state keeps the embedding (and every
# scatter plot drawn from it) the same across reruns.
tsne = TSNE(perplexity = 60, random_state=42)

# Two-dimensional embedding of all columns except Month and VisitorType.
X_tsne = pd.DataFrame(tsne.fit_transform(data.drop(["Month", "VisitorType"], axis=1)), columns=["tsne1", "tsne2"])

X_tsne.head()

In [None]:
# Fixed random_state so the cluster assignment is the same on every run.
km = KMeans(n_clusters=11, random_state=42)

km.fit(data.drop(["Month", "VisitorType"], axis=1))

predictions = km.labels_

# Colour the t-SNE embedding by cluster label; the palette size is taken from
# the model so it cannot drift from the number of clusters.
sns.scatterplot(x="tsne1", y="tsne2", data=X_tsne, hue=predictions, palette=sns.color_palette("muted", n_colors=km.n_clusters))
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 11 groups on every column except Month and VisitorType.
agg = AgglomerativeClustering(n_clusters=11)
predictions = agg.fit_predict(data.drop(columns=["Month", "VisitorType"]))

# Colour the t-SNE embedding by the cluster labels.
sns.scatterplot(
    data=X_tsne,
    x="tsne1",
    y="tsne2",
    hue=predictions,
    palette=sns.color_palette("muted", n_colors=11),
)