In [None]:
import pandas as pd
import sys
import numpy as np
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import warnings
from numba.errors import NumbaPerformanceWarning
from mpl_toolkits import mplot3d

# 0. Load Data

In [None]:
# Read the cleansed Android dataset (one row per Marketing_Cloud_Visitor_ID)
# and replace every missing value with 0.
df = pd.read_csv(
    'Data/202001_android_data_cleansed.csv',
    index_col='Marketing_Cloud_Visitor_ID',
).fillna(0)
# fillna does not change the shape, so reporting it after the fill is equivalent.
print('Dataset shape', df.shape, sep='\n')
df.head(2)

In [None]:
# Distribution of users per number of non-zero actions.
# The helper column 'non_zero' is excluded from the count so that re-running
# this cell does not treat a previously computed 'non_zero' as an extra action
# (which would inflate every count on each re-run).
action_columns = df.columns.drop('non_zero', errors='ignore')
df['non_zero'] = df[action_columns].ne(0).sum(axis=1)
plt.figure(figsize=(5, 5))
plt.title('Distribution of users per Engagement with respect to the selected actions \n (after removing users with no action)')
hist = df['non_zero'].hist(bins=120)
plt.show()

# 1. UMAP

In [None]:
# UMAP hyper-parameters.
n_neighbors = 600 #the smaller, the finer grain clusters (subject to noise)
n_epochs = 5000
min_dist = 0.0 #the smaller, the cleaner separation between clusters
n_components = 3 #dimension of the reduced space
metric = 'euclidean'
seed = 42
# Pass every parameter declared above: previously `min_dist` and `metric` were
# defined but never handed to UMAP, and the seed was only applied to
# `transform_seed` (used by transform operations), leaving the fit itself unseeded.
# `random_state` seeds the fit so the embedding is reproducible across runs.
reducer = umap.UMAP(
    n_neighbors=n_neighbors,
    n_components=n_components,
    metric=metric,
    n_epochs=n_epochs,
    min_dist=min_dist,
    random_state=seed,
    transform_seed=seed,
)

In [None]:
# silence NumbaPerformanceWarning
#warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
%time embedding = reducer.fit_transform(df_aux)

In [None]:
# final dataset
# Column names are derived from the embedding width so that changing
# `n_components` does not raise a shape mismatch here; with 3 components this
# yields 'umap_dim_1', 'umap_dim_2', 'umap_dim_3' exactly as before.
compressed_data = pd.DataFrame(
    data=embedding,
    index=list(df_aux.index),
    columns=['umap_dim_{}'.format(i + 1) for i in range(embedding.shape[1])],
)

# 2. Visualization

In [None]:
# Load a previously saved embedding; the unnamed first CSV column is exposed as 'ID'.
compressed_data = (
    pd.read_csv('Data/202001_data_compressed1.csv')
    .rename(columns={'Unnamed: 0': 'ID'})
)

In [None]:
# Stacked step histograms of the embedding dimensions (one outline per dimension).
n_bins = 10
x = compressed_data.drop(columns=['ID']).transpose()
colors = ['red', 'blue', 'lime']
# Keep an explicit figure handle: the original called `fig.tight_layout()` on a
# name that was never defined, which raised NameError.
fig, ax = plt.subplots(figsize=(5, 5))
# Hand matplotlib one 1-D array per dimension so each is unambiguously a
# separate dataset, independent of how 2-D / DataFrame input is interpreted.
per_dimension = [x.loc[dim].to_numpy() for dim in x.index]
ax.hist(per_dimension, n_bins, histtype='step', stacked=True, fill=False,
        color=colors, label=list(x.index))
ax.set_title('Distribution of 3 dimensions')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# 3-D scatter of the embedding with axes ordered (dim 3, dim 2, dim 1).
# alpha is very low, so individual points are nearly transparent.
ax = plt.figure(figsize=(12, 12)).add_subplot(111, projection='3d')
ax.scatter(
    compressed_data['umap_dim_3'],
    compressed_data['umap_dim_2'],
    compressed_data['umap_dim_1'],
    c='lime',
    s=15,
    alpha=0.002,
)
ax.set_title('Users compressed on a 3D space')
plt.show()

In [None]:
# Same embedding drawn with axes ordered (dim 2, dim 1, dim 3), in blue with
# slightly larger markers; alpha is very low, so points are nearly transparent.
ax = plt.figure(figsize=(12, 12)).add_subplot(111, projection='3d')
ax.scatter(
    compressed_data['umap_dim_2'],
    compressed_data['umap_dim_1'],
    compressed_data['umap_dim_3'],
    c='blue',
    s=20,
    alpha=0.002,
)
plt.show()