# **This file is used for clustering of existing cluster data** _(i.e. "secondary clustering")_


In [None]:
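#@markdown **Contents**: Secondary clustering - joins a game's existing per-category cluster labels (Actions, Feedback, Achievements) into one table and explores clustering of those labels. <br>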
#@markdown ***
#@markdown Major Edit History: 
#@markdown - Luke Swanson, 2/20/2020: Created based on code in other clustering files.
#@markdown (`Lakeland Clustering Hypothesis #4`, `ClusterStats`).
#@markdown ***
#@markdown <br> 
#@markdown Please change `FIELDDAY_DIR` if it is located differently in your drive. This cell will error if `FIELDDAY_DIR` is incorrect.
import json
import urllib.request
from io import BytesIO
from math import ceil
from math import pi
from zipfile import ZipFile

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to the project's Jupyter folder.
# os.chdir raises if FIELDDAY_DIR (and therefore JUPYTER_DIR) does not exist,
# which is the intended "this cell will error" check mentioned in the form text.
import os
FIELDDAY_DIR = '/content/drive/My Drive/Field Day' #@param {type:"string"}
JUPYTER_DIR = os.path.join(FIELDDAY_DIR,'Research and Writing Projects/2020 CHI Play - Lakeland Clustering/Jupyter')
os.chdir(JUPYTER_DIR)
print(f'---\nCWD: {os.getcwd()}')

#@markdown Change pandas `max_rows` and `max_columns`
pd.options.display.max_columns = 100 #@param {type:"integer"}
pd.options.display.max_rows = 60 #@param {type:"integer"}

#@markdown *Note: There may be other variables to manually change. Look at the "Set Variables" section.*

# Import the project-local helper modules. The working directory was changed
# to JUPYTER_DIR above, so adding '.' to sys.path makes `utils` and the
# `Notebooks.Clustering` package importable.
import sys
sys.path.append('.')
import utils
#import cluster_workflow
#from Notebooks.Clustering.cluster_workflow import Workflow
import Notebooks.Clustering.cluster_utils as cu
import Notebooks.Clustering.cluster_workflow as cw
import importlib
# Reload so edits made to the helper modules are picked up without restarting
# the kernel.
importlib.reload(cu)
importlib.reload(cw)

# Shared Workflow instance used by the exploration cells below.
workflow = cw.Workflow()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
---
CWD: /content/drive/My Drive/Field Day/Research and Writing Projects/2020 CHI Play - Lakeland Clustering/Jupyter


In [None]:
# Let the user pick which game's results folder to analyse. The chosen folder
# determines `file_path`, the location of that game's `best.txt`.
game_path = f"{JUPYTER_DIR}/Results/"
game_folders = [fdir for fdir in os.listdir(game_path) if os.path.isdir(os.path.join(game_path, fdir))]
if not game_folders:
    # Fail with an explicit message instead of an IndexError on game_folders[0].
    raise FileNotFoundError(f"No game folders found in {game_path}")

game_folder_selector = widgets.Select(
    options = game_folders,
    value = game_folders[0],
    description = "game Folder",
    layout = widgets.Layout(width='60%')
)


def best_file_path(game_folder):
    """Return the path of the `best.txt` file inside the given game folder."""
    return game_path + game_folder + "/best.txt"


file_path = best_file_path(game_folder_selector.value)
print(f"File path: {file_path}")


def updateGameFolder(change):
    """Widget observer: point the global `file_path` at the newly selected game.

    `change` is the change notification passed by the widget; it is unused
    because the current selection is read from the selector itself.
    """
    # Without `global`, the assignment would only create a local variable and
    # later cells would keep reading the initially selected game's best.txt.
    global file_path
    file_path = best_file_path(game_folder_selector.value)
    print(f"File path: {file_path}")


game_folder_selector.observe(updateGameFolder, names="value")

display(game_folder_selector)

File path: /content/drive/My Drive/Field Day/Research and Writing Projects/2020 CHI Play - Lakeland Clustering/Jupyter/Results/Lakeland/best.txt


Select(description='game Folder', layout=Layout(width='60%'), options=('Lakeland', 'Waves', 'Crystal'), value=…

In [None]:
categories = []
# Map integer indices 0..9 to the colours of matplotlib's 'tab10' colormap.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# matplotlib 3.7 and removed in 3.9.
colors = dict(enumerate(plt.get_cmap('tab10').colors))

In [None]:
# best.txt holds a JSON object mapping each cluster category to the sub-folder
# (relative to the game folder) that contains its clusters.csv.
with open(file_path) as best_file:
    text = best_file.read()
print(f"text: {text}")
paths = json.loads(text)

full_file_paths = {subpath: f"{game_path}{game_folder_selector.value}/{paths[subpath]}/clusters.csv" for subpath in paths.keys()}

# Build one table with a `Cluster_<category>` column per category, indexed by
# (sessID, num_play). Inner joins keep only the rows present in every category.
df = None
for category in ("Actions", "Feedback", "Achievements"):
    clusters = pd.read_csv(full_file_paths[category], index_col=["sessID", "num_play"])
    category_labels = clusters[["label"]].rename(columns={"label": f"Cluster_{category}"})
    df = category_labels if df is None else df.join(category_labels, how="inner")

print(f"df columns: {df.columns}")
print(df.shape)

df.to_csv(path_or_buf=f"{game_path}{game_folder_selector.value}/composite_clusters.csv")

# Last expression of the cell so the preview is actually rendered.
df.head()

text: {
"Actions": "player_lvl0_logtransform/z3pca2k6",
"Feedback": "feedback_lv01_logtransform/z3pca2k7",
"Achievements": "achs_achs_per_sess_second_sessDur_logtransform/z3pca2k6"
}

df columns: Index(['Cluster_Actions', 'Cluster_Feedback', 'Cluster_Achievements'], dtype='object')
(5510, 3)


## Filtering
- Removes debug sessions (those that used SPYPARTY, `debug` = 1) whenever the data has a `debug` column.
- **TODO**: list any additional filters here.

In [None]:
# Keep only the rows whose `debug` value is below 1.
# Nothing happens when the frame has no `debug` column.
has_debug_column = 'debug' in df.columns
if has_debug_column:
    df = df.loc[df['debug'] < 1]

## Choose and Explore Features



In [None]:
# modify/add/remove features as desired

In [None]:
# Show the column names that are available as features.
df.columns.tolist()

In [None]:
# Cluster-label columns to explore. They must be columns of the composite
# `df`, whose columns are Cluster_Actions, Cluster_Feedback and
# Cluster_Achievements. The previous entries were CSV file names from an
# earlier data layout and made the `.loc` selection raise KeyError.
feature_names = [
  'Cluster_Feedback',
  'Cluster_Achievements'
]
df2 = df.loc[:,feature_names].fillna(0)
df2

In [None]:
# Summary statistics for each selected column.
df2.describe()

In [None]:
# Histogram of the selected columns via the project's Workflow helper.
# NOTE(review): num_bins=4 presumably matched an earlier run with four
# clusters per category - confirm it still suits the current cluster counts.
workflow.Histogram(df2, num_bins=4, log_scale=False)

In [None]:
# Combine the two selected cluster labels of each row into a single code:
#   10 * <first feature's label> + <second feature's label>
# (the code identifies the pair uniquely as long as the second label is < 10).
# The column names are taken from `feature_names` rather than repeated here,
# so this cell cannot drift out of sync with the feature-selection cell.
first_feature, second_feature = feature_names[:2]
df2["concat"] = 10*df2[first_feature] + df2[second_feature]
df2

In [None]:
# Histogram of the combined label code on its own, passed as a one-column frame.
concat_frame = df2["concat"].to_frame()
workflow.Histogram(concat_frame, num_bins=33, log_scale=False)

In [None]:
# Run the selected columns through the Workflow helper's normalisation and
# "Robust" scaling steps; the second return value of each step is not needed.
normed,dummy = workflow.Normalized(df2)
scaled_normed,dummy = workflow.Scaled(df=normed, scaling_method="Robust")

# Feature matrix consumed by the PCA / K-means / DBSCAN cells further down.
# Those cells index it as `X[:, 0]`, so it has to be a plain NumPy array.
# NOTE(review): `X` was never defined anywhere in this notebook (it only
# worked through leftover kernel state); the scaled frame is assumed to be the
# intended input - confirm before relying on the clustering results.
X = np.asarray(scaled_normed)

workflow.Histogram(scaled_normed, num_bins=4, log_scale=False)

In [None]:
# Correlation view of the selected columns (including "concat") via the
# Workflow helper.
# NOTE(review): heat_range is presumably the colour-scale limit - confirm
# against Workflow.Correlations.
workflow.Correlations(df=df2, heat_range=0.5)

## Explore Eigenvalues  

In [None]:
# Scree plot of df2 via the project's Workflow helper.
# The commented-out lines below are an older hand-written, SVD-based scree
# plot kept for reference only; they are not executed and refer to an `X`
# that this cell does not define.
# import numpy as np
# U,S,V = np.linalg.svd(X)
# eigvals = S**2 / np.sum(S**2)
# fig = plt.figure(figsize=(8,5))
# sing_vals = np.arange(X.shape[1]) + 1
# plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
# plt.title('Scree Plot')
# plt.xlabel('Principal Component')
# plt.ylabel('Eigenvalue')
workflow.Scree(df2)

## Explore K-Means Error

In [None]:
# Elbow analysis: plot the K-means sum of squared distances (inertia) against
# the number of clusters, first on the unprojected data and then on PCA
# projections with 2 and 1 components.
KMEANS_SEED = 0  # fixed seed so re-running the notebook reproduces the curves


def plot_kmeans_elbow(data, k_values, title, xlabel):
    """Fit K-means once per k and plot the resulting inertia against k.

    Args:
        data: array-like of shape (n_samples, n_features) to cluster.
        k_values: iterable of cluster counts to try.
        title: figure title.
        xlabel: label of the x-axis.

    Returns:
        The list of inertia values, in the order of `k_values`.
    """
    k_values = list(k_values)
    sse = [
        KMeans(n_clusters=k, random_state=KMEANS_SEED).fit(data).inertia_
        for k in k_values
    ]
    plt.figure(figsize=(6, 6))
    plt.plot(k_values, sse, '-o')
    plt.suptitle(title, fontsize=16, fontweight='semibold', y=1.05)
    plt.xlabel(xlabel)
    plt.ylabel('Sum of squared distance')
    return sse


# No dimensionality reduction.
plot_kmeans_elbow(X, range(1, 10), 'K-means with no PCA', 'Number of clusters (K)')

# Same analysis after projecting onto the leading principal components.
for components in [2, 1]:
    projected = PCA(components).fit_transform(X)
    plot_kmeans_elbow(
        projected,
        range(1, 15),
        f'K-means using PCA with {components} components',
        'Number of K-Means clusters',
    )


In [None]:
from sklearn.metrics import silhouette_samples
from sklearn.decomposition import FactorAnalysis

# Silhouette analysis of K-means for k = 2..7. For each k the left panel shows
# the per-sample silhouette values grouped by cluster and the right panel shows
# the first two coordinates of the clustered data with the centroids.
components = 2
# The original cell fitted a PCA projection and then immediately overwrote it
# with the unprojected X, so the clustering always ran on X while the titles
# still claimed "PCA". The switch below makes that choice explicit;
# False reproduces what the cell actually computed.
use_pca = False
projected = PCA(components).fit_transform(X) if use_pca else X
#projected = FactorAnalysis(n_components=components, random_state=0).fit_transform(X)
space_name = f'PCA = {components}' if use_pca else 'no PCA'
axis_name = 'PCA' if use_pca else 'Feature'

color_list = ['r','g','b','c','m','y','k']
for k in [2, 3, 4, 5, 6, 7]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Run the Kmeans algorithm (seeded so the figures are reproducible)
    km = KMeans(n_clusters=k, random_state=0)
    labels = km.fit_predict(projected)
    centroids = km.cluster_centers_

    # Get silhouette samples
    silhouette_vals = silhouette_samples(projected, labels)

    # Silhouette plot: one horizontal band of sorted values per cluster.
    # Colours are indexed by the cluster label so they match the scatter plot.
    y_lower = 0
    for cluster in np.unique(labels):
        cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster])
        y_upper = y_lower + len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, color=color_list[cluster], edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(cluster + 1))
        y_lower = y_upper

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);

    # Scatter plot of data colored with labels. Both axes come from the same
    # (clustered) matrix, and the centroids are drawn last so the data points
    # do not hide them.
    ax2.scatter(projected[:, 0], projected[:, 1], c=[color_list[label] for label in labels])
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c=color_list[:k], edgecolors='k', s=250)
    ax2.set_xlim([-2, 2])
    ax2.set_ylim([-2, 2])
    ax2.set_xlabel(f'{axis_name} 1')
    ax2.set_ylabel(f'{axis_name} 2')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')

    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis {space_name} and k-Means = {k}: Avg. silhouette = {avg_score:.2f}',
                 fontsize=16, fontweight='semibold', y=1.05);

In [None]:
from sklearn.metrics import silhouette_samples

# Silhouette analysis of DBSCAN run on a 3-component PCA projection of X.
components = 3
#projected = X
projected = PCA(components).fit_transform(X)

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

# Run the DBSCAN algorithm
db = DBSCAN(eps=0.3, min_samples=10)
labels = db.fit_predict(projected)

# Get silhouette samples.
# Every distinct value in `labels` is treated as its own group here;
# silhouette_samples raises if there are fewer than two distinct labels.
silhouette_vals = silhouette_samples(projected, labels)

# Silhouette plot: one horizontal band of sorted values per label,
# numbered 1, 2, ... in the order of np.unique(labels).
y_lower = 0
for i, cluster in enumerate(np.unique(labels)):
    cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster])
    y_upper = y_lower + len(cluster_silhouette_vals)
    ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
    ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
    y_lower = y_upper

# Get the average silhouette score and plot it
avg_score = np.mean(silhouette_vals)
ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
ax1.set_yticks([])
ax1.set_xlim([-0.1, 1])
ax1.set_xlabel('Silhouette coefficient values')
ax1.set_ylabel('Cluster labels')
ax1.set_title('Silhouette plot for the various clusters', y=1.02);

# Scatter plot of data colored with labels. Both axes are taken from the PCA
# projection; previously the x-axis used the raw X while the y-axis used the
# projection, and the y-limits were never set (set_xlim was called twice).
ax2.scatter(projected[:, 0], projected[:, 1], c=labels)
#ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
ax2.set_xlim([-2, 2])
ax2.set_ylim([-2, 2])
ax2.set_xlabel('PCA 1')
ax2.set_ylabel('PCA 2')
ax2.set_title('Visualization of clustered data', y=1.02)
ax2.set_aspect('equal')

plt.tight_layout()
plt.suptitle(f'Silhouette analysis PCA = {components} and DBSCAN',
              fontsize=16, fontweight='semibold', y=1.05);


### Plot the PCA and K-Means

In [None]:
# Kept: older matplotlib releases need this import to enable projection='3d'.
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of the 3-component PCA projection of X, coloured by the labels
# of a 4-cluster K-means fitted on that projection.
plt.figure(figsize=(20,20))
ax = plt.axes(projection='3d')
pca = PCA(n_components=3)
projected = pca.fit_transform(X)
# Seeded so the cluster assignment (and colouring) is reproducible.
kmeans = KMeans(n_clusters=4, random_state=0).fit(projected)
intent_labels = kmeans.labels_
ax.scatter3D(projected[:,0], projected[:,1], projected[:,2], c=intent_labels)
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_zlabel('PCA 3')
ax.set_title('K-Means (k = 4) on a 3-component PCA projection');

Begin using Factor Analysis

In [None]:
from sklearn.decomposition import FactorAnalysis
# Kept: older matplotlib releases need this import to enable projection='3d'.
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of a 3-factor Factor Analysis transform of X, coloured by the
# labels of a 4-cluster K-means fitted on the transformed data.
plt.figure(figsize=(20,20))
ax = plt.axes(projection='3d')
transformer = FactorAnalysis(n_components=3, random_state=0)
projected = transformer.fit_transform(X)
# Seeded like the FactorAnalysis step so the whole cell is reproducible.
kmeans = KMeans(n_clusters=4, random_state=0).fit(projected)
intent_labels = kmeans.labels_
ax.scatter3D(projected[:,0], projected[:,1], projected[:,2], c=intent_labels)
ax.set_xlabel('Factor 1')
ax.set_ylabel('Factor 2')
ax.set_zlabel('Factor 3')
ax.set_title('K-Means (k = 4) on a 3-factor Factor Analysis transform');