# RUN PERFORMANCE PROJECT - Pau Sampietro

## Introductory application of unsupervised learning algorithms (Clustering moves)

_EXPLANATION_

In [1]:
import pandas as pd
import seaborn as sns
import plotly.plotly as py
import matplotlib.pyplot as plt
import cufflinks as cf 
from ipywidgets import interact
import plotly.graph_objs as go
from rpdb import read_table, read_table_sql, export_table

### Importing data from tables stored in the DB

For this part, we import only the relevant, non-redundant columns of the records table (joined with the moves table).

In [None]:
# Custom SELECT: keep only the identifier columns, the four numeric features used
# for clustering and the athlete, joining records to moves on move = idmove.
# heart_rate and pace are qualified with `records.`; the other columns are
# unqualified (presumably athlete comes from moves — confirm against the schema).
Ssql = """SELECT idmove, idrecord, enhanced_altitude, vertical_speed, records.heart_rate, records.pace, athlete
         FROM records
         INNER JOIN moves ON move = idmove"""

# read_table_sql is a project helper imported from rpdb; it is called with the
# table name and the custom SELECT query above.
# NOTE(review): presumably returns a pandas DataFrame (the result is used with
# .head() and column indexing in later cells) — confirm in rpdb.
records = read_table_sql('records', Ssql)

In [None]:
# Preview the first rows to check the imported columns
records.head()

### 1. Data Preprocessing

One problem with the dataset is that the value ranges differ remarkably across features (e.g. heart rate compared to pace and vertical_speed). Because clustering algorithms rely on distances between points, features with larger value ranges would dominate the result. The way to reduce the problem is through **feature scaling**. We'll use this technique again with this dataset.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric columns fed to the clustering models (ids and athlete are left out)
feature_cols = ['enhanced_altitude', 'vertical_speed', 'heart_rate', 'pace']
records_feat = records[feature_cols]

# Standardise the selected features so that they share a comparable scale;
# fit_transform learns the scaling from records_feat and applies it in one step
scaler = StandardScaler()
records_sc = scaler.fit_transform(records_feat)

In [None]:
# Preview the scaled features. fit_transform returns a plain array without
# column names, so we label the columns with the original feature names to keep
# the preview readable.
pd.DataFrame(records_sc, columns=records_feat.columns).head()

### 2. Data Clustering with K-Means algorithm

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Instantiate the model and fit the scaled record features.
# - n_clusters=4 is the number of groups chosen for this analysis.
# - random_state pins the centroid initialisation so that re-running the
#   notebook reproduces the same clusters (K-Means is otherwise stochastic).
# - n_init is set explicitly because its default differs between
#   scikit-learn versions.
km_model = KMeans(n_clusters=4, n_init=10, random_state=42)
km_fit = km_model.fit(X=records_sc)

# We add a column with the labels provided by the algorithm
# (one label per row, in the same row order as records_sc)
records['kmeans_cluster'] = km_model.labels_

In [None]:
# Number of records assigned to each K-Means cluster
records['kmeans_cluster'].value_counts()

### 3. Data Clustering with DBSCAN algorithm

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Instantiate the model and fit the scaled record features.
# eps is the neighbourhood radius (measured on the scaled features) and
# min_samples the number of neighbours required for a point to be a core point.
dbscan_model = DBSCAN(eps=0.5, min_samples=6)
dbscan_model.fit(records_sc)

# We add a column with the labels provided by the algorithm (DBSCAN marks noise
# points with the label -1). The labels belong on `records`, the frame the
# models were built from and the one the later metric cell reads; the previous
# target `pref_df` is not defined anywhere in this notebook.
records['dbscan_cluster'] = dbscan_model.labels_

In [None]:
# Number of records assigned to each DBSCAN cluster (label -1 is noise).
# This cell sits in the DBSCAN section, so it must report the DBSCAN labels
# rather than repeat the K-Means counts; the labels are read straight from the
# fitted model.
pd.Series(dbscan_model.labels_, name='dbscan_cluster').value_counts()

### 4. Cluster visualization and evaluation metrics

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# The silhouette score has to be computed on the same scaled feature matrix the
# models were fitted on. Passing the whole `records` frame would mix in the id
# columns, the athlete, the unscaled features and the cluster label columns
# themselves, which makes the distances meaningless.
#
# The score needs pairwise distances, so its cost grows quadratically with the
# number of rows; we therefore evaluate it on a fixed-seed random subset.
SILHOUETTE_SAMPLE_SIZE = 10_000

km_sc = silhouette_score(records_sc, labels=km_model.labels_,
                         sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=42)
# NOTE: DBSCAN noise points (label -1) are treated here as one more cluster.
db_sc = silhouette_score(records_sc, labels=dbscan_model.labels_,
                         sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=42)

print(f'Score for K-Means Clustering: Silouhette score = {km_sc}')
print(f'Score for DBSCAN Clustering: Silouhette score = {db_sc}')

In [None]:
# Plot a random subset of the records, coloured by K-Means cluster.
# The sample size must be an integer (1e5 is a float) and cannot exceed the
# number of rows when sampling without replacement, so we cap it at
# len(records). The seed keeps the figure reproducible between runs.
N_PLOT_POINTS = 100_000
plot_sample = records.sample(n=min(N_PLOT_POINTS, len(records)), random_state=42)
sns.scatterplot(x='pace', y='vertical_speed', data=plot_sample, hue='kmeans_cluster', palette="coolwarm")