In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import  norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
# Use plotly's "notebook" renderer as the default for figures.
pio.renderers.default = "notebook" 


import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, so any
# pandas / scikit-learn warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")
# Default seaborn colour palette for plots.
sns.set_palette('RdBu')

'''Miscellaneous'''
from pandas_profiling import ProfileReport

'''Seeds'''
import random
# Seed both Python's and NumPy's global random number generators.
random.seed(10)
np.random.seed(11)


# Data Overview

In [2]:
# Load the cleaned, outlier-free dataset into the working frame.
df = pd.read_csv(filepath_or_buffer='data_clean_noOutliers.csv')

In [3]:
# Preview the first eight rows of the loaded data.
df.head(n=8)

Unnamed: 0,name,5k,10k,20k,half,25k,30k,35k,40k,official,pace,rank_overall,rank_in_gender_only,rank_in_division,age,gender,country,state,city,bib
0,"Yamamoto, Hiroyuki",8.02,17.37,37.65,39.72,47.67,59.18,71.4,80.43,85.25,3.27,8,8,8,47,M,JPN,others,Fukuoka,W1
1,"Jeptoo, Rita",16.22,32.58,65.83,69.47,82.43,99.33,116.37,132.1,138.95,5.3,21,1,1,33,F,KEN,others,Eldoret,F1
2,"Van Dyk, Ernst F.",7.75,16.62,36.1,38.03,45.8,56.45,67.42,76.1,80.6,3.08,1,1,1,41,M,RSA,others,Paarl,W2
3,"Dibaba, Mare",16.2,32.57,65.83,69.47,82.43,99.33,116.37,132.95,140.58,5.37,27,3,3,24,F,ETH,others,Shoa,F2
4,"Hokinoue, Kota",8.02,17.12,36.58,38.6,46.37,57.03,67.83,76.72,81.23,3.1,2,2,2,40,M,JPN,others,Nogata Fukuoka,W3
5,"Sumgong, Jemima Jelagat",16.22,32.58,65.83,69.47,82.45,99.33,116.37,132.95,140.68,5.37,28,4,4,29,F,KEN,others,Nandi,F3
6,"Hug, Marcel E.",8.38,17.65,37.65,39.72,47.67,58.6,70.23,79.83,84.65,3.23,4,4,4,28,M,SUI,others,Neuenkirch,W4
7,"Geneti, Markos",15.17,30.48,61.62,64.85,76.95,92.52,107.47,122.8,129.83,4.97,5,5,5,29,M,ETH,others,Addis Ababa,5


In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

name                    object
5k                     float64
10k                    float64
20k                    float64
half                   float64
25k                    float64
30k                    float64
35k                    float64
40k                    float64
official               float64
pace                   float64
rank_overall             int64
rank_in_gender_only      int64
rank_in_division         int64
age                      int64
gender                  object
country                 object
state                   object
city                    object
bib                     object
dtype: object

# Remove Columns

## Drop 'name' & 'bib'

These columns only identify individual runners. In this dataset each runner is already identified by its row, so we drop them.

In [5]:
# 'name' and 'bib' only identify runners, so they are removed.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run once the columns are already gone; the in-place
# version raised KeyError on a second execution.
df = df.drop(columns=['name', 'bib'], errors='ignore')

## Drop 'city'

In [6]:
# Remove the 'city' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run once the column is already gone; the in-place
# version raised KeyError on a second execution.
df = df.drop(columns=['city'], errors='ignore')

# Feature Selection

## Features that Specify Individuals

* official
* age
* gender


In [7]:
# Keep only the three features used to describe each runner.
# .copy() makes `df` an independent frame rather than a selection of the
# loaded data, so the column assignment in the scaling step writes to `df`
# itself and does not trigger pandas' SettingWithCopyWarning.
df = df[['official', 'age', 'gender']].copy()

# Scaling of Numericals

Clustering algorithms are distance-based, so differences in units strongly affect their performance. We therefore scale the numerical columns to the range 0 to 1.

In [8]:
# Numerical columns to rescale; each is mapped to the [0, 1] range.
to_scale = ['official', 'age']

# Fit the min-max scaler on those columns and compute their scaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[to_scale])

# Write the scaled values back over the original columns.
df[to_scale] = scaled_values

In [9]:
# Preview the first five rows after scaling.
df.head(n=5)

Unnamed: 0,official,age,gender
0,0.012196,0.460317,M
1,0.153041,0.238095,F
2,0.0,0.365079,M
3,0.157316,0.095238,F
4,0.001652,0.349206,M


# One Hot Encoding

This one-hot encoded version of the dataset is intended for K-means and DBSCAN clustering.

In [10]:
# Categorical columns to expand into indicator columns.
# 'city' (around 6000 unique values) was deliberately left out of the
# encoding because that many indicator columns could damage the clustering
# model; it has already been removed from `df` by this point.
to_onehot = ['gender']

# dtype is pinned so the indicator columns are 0/1 integers on every pandas
# version (the default changed from uint8 to bool in pandas 2.0, which would
# write True/False into the saved CSV instead of 0/1).
df_onehot = pd.get_dummies(df, columns=to_onehot, dtype=np.uint8)

# Saving to .csv

### Version 1: Non-onehot for KPrototypes

In [11]:
# Preview the first eight rows of the non-one-hot frame before saving it.
df.head(n=8)

Unnamed: 0,official,age,gender
0,0.012196,0.460317,M
1,0.153041,0.238095,F
2,0.0,0.365079,M
3,0.157316,0.095238,F
4,0.001652,0.349206,M
5,0.157579,0.174603,F
6,0.010622,0.15873,M
7,0.129121,0.174603,M


In [12]:
# Save the non-one-hot frame, leaving the row index out of the file.
df.to_csv(path_or_buf='data_Preprocessed_noneOnehot.csv', index=False)

### Version 2: Onehot Encoding for K-means and DBSCAN Clustering

In [13]:
# Preview the first five rows of the one-hot encoded frame.
df_onehot.head(n=5)

Unnamed: 0,official,age,gender_F,gender_M
0,0.012196,0.460317,0,1
1,0.153041,0.238095,1,0
2,0.0,0.365079,0,1
3,0.157316,0.095238,1,0
4,0.001652,0.349206,0,1


In [14]:
# Save the one-hot encoded frame, leaving the row index out of the file.
df_onehot.to_csv(path_or_buf='data_Preprocessed_Onehot.csv', index=False)