In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS, cross_origin
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mainfold import MDS
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'sklearn.mainfold'

In [2]:
data = pd.read_csv('../data/basketball_data.csv')
# removing the non-numerical variables from the dataframe
df = data.drop('team', axis=1)
# forming a dictionary to map each column to their index
all_columns = {}
for index, item in enumerate(df.columns):
    all_columns[item] = index

In [3]:
df

Unnamed: 0,# of games played,# of games won,offensive_efficiency,defensive_efficiency,power_rating,eff_goal_per_shot,eff_goal_per_allowed,turnover_per_allowed,turnover_per_commited,offensive_rebound_rate,offensive_rebound_allowed,free_throw_rate,free_throw_allowed,two_point_shooting,two_point_allowed,three_point,three_point_allowed,adjusted_tempo,wins_above_bubble
0,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,40.7,30.0,32.3,30.4,53.9,44.6,32.7,36.2,71.7,8.6
1,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,32.1,23.7,36.2,22.4,54.8,44.7,36.5,37.5,59.3,11.3
2,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,25.5,24.9,30.7,30.0,54.7,46.8,35.2,33.2,65.9,6.9
3,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,27.4,28.7,32.9,36.6,52.8,41.9,36.5,29.7,67.5,7.0
4,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,30.0,26.2,39.0,26.9,56.3,40.0,38.2,29.0,71.5,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1752,35,22,111.2,94.7,0.8640,51.4,46.9,19.2,15.3,33.9,27.3,32.0,27.6,52.5,45.7,32.9,32.6,70.3,1.9
1753,35,28,117.9,96.6,0.9081,51.2,49.9,17.9,20.1,36.7,30.8,37.1,33.1,52.9,49.4,31.9,33.7,71.2,7.3
1754,36,31,122.8,95.2,0.9488,55.3,48.1,15.8,18.0,31.6,30.2,33.3,34.9,55.4,44.7,36.7,35.4,68.8,9.9
1755,35,27,117.4,94.5,0.9238,55.2,44.8,17.1,15.1,32.1,26.0,34.4,28.1,54.3,44.4,37.8,30.3,68.2,2.1


In [4]:
all_columns

{'# of games played': 0,
 '# of games won': 1,
 'offensive_efficiency': 2,
 'defensive_efficiency': 3,
 'power_rating': 4,
 'eff_goal_per_shot': 5,
 'eff_goal_per_allowed': 6,
 'turnover_per_allowed': 7,
 'turnover_per_commited': 8,
 'offensive_rebound_rate': 9,
 'offensive_rebound_allowed': 10,
 'free_throw_rate': 11,
 'free_throw_allowed': 12,
 'two_point_shooting': 13,
 'two_point_allowed': 14,
 'three_point': 15,
 'three_point_allowed': 16,
 'adjusted_tempo': 17,
 'wins_above_bubble': 18}

In [5]:
# StandardScaler follows normal distribution so mean=0 & variance=1
# stdscore of sample x: z=(x-u)/s, where u is mean and s is standard deviation
scaler = StandardScaler()
# Compute the mean and std to be used for later scaling.
scaler.fit(df)
# Perform standardization by centering and scaling
scaled_data = scaler.transform(df)
scaled_data

array([[ 3.25776096,  2.51946228,  2.70544705, ...,  0.61425725,
         1.00606322,  2.35262669],
       [ 3.25776096,  2.97791797,  3.49965254, ...,  1.16299981,
        -2.79996183,  2.73907495],
       [ 3.25776096,  2.51946228,  1.48675242, ..., -0.65207174,
        -0.7741743 ,  2.10930742],
       ...,
       [ 1.72052825,  2.21382516,  2.63698106, ...,  0.27656952,
         0.11594446,  2.53869437],
       [ 1.33622007,  1.60255092,  1.89754836, ..., -1.87618976,
        -0.06821804,  1.42228829],
       [ 2.10483643,  2.36664372,  1.87016197, ...,  0.10772565,
         0.63773822,  1.82304945]])

In [6]:
number_of_components = len(list(df))
pca = PCA(n_components=number_of_components)
principal_components = pca.fit_transform(df)
principal_components

array([[ 3.33046945e+01, -8.05370500e-01,  4.38743017e+00, ...,
        -3.36931513e-01, -5.12797073e-02, -1.26444803e-01],
       [ 4.17207660e+01, -7.81696283e+00,  1.24622831e-01, ...,
         2.00783128e-01,  8.37848966e-01, -2.35104384e-01],
       [ 2.96434203e+01, -2.55275446e+00, -5.64591950e+00, ...,
         1.64288175e-01,  1.09704637e-01, -5.93303349e-02],
       ...,
       [ 3.15735816e+01, -1.96518115e+00,  5.88581381e+00, ...,
         1.03895508e-01, -3.52528034e-01, -1.21957450e-01],
       [ 2.58109679e+01, -3.18839013e+00, -1.91918445e+00, ...,
         5.24028083e-02, -5.38347572e-02, -3.91793396e-02],
       [ 2.97099063e+01, -5.15557747e+00,  9.79710662e-02, ...,
         5.53175705e-02,  2.54344860e-02, -3.17043621e-02]])

In [8]:
pca1, pca2 = [], []

for i in range(principal_components.shape[0]):
    pca1.append(principal_components[i][0])
    pca2.append(principal_components[i][1])

df_pca_attributes = pd.DataFrame(
    list(zip(pca.components_[0], pca.components_[1])), 
    columns=['PCA1 Attribute', 'PCA2 Attribute']
)
df_pca = pd.DataFrame(list(zip(pca1, pca2)), columns=['PCA1', 'PCA2'])
df_pca

Unnamed: 0,PCA1,PCA2
0,33.304694,-0.805371
1,41.720766,-7.816963
2,29.643420,-2.552754
3,30.763463,7.526975
4,38.993446,1.891494
...,...,...
1752,18.363415,-1.146462
1753,24.444665,1.635463
1754,31.573582,-1.965181
1755,25.810968,-3.188390


In [11]:
minmax = MinMaxScaler(feature_range=(-1, 1))
df_pca = minmax.fit_transform(df_pca)
df_pca_attributes = minmax.fit_transform(df_pca_attributes)
print(df_pca)

print(df_pca_attributes)

[[ 0.76970746 -0.07226263]
 [ 0.97479154 -0.38121189]
 [ 0.68048898 -0.14925698]
 ...
 [ 0.72752344 -0.12336695]
 [ 0.58709922 -0.17726477]
 [ 0.68210913 -0.26394423]]
[[ 0.17508925 -0.1015946 ]
 [ 0.87646607 -0.09691466]
 [ 0.93991042 -0.73080411]
 [-1.         -1.        ]
 [-0.09644538 -0.15567394]
 [ 0.19810233 -0.69104343]
 [-0.44352399 -0.61367979]
 [-0.30092727  0.05472936]
 [-0.12727613  0.1285052 ]
 [ 0.0753711   0.46442191]
 [-0.32591995 -0.04289683]
 [-0.04020303  0.58646696]
 [-0.54568383  1.        ]
 [ 0.20840926 -0.64483279]
 [-0.46058588 -0.65104801]
 [ 0.07179746 -0.54450478]
 [-0.31841299 -0.40814194]
 [-0.1675865  -0.43247618]
 [ 1.          0.01655362]]


In [12]:
variance = pca.explained_variance_
variance

array([1.84586269e+02, 4.91283064e+01, 2.85531752e+01, 1.95209350e+01,
       1.46607168e+01, 1.03253296e+01, 9.34837471e+00, 8.51930160e+00,
       6.55451454e+00, 6.01001575e+00, 4.08559767e+00, 3.75746367e+00,
       3.38191951e+00, 1.79118508e+00, 5.85734551e-01, 4.64535541e-01,
       3.93379633e-02, 2.08475492e-02, 1.79571571e-03])

In [13]:
eigen_ratio = (pca.explained_variance_ratio_) * 100
eigen_ratio

array([5.25384838e+01, 1.39833084e+01, 8.12704293e+00, 5.55621138e+00,
       4.17285551e+00, 2.93888145e+00, 2.66081240e+00, 2.42483469e+00,
       1.86560061e+00, 1.71062082e+00, 1.16287689e+00, 1.06948065e+00,
       9.62590144e-01, 5.09822040e-01, 1.66716654e-01, 1.32219981e-01,
       1.11966993e-02, 5.93380336e-03, 5.11111586e-04])

In [15]:
scree_data = []
cumulative_sum = 0
for i in range(len(eigen_ratio)):
    cumulative_sum = (cumulative_sum + eigen_ratio[i]).round(2)
    scree_data.append({
        'factor': i + 1,
        'eigenvalue': eigen_ratio[i].round(2),
        'cumulative': cumulative_sum.round(2)
    })
scree_data

[{'factor': 1, 'eigenvalue': 52.54, 'cumulative': 52.54},
 {'factor': 2, 'eigenvalue': 13.98, 'cumulative': 66.52},
 {'factor': 3, 'eigenvalue': 8.13, 'cumulative': 74.65},
 {'factor': 4, 'eigenvalue': 5.56, 'cumulative': 80.21},
 {'factor': 5, 'eigenvalue': 4.17, 'cumulative': 84.38},
 {'factor': 6, 'eigenvalue': 2.94, 'cumulative': 87.32},
 {'factor': 7, 'eigenvalue': 2.66, 'cumulative': 89.98},
 {'factor': 8, 'eigenvalue': 2.42, 'cumulative': 92.4},
 {'factor': 9, 'eigenvalue': 1.87, 'cumulative': 94.27},
 {'factor': 10, 'eigenvalue': 1.71, 'cumulative': 95.98},
 {'factor': 11, 'eigenvalue': 1.16, 'cumulative': 97.14},
 {'factor': 12, 'eigenvalue': 1.07, 'cumulative': 98.21},
 {'factor': 13, 'eigenvalue': 0.96, 'cumulative': 99.17},
 {'factor': 14, 'eigenvalue': 0.51, 'cumulative': 99.68},
 {'factor': 15, 'eigenvalue': 0.17, 'cumulative': 99.85},
 {'factor': 16, 'eigenvalue': 0.13, 'cumulative': 99.98},
 {'factor': 17, 'eigenvalue': 0.01, 'cumulative': 99.99},
 {'factor': 18, 'eigen

In [16]:
biplot_data_x, biplot_data_y = [], []
for i in range(len(df_pca)):
    biplot_data_x.append({
        'pca1': df_pca[i][0],
        'pca2': df_pca[i][1]
    })

for i in range(len(df_pca_attributes)):
    biplot_data_y.append({
        'pca1_attributes': df_pca_attributes[i][0],
        'pca2_attributes': df_pca_attributes[i][1]
    })

In [17]:
biplot_data_x

[{'pca1': 0.7697074582147603, 'pca2': -0.07226263316164853},
 {'pca1': 0.974791536561351, 'pca2': -0.38121189053324317},
 {'pca1': 0.6804889841007147, 'pca2': -0.1492569807749136},
 {'pca1': 0.7077823409583478, 'pca2': 0.2948824758229561},
 {'pca1': 0.9083317983356413, 'pca2': 0.04656833238626139},
 {'pca1': 0.9450950920495472, 'pca2': -0.1647934506721267},
 {'pca1': 0.9049197728631686, 'pca2': -0.21845553028977974},
 {'pca1': 0.766272936494168, 'pca2': 0.07460843197022721},
 {'pca1': 0.8510715943630545, 'pca2': -0.16451217229710013},
 {'pca1': 0.941433084388196, 'pca2': -0.580380983534576},
 {'pca1': 0.5295532013564994, 'pca2': 0.5432577785173007},
 {'pca1': 0.7921809542324462, 'pca2': -0.5261182313411473},
 {'pca1': 0.44321125333277395, 'pca2': -0.4502573984585763},
 {'pca1': 0.7044471274550056, 'pca2': -0.08283434379170171},
 {'pca1': 0.7695093731464323, 'pca2': -0.2763797741798376},
 {'pca1': 0.394919239581517, 'pca2': -0.03264671750525689},
 {'pca1': 0.835885697716728, 'pca2': -0.

In [18]:
biplot_data_y

[{'pca1_attributes': 0.17508924911304422,
  'pca2_attributes': -0.1015946041280764},
 {'pca1_attributes': 0.8764660684757772,
  'pca2_attributes': -0.09691465913332445},
 {'pca1_attributes': 0.9399104232042821,
  'pca2_attributes': -0.7308041078519233},
 {'pca1_attributes': -1.0, 'pca2_attributes': -1.0},
 {'pca1_attributes': -0.09644538221561481,
  'pca2_attributes': -0.15567393963080522},
 {'pca1_attributes': 0.19810233487977186,
  'pca2_attributes': -0.6910434319090708},
 {'pca1_attributes': -0.44352399049682045,
  'pca2_attributes': -0.6136797865736565},
 {'pca1_attributes': -0.3009272729552892,
  'pca2_attributes': 0.05472936117555338},
 {'pca1_attributes': -0.12727612601578817,
  'pca2_attributes': 0.12850519604748262},
 {'pca1_attributes': 0.07537110089716605,
  'pca2_attributes': 0.4644219080697174},
 {'pca1_attributes': -0.3259199519691745,
  'pca2_attributes': -0.04289683486585358},
 {'pca1_attributes': -0.040203028258243144,
  'pca2_attributes': 0.5864669586718994},
 {'pca1_

In [19]:
pca.components_

array([[ 1.37974835e-01,  4.48090305e-01,  4.76142381e-01,
        -3.81593740e-01,  1.79151363e-02,  1.48150127e-01,
        -1.35546516e-01, -7.24970299e-02,  4.28324782e-03,
         9.38842130e-02, -8.35476039e-02,  4.27828302e-02,
        -1.80716737e-01,  1.52707358e-01, -1.43090470e-01,
         9.23041183e-02, -8.02283816e-02, -1.35400832e-02,
         5.02711134e-01],
       [ 2.77456753e-02,  2.98929504e-02, -2.60951279e-01,
        -3.84465043e-01,  2.93273249e-03, -2.42708093e-01,
        -2.07211730e-01,  9.94709948e-02,  1.33321181e-01,
         2.87448115e-01,  5.46776703e-02,  3.43445417e-01,
         5.33184651e-01, -2.21505504e-01, -2.24357198e-01,
        -1.75472520e-01, -1.12905861e-01, -1.24071013e-01,
         8.19550156e-02],
       [-1.83295753e-02,  5.49526682e-02,  3.94430479e-01,
         4.06435052e-01, -7.64923083e-04,  1.54858783e-01,
         1.84181330e-01, -2.98686900e-02,  3.79316863e-02,
         1.62330643e-01,  1.14970981e-01,  4.32745260e-01,
    