In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [2]:
#First we import the uncleanded but merged dataset
df=pd.read_csv("data/data_uncleaned.csv")
#We need all informations in regard to the amount of games played
df["W_pct"] = (df["W"] / df["G"])
df["PW_pct"] = (df["PW"] / df["G"])
#We drop the columns that have no relevant information about the performance of a team
#Furthermore, we dropped all variables that store absolute values instead of values per game
#because they do not allow to compare seasons with a different amount of games
#Furthermore, we dropped Attend. and Attend./G because they contain missing values
df = df.drop(['Unnamed: 27', 'Unnamed: 22', 'Unnamed: 17', 'Season', '1997_98', 'Arena', 'Attend.','Attend./G',
              'FGA', 'FG', '3P', '3PA', '2P', '2PA', 'FTA', 'FT', 'L', 'PL', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace',
              'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1', 'DRB%', 'FT/FGA.1', 'Season_x', 'Season_y',
              'Rk_y', 'Team', 'W', 'PW', 'G'], axis=1)
#Every team has been ranked in each season between 1 and 30 in every season
#If there is no information in this column, there is no important information in the row at all
#Therefore, we drop all rows that do contain missing values in the Rk_x axis
df=df.dropna(subset=['Rk_x'])
#The information in the column RK_x is not relevant and therefore will be dropped
df = df.drop(['Rk_x'], axis=1)

In [3]:
#We save the cleaned data set
df.to_csv("data/df_cleaned_before_manipulation.csv", index=False)

In [4]:
#Afterwards we found a mistake in our data while analyzing it
#We corrected the mistake manually and checked whether our data set is still good to go

In [5]:
df=pd.read_csv("data/df_cleaned_manipulated.csv")
#Somehow there is a empty column called Unnamed: 0, which we have to drop
df = df.drop(['Unnamed: 0'], axis=1)

In [6]:
#We now safe our final data set as df_cleaned
df.to_csv("data/df_cleaned.csv", index=False)

In [7]:
#For our project we need to evaluate how many principal components we need to explain 85% of the variance of the data
def find_min_k_pca(X, threshold=0.85):
    #Instantiate the PCA object with the number of components being the columns of the input matrix
    pca = PCA(n_components=X.shape[1])
    #Standardize inputs
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    pca.fit(X)
    #Find the index of the first element for which the cumulative variance explained
    #is at least the given threshold
    k = np.argmax(pca.explained_variance_ratio_.cumsum() >= threshold)
    #The function returns the number of principal components we need to explain 85% of the variance
    return k
#We call our function so that it gives us the number of principal components we need to explain 85% of our data
find_min_k_pca(df)

10