In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
import re

# demos=pd.read_csv('demographics.csv')
# demos.drop(demos[demos['Season']==2016].index, inplace=True)
# demos.drop(columns='Unnamed: 0', inplace=True, axis=1)
# demos.info()


I notice that for 2016, the columns contain the names of positions instead of numbers. I am going to use height16edits.ipynb to make changes to that year and re-insert the data.

The formatting needed to combine the datasets is done, so we can remove the code from our runs.

In [2]:
# sixteen=pd.read_csv('sixteen.csv')
# sixteen=sixteen.drop(columns='Unnamed: 0',axis=1)
# print(demos.columns)
# print(sixteen.columns)

In [3]:

# cols_sixteen = set(sixteen.columns)
# cols_demos = set(demos.columns)

# only_in_sixteen = cols_sixteen - cols_demos
# only_in_demos = cols_demos - cols_sixteen

# print("Columns in sixteen but not in demos:", only_in_sixteen)
# print("Columns in demos but not in sixteen:", only_in_demos)
# demos=demos.drop(columns=only_in_demos, axis=1)

In [4]:
# demos.head()

In [5]:
# demofinal=pd.merge(sixteen, demos, on=['Season','TeamName','Size','SizeRank','Hgt5','Hgt5Rank','Hgt4','Hgt4Rank','Hgt3','Hgt3Rank','Hgt2','Hgt2Rank','Hgt1','Hgt1Rank','HgtEff','HgtEffRank','Exp','ExpRank','Bench','BenchRank','Pts5','Pts5Rank','Pts4','Pts4Rank','Pts3','Pts3Rank','Pts2','Pts2Rank','Pts1','Pts1Rank','OR5','OR5Rank','OR4','OR4Rank','OR3','OR3Rank','OR2','OR2Rank','OR1','OR1Rank','DR5','DR5Rank','DR4','DR4Rank','DR3','DR3Rank','DR2','DR2Rank','DR1','DR1Rank'], how='outer')

In [6]:
# demofinal.to_csv('finalheights.csv')

In [7]:
# Load the merged table that the (now commented-out) cell above wrote to disk.
heights_path = 'finalheights.csv'
demofinal = pd.read_csv(heights_path)

In [8]:
# Remove every column whose name contains "Rank"; the remaining columns keep
# their original order.
rank_columns = demofinal.filter(regex='Rank').columns
demofinal = demofinal.drop(columns=rank_columns)

In [9]:
# demofinal.drop(columns='Unnamed: 0')

In [None]:
# Print the column names, dtypes and non-null counts of the current frame.
demofinal.info()

In [11]:
# Keep only rows whose Size is non-zero and remove the 'Unnamed: 0' column.
# The drop is chained (not in-place) so it never mutates a filtered slice, and
# errors='ignore' lets the cell be re-run after the column is already gone
# instead of raising KeyError.
demofinal = (
    demofinal[demofinal['Size'] != 0]
    .drop(columns='Unnamed: 0', errors='ignore')
)

We have 

In [None]:
# Draw one boxplot per non-object column of demofinal.
numerical = demofinal.select_dtypes(exclude='object')
for col in numerical:
    fig, ax = plt.subplots()
    # Matplotlib's boxplot renders an empty box when the data contain NaN,
    # so missing values are removed before plotting.
    ax.boxplot(numerical[col].dropna())
    ax.set_title(f'Boxplot for {col}')
    ax.set_xlabel('Data')
    ax.set_ylabel('Values')
    # Show and close each figure inside the loop so that figures do not pile
    # up (matplotlib warns once more than 20 are open at the same time).
    plt.show()
    plt.close(fig)

In [None]:
# For every column in `numerical`, plot the ten rows of demofinal with the
# largest values in that column, labelled "<TeamName> <Season>".
for col in numerical:
    top_10 = demofinal.nlargest(10, col)
    # Build the bar labels in one vectorised step instead of writing them
    # cell by cell inside an iterrows() loop.
    top_10 = top_10.assign(
        NameSeason=top_10['TeamName'].astype(str) + ' ' + top_10['Season'].astype(str)
    )
    fig, ax = plt.subplots()
    sns.barplot(x=top_10[col], y=top_10['NameSeason'], ax=ax)
    ax.set_title(f'Top 10 teams by {col}')
    ax.set_xlabel(f'{col}')
    plt.show()
    plt.close(fig)


In [None]:
# For every column in `numerical`, plot the ten rows of demofinal with the
# smallest values in that column, labelled "<TeamName> <Season>".
for col in numerical:
    top_10 = demofinal.nsmallest(10, col)
    # Build the bar labels in one vectorised step instead of writing them
    # cell by cell inside an iterrows() loop.
    top_10 = top_10.assign(
        NameSeason=top_10['TeamName'].astype(str) + ' ' + top_10['Season'].astype(str)
    )
    fig, ax = plt.subplots()
    sns.barplot(x=top_10[col], y=top_10['NameSeason'], ax=ax)
    ax.set_title(f'Bottom 10 teams by {col}')
    ax.set_xlabel(f'{col}')
    plt.show()
    plt.close(fig)

We run into the same problem as before: some teams opted out of the pandemic year.
Because those teams have no rosters (and therefore have Size values of 0.0), we can easily drop them.

In [15]:
# Keep only the rows whose Size is strictly greater than zero.
has_roster = demofinal['Size'].gt(0.0)
demofinal = demofinal.loc[has_roster]

In [None]:
# Print the column names, dtypes and non-null counts of the current frame.
demofinal.info()

Skewness and Kurtosis of Data

In [None]:
# Column-wise skewness of every non-object column; missing values are skipped
# (axis=0 and skipna=True are the pandas defaults).
numerical = demofinal.select_dtypes(exclude=['object'])
numerical.skew()

In [None]:
# Column-wise kurtosis with missing values skipped; `kurt` is the pandas alias
# of `kurtosis`, and axis=0 / skipna=True are its defaults.
numerical.kurt()

The issues in this dataset appear to arise from the Offensive and Defensive Rebounding columns; they may be caused by the outliers that have been showing up in the visualizations.
I may have to drop outliers from the offending columns using an IQR rule to correct the issue.

Correlation Analysis, broken up by relevant information clusters (experience, height, rebounding numbers)

In [None]:
# Split the numeric columns into three clusters by column-name pattern and
# draw Pearson and Spearman correlation heatmaps for each cluster.
mask = numerical.columns.str.contains('Hgt|Exp|Bench', regex=True)
masktwo = numerical.columns.str.contains('Pts', regex=True)
maskthree = numerical.columns.str.contains('OR|DR', regex=True)
height_exp = numerical.loc[:, mask]
Pts = numerical.loc[:, masktwo]
Res = numerical.loc[:, maskthree]

# Label each subset with the pattern that selected it, so the six figures can
# be told apart by their titles.
clusters = {
    'Hgt/Exp/Bench columns': height_exp,
    'Pts columns': Pts,
    'OR/DR columns': Res,
}

for label, x in clusters.items():
    pearson = x.corr(method='pearson')
    spearman = x.corr(method='spearman')
    for method_name, corr in (('Pearson', pearson), ('Spearman', spearman)):
        fig, ax = plt.subplots(figsize=(10, 8))
        # Pin the colour scale to the full correlation range so that colours
        # are comparable across figures and 0 maps to the neutral midpoint.
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title(f'{method_name} Correlation Matrix ({label})')
        plt.show()
        plt.close(fig)

Variance Inflation Factor for each subset of the demographic data

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Compute and plot the variance inflation factor of every column within each
# of the three feature subsets.
for x in (height_exp, Pts, Res):
    # variance_inflation_factor regresses one column on the others exactly as
    # given and does not add an intercept itself, so without a constant column
    # the result is an uncentred VIF. Rows with missing values are removed
    # first because the underlying OLS fit does not drop them.
    design = add_constant(x.dropna(), has_constant='add')

    vif_data = pd.DataFrame()
    vif_data['features'] = x.columns
    # Column 0 of `design` is the added constant; report only real features.
    vif_data['VIF'] = [
        variance_inflation_factor(design.values, i)
        for i in range(1, design.shape[1])
    ]
    print(vif_data)

    vif_data = vif_data.sort_values(by="VIF", ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(vif_data["features"], vif_data["VIF"], color="skyblue")
    plt.xlabel("Variance Inflation Factor (VIF)")
    plt.ylabel("Feature")
    plt.title("VIF Analysis for Multicollinearity")
    # barh draws the first row at the bottom; invert so the largest VIF is on top.
    plt.gca().invert_yaxis()
    plt.show()

Multivariate Analysis 

In [21]:
# Combine Season and TeamName into a single "<Season> <TeamName>" label, then
# remove the two source columns.
team_year_label = demofinal['Season'].astype(str) + ' ' + demofinal['TeamName']
demofinal['Team/Year'] = team_year_label

demofinal = demofinal.drop(['Season', 'TeamName'], axis=1)

In [22]:
# Move the 'Team/Year' column to the first position: take it out of the frame,
# then re-insert it at index 0.
popper = demofinal.pop(item='Team/Year')
demofinal.insert(loc=0, column='Team/Year', value=popper)

In [None]:
import plotly.express as px

# Interactive scatter of HgtEff against Exp; hovering a point shows its
# Team/Year label.
height_vs_exp_options = dict(
    x='HgtEff',
    y='Exp',
    hover_name='Team/Year',
    title='relationship between effective height and average experience',
)
fig = px.scatter(demofinal, **height_vs_exp_options)
fig.show()

In [None]:
# Interactive scatter of HgtEff against Hgt3; hovering a point shows its
# Team/Year label.
height_vs_hgt3_options = dict(
    x='HgtEff',
    y='Hgt3',
    hover_name='Team/Year',
    title='Finding teams whose effective height are most closely tied to the heights of their small forwards',
)
fig = px.scatter(demofinal, **height_vs_hgt3_options)
fig.show()