In [9]:
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path

In [32]:
year = input("Which year should we visualize? ")
if year == "all":
    dfs = []
    for path in Path("data").glob("2*.csv"):
        with path.open() as f:
            df = pd.read_csv(f)
            dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)
    df = df.drop("Rk", axis=1)
    df.reset_index()
else:
    year = int(year)
    year_plus_1 = str((year + 1) % 100)[-2:]
    year_str = f"{year}{year_plus_1}"
    print(f"Load year {year_str}...")
    df = pd.read_csv(f"data/{year_str}.csv")
    df = df.drop("Rk", axis=1)
df


Unnamed: 0,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,PIM,...,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%,-9999
0,Justin Abdelkader\abdelju01,31,DET,LW,71,6,13,19,-14,38,...,95,6.3,1093,15:24,34.0,185.0,52.0,51.0,50.5,
1,Pontus Aberg\abergpo01,25,TOT,LW,59,12,13,25,-14,20,...,101,11.9,861,14:36,11.0,45.0,2.0,17.0,10.5,
2,Pontus Aberg\abergpo01,25,ANA,LW,37,11,8,19,-10,14,...,74,14.9,578,15:37,7.0,31.0,2.0,9.0,18.2,
3,Pontus Aberg\abergpo01,25,MIN,LW,22,1,5,6,-4,6,...,27,3.7,283,12:52,4.0,14.0,0.0,8.0,0.0,
4,Vitaly Abramov\abramvi01,20,OTT,RW,1,0,0,0,-3,0,...,0,,14,13:52,1.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17446,Mike Zigomanis\zigommi01,25,CAR,C,21,1,0,1,1,4,...,16,6.3,198,9:25,,,,,,
17447,Mike Zigomanis\zigommi01,25,STL,C,2,0,0,0,0,0,...,1,0.0,15,7:39,,,,,,
17448,Sergei Zubov*\zubovse01,35,DAL,D,78,13,58,71,20,46,...,141,9.2,2063,26:27,,,,,,
17449,Dainius Zubrus\zubruda01,27,WSH,RW,71,23,34,57,3,84,...,181,12.7,1446,20:22,,,,,,


In [22]:
# Test whether an array consists of integers
def check_integer(x):
    x_rounded = np.round(x)
    return np.all(x_rounded == x)

# Test whether an array has only nonnegative elements
def check_nonnegative(x):
    x = np.array(x)
    return np.all(x >= 0)

# Test
check_integer([1.0,23]), check_integer([1.1,23]), check_nonnegative([1.1,23]), check_nonnegative([1.1,-3])

(True, False, True, False)

In [29]:
stat_rows = []
for column in df.columns:
    vals = np.array(df[column])
    if np.issubdtype(vals.dtype, np.number):
        df_not_null = df[df[column].notnull()]
        vals = np.array(df_not_null[column])
        unique, counts = np.unique(vals, return_counts=True)
        value_counts = dict(zip(unique, counts))
        zero_counts = value_counts.get(0, 0)
        stat_rows.append([column, check_integer(vals), check_nonnegative(vals),
                          len(df_not_null), np.mean(vals), np.std(vals), np.max(vals), np.min(vals),
                          zero_counts / len(df_not_null)])
    else:
        pass
        #print("Non-numeric type")
def get_model(integer, nonnegative, max, min):
    # Count value
    if integer and nonnegative:
        if min < 2:
            return "neg-binomial"
        else:
            return "gaussian"
    
    # Percentage score
    if not integer and nonnegative and max == 100:
        return "beta"
    
    # Nonnegative
    if not integer and nonnegative:
        return "lognormal"
    
    # Default
    return "gaussian"

stat_df = pd.DataFrame(stat_rows, columns = ["stat", "integer", "nonnegative", "len", "mean", "std", "max", "min", "zeros"])
stat_df["model"] = stat_df.apply(lambda row: get_model(row["integer"], row["nonnegative"], row["max"], row["min"]), axis=1)
stat_df

Unnamed: 0,stat,integer,nonnegative,len,mean,std,max,min,zeros,model
0,Age,True,True,17451,26.704888,4.452052,48.0,18.0,0.0,gaussian
1,GP,True,True,17451,44.972724,27.99599,84.0,1.0,0.0,neg-binomial
2,G,True,True,17451,6.931064,8.535042,65.0,0.0,0.224457,neg-binomial
3,A,True,True,17451,11.812217,12.729524,96.0,0.0,0.150708,neg-binomial
4,PTS,True,True,17451,18.743281,20.10678,128.0,0.0,0.118274,neg-binomial
5,+/-,True,False,17451,-0.566329,9.25895,64.0,-47.0,0.094321,gaussian
6,PIM,True,True,17451,27.10051,28.74936,324.0,0.0,0.104922,neg-binomial
7,PS,False,False,17451,2.230944,2.670102,17.2,-1.9,0.069795,gaussian
8,EV,True,True,17451,5.107214,6.01634,48.0,0.0,0.240617,neg-binomial
9,PP,True,True,17451,1.615609,2.963742,27.0,0.0,0.594293,neg-binomial


In [34]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
print(df.columns)
def on_change(x):
    sns.set_theme()
    return sns.displot(df, x=x, kde=True)

interact(on_change, x=list(df.columns)[1:])

Index(['Player', 'Age', 'Tm', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'PS',
       'EV', 'PP', 'SH', 'GW', 'EV.1', 'PP.1', 'SH.1', 'S', 'S%', 'TOI',
       'ATOI', 'BLK', 'HIT', 'FOW', 'FOL', 'FO%', '-9999'],
      dtype='object')


interactive(children=(Dropdown(description='x', options=('Age', 'Tm', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PI…

<function __main__.on_change(x)>