In [None]:
from glob import glob
from math import isnan
from pathlib import Path

import pandas as pd

# Display spellings for school tokens that str.capitalize() would get wrong.
SCHOOL_FIXES = {
    'Okstate': 'OKState',
    'lsu': 'LSU'
}


def get_file(input_file):
    """Load one CSV and tag every row with the school parsed from its file name.

    The school token is the part of the file name (extension removed) that
    follows the last underscore, or the whole name when there is no
    underscore. Tokens listed in SCHOOL_FIXES are replaced by their mapped
    spelling; every other token is passed through str.capitalize().

    Args:
        input_file: path of the CSV file to read (str or path-like).

    Returns:
        The DataFrame read from the file with an added 'school' column.
    """
    df = pd.read_csv(filepath_or_buffer=input_file)
    # Look at the file name only: splitting the full path would pick up
    # underscores in directory names, or return the whole path when the file
    # name itself has no underscore.
    token = Path(input_file).stem.split('_')[-1]
    df['school'] = SCHOOL_FIXES.get(token, token.capitalize())
    return df

def fix_int(arg, fill_value):
    """Coerce a raw cell value to int, substituting fill_value when it is missing.

    Strings are stripped, and only the text before the first comma is used
    (so '3,4' becomes 3). Float-formatted strings such as '2.0' are accepted.
    None, NaN, 'nan' and empty or whitespace-only strings count as missing.

    Args:
        arg: the raw value (str, int, float or None).
        fill_value: returned unchanged when arg is missing.

    Returns:
        The integer value of arg, or fill_value when arg is missing.

    Raises:
        ValueError: if arg is a string that is not numeric.
    """
    if arg is None:
        return fill_value
    if isinstance(arg, str):
        # Keep only the first comma-separated entry.
        head = arg.split(',')[0].strip()
        if not head:
            return fill_value
        try:
            return int(head)
        except ValueError:
            # Not a plain integer literal: allow '2.0' or 'nan'. float()
            # re-raises ValueError for text that is not numeric at all.
            number = float(head)
    else:
        number = arg
    if isnan(number):
        return fill_value
    return int(number)
    

# Glob pattern matching the CSV files to load.
DATA_GLOB = '/kaggle/input/draft-picks-from-each-university/*.csv'
# Placeholders substituted when 'Rd' / 'Overall' is missing (NaN).
ROUND_FILL = 20
OVERALL_FILL = 222

# glob() returns matches in arbitrary order; sort so the row order of the
# combined frame does not depend on the filesystem.
files = sorted(glob(pathname=DATA_GLOB))
df = pd.concat(
    objs=[get_file(input_file=input_file) for input_file in files],
    # Each file is read with its own default index, so renumber the rows to
    # avoid duplicate index labels in the combined frame.
    ignore_index=True,
)
# Comma-separated values keep only their first entry (an arbitrary choice).
df['round'] = df['Rd'].apply(func=fix_int, args=(ROUND_FILL,))
df['overall'] = df['Overall'].apply(func=fix_int, args=(OVERALL_FILL,))
# Fixed seed so the preview is the same on every Restart & Run All.
df.sample(n=5, random_state=0)

In [None]:
# Print the column dtypes and non-null counts of df.
df.info()

In [None]:
from plotly.express import scatter
# 'G' against 'MP' on log-log axes, colored by 'PTS'.
scatter(
    data_frame=df,
    x='G',
    y='MP',
    color='PTS',
    hover_name='Player',
    hover_data=['school'],
    log_x=True,
    log_y=True,
)

Not surprisingly, minutes played tracks games played more or less linearly (on a log-log plot).

In [None]:
# 'G' against 'PTS' on log-log axes, colored by the cleaned 'round' column.
scatter(
    data_frame=df,
    x='G',
    y='PTS',
    color='round',
    hover_name='Player',
    hover_data=['school'],
    log_x=True,
    log_y=True,
)

Similarly, points follow games played linearly (on a log-log plot), and, not surprisingly, almost no one drafted outside the first couple of rounds makes much of an impact.

In [None]:
# Same axes as the previous plot, colored by the cleaned 'overall' column.
scatter(
    data_frame=df,
    x='G',
    y='PTS',
    color='overall',
    hover_name='Player',
    hover_data=['school'],
    log_x=True,
    log_y=True,
)

In [None]:
# 'G' against 'PTS' on log-log axes, one color per school; 'overall' moves to the hover text.
scatter(
    data_frame=df,
    x='G',
    y='PTS',
    color='school',
    hover_name='Player',
    hover_data=['overall'],
    log_x=True,
    log_y=True,
)

We really can't tease out the impact of being part of a marquee program from this dataset, but it is interesting to see how players from all schools end up in different parts of the distribution.

In [None]:
# 'FT' against '3P' on linear axes with an OLS trendline.
scatter(
    data_frame=df,
    x='FT',
    y='3P',
    color='overall',
    hover_name='Player',
    hover_data=['school'],
    log_x=False,
    log_y=False,
    trendline='ols',
)

Scoring points one at a time (free throws) vs. three at a time (three-pointers) is not strongly correlated.

In [None]:
# 'FT' against '3P' on log-log axes with an OLS trendline.
# Setting log_x/log_y only changes the axes; the trendline is still fitted to
# the untransformed values unless trendline_options asks for a log-space fit.
# A log-space fit rejects non-positive values, and such points cannot be drawn
# on log axes anyway, so keep only rows where both columns are positive.
positive_ft_3p = df[(df['FT'] > 0) & (df['3P'] > 0)]
scatter(
    data_frame=positive_ft_3p,
    x='FT',
    y='3P',
    color='overall',
    hover_name='Player',
    hover_data=['school'],
    log_x=True,
    log_y=True,
    trendline='ols',
    trendline_options=dict(log_x=True, log_y=True),
)

In [None]:
from plotly.express import histogram
# Distribution of 'G', with bars colored by school.
histogram(
    data_frame=df,
    x='G',
    color='school',
)

In [None]:
# Distribution of 'PTS' by school, with counts on a log scale.
histogram(
    data_frame=df,
    x='PTS',
    color='school',
    log_y=True,
)