In [66]:
import numpy as np
import pandas as pd
import csv
import os

In [67]:
# df = pd.read_csv("csv_files/basketball_2022.csv")
# df.head(5)

In [68]:
# df.info()

In [69]:
# Most of our features have a large number of missing values.
# Some of this is expected: not all athletes have Instagram, Twitter, or TikTok accounts, or NIL deals.
# First we should clean the available data into a more coherent structure.
# The data contains two kinds of missing values, np.nan and '-'; we need to handle both and convert them to np.nan.
# In the age column for football 2022, one of the unique values is '1819'; this is likely an input error in the On3.com database.

In [70]:
# Helpers used by clean_cols to normalise individual raw cells.
def _is_missing(value):
    """Return True for either missing-value marker in the raw data: NaN or '-'."""
    return pd.isna(value) or value == '-'


def _height_to_inches(height):
    """Convert a 'feet-inches' string such as '6-3.5' to total inches (NaN if missing)."""
    if _is_missing(height):
        return np.nan
    pieces = height.split('-')
    return float(pieces[0]) * 12 + float(pieces[1])


def _skill_to_float(skill):
    """Convert a skill rating to float, dropping a trailing '+' (e.g. '95+' -> 95.0)."""
    if _is_missing(skill):
        return np.nan
    if isinstance(skill, str) and skill.endswith('+'):
        return float(skill[:-1])
    return float(skill)


def _abbrev_to_float(value, prefix=''):
    """Expand an abbreviated count such as '9.9K' or '$1.5M' to a plain float.

    `prefix` is an optional leading marker (e.g. '$') that is removed when present.
    A trailing 'K' multiplies by 1,000 and a trailing 'M' by 1,000,000; a value
    with no suffix is converted as-is. Missing markers map to NaN.
    """
    if _is_missing(value):
        return np.nan
    # Cells that pandas already parsed as numbers have no suffix to expand.
    if not isinstance(value, str):
        return float(value)
    text = value[len(prefix):] if prefix and value.startswith(prefix) else value
    if text.endswith('M'):
        return float(text[:-1]) * 1000000
    if text.endswith('K'):
        return float(text[:-1]) * 1000
    return float(text)


def _state_from_hometown(hometown):
    """Return the text after the last comma of 'City, ST', without surrounding spaces."""
    if _is_missing(hometown):
        return np.nan
    return hometown.rsplit(',', 1)[-1].strip()


def clean_cols(df, sport):
    """Add cleaned, numeric/normalised columns to `df` and tag it with `sport`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns POS_HEI_WEI, SKILL, COLLDIST, NILVAL,
        INSTA, TWIT, TIK and HOTOWN.
    sport : str
        Value written to every row of the new SPORT column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, modified in place, with the added columns POS,
        HEIGHT_IN, WEIGHT_LBS, SKILL_, COLLDIST_MI, NILVAL_LONG_USD, INSTA_LONG,
        TWIT_LONG, TIK_LONG, STATE and SPORT. Both NaN and '-' in the raw
        columns become np.nan in the cleaned columns.
    """
    # 'POS / HEIGHT / WEIGHT' is split once and reused for all three columns.
    pos_hei_wei = df['POS_HEI_WEI'].str.split(' / ')
    df['POS'] = [np.nan if _is_missing(pos) else pos for pos in pos_hei_wei.str[0]]
    df['HEIGHT_IN'] = [_height_to_inches(height) for height in pos_hei_wei.str[1]]
    df['WEIGHT_LBS'] = [np.nan if _is_missing(weight) else float(weight) for weight in pos_hei_wei.str[2]]

    df['SKILL_'] = [_skill_to_float(skill) for skill in df['SKILL']]
    # COLLDIST looks like '271 mi.'; only the leading number is kept.
    df['COLLDIST_MI'] = [np.nan if _is_missing(dist) else float(dist.split()[0]) for dist in df['COLLDIST']]

    # NIL valuations carry a leading '$'; the follower counts do not.
    df['NILVAL_LONG_USD'] = [_abbrev_to_float(nilval, prefix='$') for nilval in df['NILVAL']]
    df['INSTA_LONG'] = [_abbrev_to_float(insta) for insta in df['INSTA']]
    df['TWIT_LONG'] = [_abbrev_to_float(twit) for twit in df['TWIT']]
    df['TIK_LONG'] = [_abbrev_to_float(tik) for tik in df['TIK']]

    df['STATE'] = [_state_from_hometown(ht) for ht in df['HOTOWN']]
    df['SPORT'] = sport
    return df



In [71]:
# Clean every raw per-season CSV in `directory` and merge the results per sport.
directory = 'csv_files'
# The merged outputs are written into `directory` too; they must be excluded
# from the input scan, otherwise re-running this cell would clean and merge
# its own previous output and duplicate every row.
clean_suffix = '_clean.csv'
frames_by_sport = {'basketball': [], 'football': []}

# sorted() makes the merge order reproducible; os.listdir order is arbitrary.
for file in sorted(os.listdir(directory)):
    # Skip non-CSV entries (e.g. checkpoint folders) and previously merged outputs.
    if not file.endswith('.csv') or file.endswith(clean_suffix):
        continue
    # File names look like '<sport>_<year>.csv'.
    sport = file.split('_')[0]
    if sport not in frames_by_sport:
        continue
    f = os.path.join(directory, file)
    df = pd.read_csv(f)
    frames_by_sport[sport].append(clean_cols(df, sport))

bball_count = len(frames_by_sport['basketball'])
fball_count = len(frames_by_sport['football'])

# Concatenate once per sport instead of growing a frame inside the loop.
# A sport with no input files yields an empty frame and writes no output.
bball_clean = pd.concat(frames_by_sport['basketball']) if bball_count else pd.DataFrame()
fball_clean = pd.concat(frames_by_sport['football']) if fball_count else pd.DataFrame()

print("{} basketball data files cleaned and merged together".format(bball_count))
print("{} football data files cleaned and merged together".format(fball_count))
if bball_count:
    bball_clean.to_csv(os.path.join(directory, 'basketball' + clean_suffix))
if fball_count:
    fball_clean.to_csv(os.path.join(directory, 'football' + clean_suffix))


6 basketball data files cleaned and merged together
6 footballall data files cleaned and merged together


In [72]:
# Spot-check five randomly chosen rows of the merged football data.
fball_clean.sample(n=5)

Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,...,HEIGHT_IN,WEIGHT_LBS,SKILL_,COLLDIST_MI,NILVAL_LONG_USD,INSTA_LONG,TWIT_LONG,TIK_LONG,STATE,SPORT
1614,Miles Williams,,WR / 6-0 / 170,,,82.05,,,,271 mi.,...,72.0,170.0,82.05,271.0,,,,,,football
392,Davion Gause,,RB / 5-10 / 205,,,89.18,,,North Carolina,690 mi.,...,70.0,205.0,89.18,690.0,,10700.0,2000.0,,,football
79,Shelton Sampson Jr.,2023 - present,WR / 6-3.5 / 190,Freshman,18.0,94.81,Catholic,"Baton Rouge, LA",LSU,0 mi.,...,75.5,190.0,94.81,0.0,144000.0,9900.0,5600.0,,LA,football
126,Billy Schrauth,2022 - present,IOL / 6-4 / 296,Redshirt Freshman,20.0,93.67,St. Mary's Springs,"Fond Du Lac, WI",Notre Dame,180 mi.,...,76.0,296.0,93.67,180.0,,2000.0,2100.0,,WI,football
193,Phil Mafah,2021 - present,RB / 6-1 / 230,Junior,20.0,91.95,Grayson,"Loganville, GA",Clemson,86 mi.,...,73.0,230.0,91.95,86.0,,12300.0,5100.0,1700.0,GA,football


In [73]:
# Spot-check five randomly chosen rows of the merged basketball data.
bball_clean.sample(n=5)

Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,...,HEIGHT_IN,WEIGHT_LBS,SKILL_,COLLDIST_MI,NILVAL_LONG_USD,INSTA_LONG,TWIT_LONG,TIK_LONG,STATE,SPORT
29,Jovani Ruff,,SG / 6-4 / 175,,,96.14,,,California,365 mi.,...,76.0,175.0,96.14,365.0,,,,,,basketball
27,Efton Reid,2023 - present,C / 7-0 / 240,Junior,-,97.57,IMG Academy,"Bradenton, FL",LSU,566 mi.,...,84.0,240.0,97.57,566.0,,8300.0,2400.0,,FL,basketball
111,Prince Aligbe,2022 - present,SF / 6-7 / 225,Sophomore,-,92.2,Minnehaha Academy,"Minneapolis, MN",Boston College,1117 mi.,...,79.0,225.0,92.2,1117.0,,11800.0,2200.0,,MN,basketball
306,Damonze Woods,,PF / 6-7 / 210,,,81.0,,,UT-Arlington,322 mi.,...,79.0,210.0,81.0,322.0,,,,,,basketball
25,Jackson Shelstad,2023 - present,PG / 6-0 / 170,Freshman,-,97.03,West Linn,"West Linn, OR",Oregon,90 mi.,...,72.0,170.0,97.03,90.0,100000.0,20000.0,1700.0,,OR,basketball
