In [66]:
import numpy as np
import pandas as pd
import csv
import os

In [67]:
# df = pd.read_csv("csv_files/basketball_2022.csv")
# df.head(5)

In [68]:
# df.info()

In [69]:
# Most of our features have a large number of missing values.
# Some of this is expected: not all athletes have Instagram, Twitter, or TikTok accounts, or NIL deals.
# First we should clean the available data into a more coherent structure.
# Within our data there are two kinds of missing values, np.nan and '-'; we need to handle both and convert them to np.nan.
# In the AGE column for football 2022, one of the unique values is '1819'; this is likely an input error in the On3.com database.

# Future item notes:
# Remove any athletes who did not go to college (NBA G League)
# Two separate cleaned files: supervised and unsupervised

In [93]:
# This function take in a dataframe and cleans all needed columns
def clean_cols(df, sport,rec_year):
    df['POS'] =  [np.nan if (pd.isna(pos) or pos=='-') else pos for pos in df['POS_HEI_WEI'].str.split(' / ').str[0]]
    df['HEIGHT_IN'] = [np.nan if (pd.isna(height) or height=='-') else (float(height.split('-')[0])*12 + float(height.split('-')[1])) for height in df['POS_HEI_WEI'].str.split(' / ').str[1]]
    df['WEIGHT_LBS'] = [np.nan if (pd.isna(weight) or weight=='-') else float(weight) for weight in df['POS_HEI_WEI'].str.split(' / ').str[2]]
    df['SKILL_'] = [np.nan if pd.isna(skill) else skill if type(skill)==float else float(skill[0:-1]) if skill[-1]=='+' else float(skill) for skill in df['SKILL']]
    df['COLLDIST_MI'] = [np.nan if (pd.isna(dist) or dist=='-') else float(dist.split()[0]) for dist in df['COLLDIST']]
    df['NILVAL_LONG_USD'] = [np.nan if pd.isna(nilval) else float(nilval[1:-1])*1000000 if nilval[-1]=='M' else float(nilval[1:-1])*1000 if nilval[-1]=='K' else float(nilval[1:-1]) for nilval in df['NILVAL']]
    df['INSTA_LONG'] = [np.nan if (pd.isna(insta) or insta == '-') else float(insta[0:-1])*1000000 if insta[-1]=='M' else float(insta[0:-1])*1000 if insta[-1]=='K' else float(insta) for insta in df['INSTA']]
    df['TWIT_LONG'] = [np.nan if (pd.isna(twit) or twit == '-') else float(twit[0:-1])*1000000 if twit[-1]=='M' else float(twit[0:-1])*1000 if twit[-1]=='K' else float(twit) for twit in df['TWIT']]
    df['TIK_LONG'] = [np.nan if (pd.isna(tik) or tik == '-') else float(tik[0:-1])*1000000 if tik[-1]=='M' else float(tik[0:-1])*1000 if tik[-1]=='K' else float(tik) for tik in df['TIK']]
    df['STATE'] = [np.nan if (pd.isna(ht) or ht=='-') else ht[-3:] for ht in df['HOTOWN']]
    df['SPORT'] = sport
    df['RECRUIT_YEAR'] = rec_year
    return df



In [96]:
directory = 'csv_files'
bball_count = 0
fball_count = 0

for file in os.listdir(directory):
    f = os.path.join(directory, file)
    df = pd.read_csv(f)
    sport = file.split('_')[0]
    year = file.split('_')[-1].split('.')[0]
    if sport == 'basketball':
        if bball_count == 0:
            bball_clean = clean_cols(df,sport,year)
            bball_count += 1
        else:
            bball_clean = pd.concat([bball_clean,clean_cols(df,sport,year)])
            bball_count += 1
    elif sport == 'football':
        if fball_count == 0:
            fball_clean = clean_cols(df,sport,year)
            fball_count += 1
        else:
            fball_clean = pd.concat([fball_clean,clean_cols(df,sport,year)])
            fball_count += 1
        
print("{} basketball data files cleaned and merged together".format(bball_count))
print("{} footballall data files cleaned and merged together".format(fball_count))
bball_clean.to_csv('cleaned_files/basketball_clean.csv')
fball_clean.to_csv('cleaned_files/football_clean.csv')


6 basketball data files cleaned and merged together
6 footballall data files cleaned and merged together


In [97]:
dropnafball = fball_clean.dropna(subset=['NILVAL_LONG_USD'])
dropnafball.sample(25)

Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,...,WEIGHT_LBS,SKILL_,COLLDIST_MI,NILVAL_LONG_USD,INSTA_LONG,TWIT_LONG,TIK_LONG,STATE,SPORT,RECRUIT_YEAR
280,R Mason Thomas,2022 - present,DL / 6-1.5 / 215,Sophomore,19,90.38,Cardinal Gibbons,"Fort Lauderdale, FL",Oklahoma,1195 mi.,...,215.0,90.38,1195.0,65000.0,2800.0,3200.0,,FL,football,2022
263,Trey Zuhn,2021 - present,OT / 6-6 / 300,Redshirt Sophomore,20,90.86,Fossil Ridge,"Fort Collins, CO",Texas A&M,848 mi.,...,300.0,90.86,848.0,72000.0,1500.0,1700.0,,CO,football,2021
94,Joseph Phillips,,LB / 6-1.5 / 250,,,93.39,,,Auburn,15 mi.,...,250.0,93.39,15.0,122000.0,850.0,1200.0,110.0,,football,2024
138,Garrett Stover,,LB / 6-0 / 205,,,92.54,,,Ohio State,14 mi.,...,205.0,92.54,14.0,106000.0,2700.0,2800.0,599.0,,football,2024
45,Chase Bisontis,2023 - present,IOL / 6-4.5 / 315,Freshman,19,96.4,Don Bosco Prep,"Ramsey, NJ",Texas A&M,1430 mi.,...,315.0,96.4,1430.0,193000.0,15300.0,5300.0,3100.0,NJ,football,2023
111,Jaden Nickens,,WR / 6-3 / 170,,,92.11,,,Oklahoma,22 mi.,...,170.0,92.11,22.0,77000.0,12700.0,1900.0,4300.0,,football,2025
72,Casey Poe,,IOL / 6-4.5 / 290,,,93.98,,,Alabama,453 mi.,...,290.0,93.98,453.0,122000.0,2200.0,3000.0,101.0,,football,2024
120,Tackett Curtis,2023 - present,LB / 6-2.5 / 220,Freshman,19,93.62,Many,"Many, LA",USC,1443 mi.,...,220.0,93.62,1443.0,148000.0,3700.0,6300.0,1200.0,LA,football,2023
304,Aaron Bryant,2022 - present,DL / 6-3 / 290,Redshirt Freshman,19,90.08,Southaven,"Southaven, MS",Texas,556 mi.,...,290.0,90.08,556.0,168000.0,,2200.0,200.0,MS,football,2022
192,Terhyon Nichols,,CB / 5-10.5 / 185,,,91.44,,,Kentucky,76 mi.,...,185.0,91.44,76.0,78000.0,,1500.0,375.0,,football,2024


In [84]:
dropnabball = bball_clean.dropna(subset=['NILVAL_LONG_USD'])
dropnabball.sample(25)

Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,...,HEIGHT_IN,WEIGHT_LBS,SKILL_,COLLDIST_MI,NILVAL_LONG_USD,INSTA_LONG,TWIT_LONG,TIK_LONG,STATE,SPORT
21,Elmarko Jackson,2023 - present,PG / 6-4 / 200,Freshman,19,97.15,South Kent School,"Marlton, NJ",Kansas,1082 mi.,...,76.0,200.0,97.15,1082.0,121000.0,12900.0,2300.0,,NJ,basketball
43,Kobe Bufkin,,SG / 6-4 / 175,,,96.18,Grand Rapids Christian,"Grand Rapids, MI",Michigan,103 mi.,...,76.0,175.0,96.18,103.0,103000.0,13000.0,3600.0,,MI,basketball
49,Kanaan Carlyle,2023 - present,CG / 6-2 / 175,Freshman,-,95.55,Overtime Elite,"Alpharetta, GA",Stanford,2121 mi.,...,74.0,175.0,95.55,2121.0,69000.0,18600.0,634.0,3.0,GA,basketball
208,Hansel Enmanuel,,CG / 6-4 / 175,,,86.00,Life Academy,"Kissimmee, FL",Northwestern State,765 mi.,...,76.0,175.0,86.0,765.0,1200000.0,1600000.0,,,FL,basketball
3,Shaedon Sharpe,,SG / 6-4 / 175,,,99.52+,Dream City Christian,"Glendale, AZ",Kentucky,1577 mi.,...,76.0,175.0,99.52,1577.0,74000.0,,,,AZ,basketball
48,DaRon Holmes,,PF / 6-8 / 195,,-,95.77,Montverde Academy,"Montverde, FL",Dayton,785 mi.,...,80.0,195.0,95.77,785.0,91000.0,12400.0,4100.0,3000.0,FL,basketball
14,Cayden Boozer,,PG / 6-3 / 190,,,97.45,,,Florida,292 mi.,...,75.0,190.0,97.45,292.0,187000.0,40000.0,,15900.0,,basketball
24,Matthew Cleveland,2023 - present,CG / 6-6 / 190,Junior,-,97.77,Pace Academy,"Alpharetta, GA",Florida State,257 mi.,...,78.0,190.0,97.77,257.0,217000.0,13000.0,2900.0,45.0,GA,basketball
17,JJ Starling,2023 - present,SG / 6-4 / 200,Sophomore,-,98.34+,La Lumiere School,"Baldwinsville, NY",Notre Dame,515 mi.,...,76.0,200.0,98.34,515.0,117000.0,15800.0,,2000.0,NY,basketball
8,Alex Constanza,,SG / 6-8 / 175,,,98.05,,,Florida,278 mi.,...,80.0,175.0,98.05,278.0,142000.0,2900.0,278.0,,,basketball


In [78]:
df_schools = pd.read_excel("college_data/Schools.xlsx")
df_schools = df_schools[df_schools['Sports'].isin(['Basketball','Football'])]

In [79]:
df_schools.sample(10)

Unnamed: 0,unitid,OPEID,institution_name,addr1_txt,addr2_txt,city_txt,state_cd,zip_text,ClassificationCode,classification_name,...,EXP_MEN,EXP_WOMEN,TOTAL_EXP_MENWOMEN,EXP_COED_MEN,EXP_COED_WOMEN,TOTAL_EXP_COED,EXPENSE_MENALL,EXPENSE_WOMENALL,TOTAL_EXPENSE_ALL,Sports
15847,232089,371100,Ferrum College,215 Ferrum Mountain Road,,Ferrum,VA,240889000.0,6,NCAA Division III with football,...,167816.0,190244.0,358060.0,,,,167816,190244,358060,Basketball
9826,190600,269300,CUNY John Jay College of Criminal Justice,524 W 59th St,,New York,NY,10019.0,7,NCAA Division III without football,...,48154.0,47915.0,96069.0,,,,48154,47915,96069,Basketball
11522,199971,3570300,Carolina Christian College,4209 Indiana Avenue,,Winston Salem,NC,27105.0,8,Other,...,380548.0,325691.0,706239.0,,,,380548,325691,706239,Basketball
14375,219082,346300,Dakota State University,820 N Washington Ave,,Madison,SD,570421799.0,9,NAIA Division I,...,291766.0,307881.0,599647.0,,,,291766,307881,599647,Basketball
3135,137564,152100,Southeastern University,1000 Longfellow Blvd,,Lakeland,FL,338016034.0,9,NAIA Division I,...,1869226.0,,1869226.0,,,,1869226,0,1869226,Football
4472,150668,179900,Goshen College,1700 S Main St,,Goshen,IN,465264794.0,10,NAIA Division II,...,331265.0,438853.0,770118.0,,,,331265,438853,770118,Basketball
6475,164270,210900,McDaniel College,2 College Hill,,Westminster,MD,21157.0,6,NCAA Division III with football,...,152185.0,154801.0,306986.0,,,,152185,154801,306986,Basketball
12384,207263,316100,Northeastern State University,600 N Grand,,Tahlequah,OK,744642399.0,4,NCAA Division II with football,...,1597573.0,,1597573.0,,,,1597573,0,1597573,Football
104,101189,100300,Faulkner University,5345 Atlanta Hwy,,Montgomery,AL,361093390.0,9,NAIA Division I,...,1682989.0,,1682989.0,,,,1682989,0,1682989,Football
13737,216357,337600,Thiel College,75 College Ave,,Greenville,PA,161252181.0,6,NCAA Division III with football,...,132705.0,133919.0,266624.0,,,,132705,133919,266624,Basketball
