In [14]:
import numpy as np
import pandas as pd
import csv

In [2]:
# Read the raw football_2022 export into `df`; later cells derive cleaned
# numeric columns from its string fields.
raw_csv_path = "csv_files/football_2022.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,NUMOFF,INSTA,TWIT,TIK,NILVAL
0,Walter Nolen,2022 - present,DL / 6-4 / 345,Sophomore,19,99.59+,Powell,"Powell, TN",Texas A&M,803 mi.,25.0,21K,11.1K,515,$85K
1,Travis Hunter,2023 - present,CB / 6-1 / 165,Sophomore,20,99.56+,Collins Hill,"Suwanee, GA",Jackson State,375 mi.,17.0,997K,56K,713K,$2.2M
2,Luther Burden,2022 - present,WR / 5-11.5 / 198,Sophomore,19,98.80+,East St. Louis,"Saint Louis, MO",Missouri,111 mi.,25.0,46K,13.2K,10.3K,$429K
3,Sonny Styles,2022 - present,ATH / 6-4 / 205,Sophomore,18,98.73+,Pickerington Central,"Pickerington, OH",Ohio State,20 mi.,24.0,25K,9.1K,,$114K
4,Domani Jackson,2022 - present,CB / 6-1 / 181,Sophomore,20,98.57+,Mater Dei,"Santa Ana, CA",USC,27 mi.,25.0,29K,6.1K,375,$88K


In [3]:
# Summarise each raw column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118 entries, 0 to 2117
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   NAME         2118 non-null   object 
 1   EXP          1609 non-null   object 
 2   POS_HEI_WEI  2118 non-null   object 
 3   GRADE        1609 non-null   object 
 4   AGE          1709 non-null   object 
 5   SKILL        2115 non-null   object 
 6   HISCH        1816 non-null   object 
 7   HOTOWN       1816 non-null   object 
 8   STARCOLL     1937 non-null   object 
 9   COLLDIST     1913 non-null   object 
 10  NUMOFF       1913 non-null   float64
 11  INSTA        1094 non-null   object 
 12  TWIT         1352 non-null   object 
 13  TIK          590 non-null    object 
 14  NILVAL       283 non-null    object 
dtypes: float64(1), object(14)
memory usage: 248.3+ KB


In [4]:
#Most of our features have a large amount of missing data.
#Some of this is expected: not all athletes have Instagram, Twitter, or TikTok accounts, or NIL deals.
#First we should clean the available data into a more coherent structure.
#Our data contains 2 kinds of missing values, np.nan and '-'; we need to handle both and convert them to np.nan.
#In the AGE column for football 2022, one of the unique values is '1819'; this is likely an input error in the On3.com database.

In [5]:
# Derive numeric feature columns from the raw string columns.
# The raw data marks missing values with either NaN or the placeholder '-',
# so every parser below maps both to np.nan.

# Multipliers for the abbreviated magnitudes used in follower counts and NIL values.
_SUFFIX_MULTIPLIERS = {'K': 1_000, 'M': 1_000_000}


def _is_missing(value):
    """Return True for NaN/None or the '-' placeholder used in the raw data."""
    return pd.isna(value) or value == '-'


def _parse_abbreviated(value, prefix=''):
    """Convert an abbreviated number such as '21K', '2.2M', '515' or '$85K' to a float.

    Parameters
    ----------
    value : str or NaN
        Raw cell value.
    prefix : str, optional
        Leading marker (e.g. '$') to strip before parsing, if present.

    Returns
    -------
    float
        The expanded number, or np.nan when the value is missing.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if _is_missing(value):
        return np.nan
    text = value[len(prefix):] if prefix and value.startswith(prefix) else value
    # text[-1:] (not text[-1]) so an empty string reaches float() and raises ValueError.
    multiplier = _SUFFIX_MULTIPLIERS.get(text[-1:])
    if multiplier is None:
        # No K/M suffix: parse the whole number. Slicing off the last character
        # here would silently drop a digit (e.g. '$500' -> 50.0).
        return float(text)
    return float(text[:-1]) * multiplier


def _parse_height_inches(value):
    """Convert a 'feet-inches' string such as '6-4' or '5-11.5' to total inches."""
    if _is_missing(value):
        return np.nan
    parts = value.split('-')
    return float(parts[0]) * 12 + float(parts[1])


def _parse_float(value):
    """Convert a plain numeric string to float, or np.nan when missing."""
    if _is_missing(value):
        return np.nan
    return float(value)


def _parse_skill(value):
    """Convert a rating such as '99.59+' or '98.8' to float, ignoring one trailing '+'."""
    if _is_missing(value):
        return np.nan
    return float(value[:-1] if value.endswith('+') else value)


def _parse_distance_miles(value):
    """Convert a distance such as '803 mi.' to float using the leading number."""
    if _is_missing(value):
        return np.nan
    return float(value.split()[0])


# Split 'POS / HEIGHT / WEIGHT' once and reuse the pieces for all three columns.
pos_hei_wei = df['POS_HEI_WEI'].str.split(' / ')

df['POS'] = pos_hei_wei.str[0]
df['HEIGHT_IN'] = pos_hei_wei.str[1].map(_parse_height_inches)
df['WEIGHT_LBS'] = pos_hei_wei.str[2].map(_parse_float)
df['SKILL_'] = df['SKILL'].map(_parse_skill)
df['COLLDIST_MI'] = df['COLLDIST'].map(_parse_distance_miles)
df['NILVAL_LONG_USD'] = df['NILVAL'].map(lambda value: _parse_abbreviated(value, prefix='$'))
df['INSTA_LONG'] = df['INSTA'].map(_parse_abbreviated)
df['TWIT_LONG'] = df['TWIT'].map(_parse_abbreviated)
df['TIK_LONG'] = df['TIK'].map(_parse_abbreviated)


df.head(5)


Unnamed: 0,NAME,EXP,POS_HEI_WEI,GRADE,AGE,SKILL,HISCH,HOTOWN,STARCOLL,COLLDIST,...,NILVAL,POS,HEIGHT_IN,WEIGHT_LBS,SKILL_,COLLDIST_MI,NILVAL_LONG_USD,INSTA_LONG,TWIT_LONG,TIK_LONG
0,Walter Nolen,2022 - present,DL / 6-4 / 345,Sophomore,19,99.59+,Powell,"Powell, TN",Texas A&M,803 mi.,...,$85K,DL,76.0,345.0,99.59,803.0,85000.0,21000.0,11100.0,515.0
1,Travis Hunter,2023 - present,CB / 6-1 / 165,Sophomore,20,99.56+,Collins Hill,"Suwanee, GA",Jackson State,375 mi.,...,$2.2M,CB,73.0,165.0,99.56,375.0,2200000.0,997000.0,56000.0,713000.0
2,Luther Burden,2022 - present,WR / 5-11.5 / 198,Sophomore,19,98.80+,East St. Louis,"Saint Louis, MO",Missouri,111 mi.,...,$429K,WR,71.5,198.0,98.8,111.0,429000.0,46000.0,13200.0,10300.0
3,Sonny Styles,2022 - present,ATH / 6-4 / 205,Sophomore,18,98.73+,Pickerington Central,"Pickerington, OH",Ohio State,20 mi.,...,$114K,ATH,76.0,205.0,98.73,20.0,114000.0,25000.0,9100.0,
4,Domani Jackson,2022 - present,CB / 6-1 / 181,Sophomore,20,98.57+,Mater Dei,"Santa Ana, CA",USC,27 mi.,...,$88K,CB,73.0,181.0,98.57,27.0,88000.0,29000.0,6100.0,375.0


In [6]:
# Re-check dtypes and non-null counts now that the derived numeric columns have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118 entries, 0 to 2117
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NAME             2118 non-null   object 
 1   EXP              1609 non-null   object 
 2   POS_HEI_WEI      2118 non-null   object 
 3   GRADE            1609 non-null   object 
 4   AGE              1709 non-null   object 
 5   SKILL            2115 non-null   object 
 6   HISCH            1816 non-null   object 
 7   HOTOWN           1816 non-null   object 
 8   STARCOLL         1937 non-null   object 
 9   COLLDIST         1913 non-null   object 
 10  NUMOFF           1913 non-null   float64
 11  INSTA            1094 non-null   object 
 12  TWIT             1352 non-null   object 
 13  TIK              590 non-null    object 
 14  NILVAL           283 non-null    object 
 15  POS              2118 non-null   object 
 16  HEIGHT_IN        2096 non-null   float64
 17  WEIGHT_LBS    

In [7]:
# List the distinct raw AGE values to surface placeholders and malformed entries.
age_values = df['AGE'].unique()
print(age_values)

['19' '20' '18' '21' nan '-' '1819']
