In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Load the recruiting table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/georgia_recruiting.csv')

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,id,athleteId,recruitType,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,country,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
0,31860,,HighSchool,2015,1,Trent Thompson,Westover,Georgia,DT,74.5,313,5,0.9991,Albany,GA,USA,31.578206,-84.155681,13095.0
1,66928,3915192.0,HighSchool,2015,1,Trenton Thompson,Westover,Georgia,DT,74.0,313,5,0.9992,Albany,GA,USA,31.578206,-84.155681,13095.0
2,46797,4426331.0,HighSchool,2019,1,Nolan Smith,IMG Academy,Georgia,WDE,75.0,227,5,0.9994,Bradenton,FL,USA,27.498928,-82.574819,12081.0
3,42909,4362887.0,HighSchool,2018,2,Justin Fields,Harrison,Georgia,DUAL,75.0,221,5,0.9998,Kennesaw,GA,USA,34.023434,-84.61549,13067.0
4,61572,4428992.0,HighSchool,2020,4,Kelee Ringo,Saguaro,Georgia,CB,74.0,205,5,0.9976,Scottsdale,AZ,USA,33.494219,-111.926018,4013.0


In [4]:
# List the distinct raw position codes present in the data.
df.position.unique()

array(['DT', 'WDE', 'DUAL', 'CB', 'DL', 'PRO', 'RB', 'OT', 'WR', 'OG',
       'ATH', 'APB', 'EDGE', 'TE', 'S', 'SDE', 'OLB', 'ILB', 'OC', 'IOL',
       'LB', 'QB', 'FB', 'K', 'P', 'LS'], dtype=object)

In [5]:
# Map each raw `position` code to a simplified position group.
# The inner dict is keyed by simplified group; each tuple lists the raw codes
# folded into that group. Flattening it yields {raw_code: group}.
position_map = {
    raw_code: group
    for group, raw_codes in {
        'QB': ('QB', 'DUAL', 'PRO'),
        'RB': ('RB', 'APB', 'FB'),
        'WR': ('WR',),
        'TE': ('TE',),
        'IOL': ('IOL', 'OG', 'OC'),
        'OT': ('OT',),
        'EDGE': ('EDGE', 'SDE', 'WDE'),
        'IDL': ('DT', 'DL'),
        'ILB': ('LB', 'ILB'),
        'OLB': ('OLB',),
        'CB': ('CB',),
        'S': ('S',),
        'ATH': ('ATH',),
        'K': ('K',),
        'LS': ('LS',),
        'P': ('P',),
    }.items()
    for raw_code in raw_codes
}

In [6]:
# Symmetric difference between the codes in the data and the codes in the mapping:
# an empty set means neither side has a code the other lacks.
set(df.position.unique()) ^ set(position_map.keys())

set()

In [7]:
# Translate every raw position code into its simplified group.
# Direct indexing (not .get) means a code missing from position_map raises KeyError.
df['position_simple'] = df['position'].apply(lambda x: position_map[x])

In [8]:
# Inspect dtypes and non-null counts for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      517 non-null    int64  
 1   athleteId               321 non-null    float64
 2   recruitType             517 non-null    object 
 3   year                    517 non-null    int64  
 4   ranking                 517 non-null    int64  
 5   name                    517 non-null    object 
 6   school                  517 non-null    object 
 7   committedTo             517 non-null    object 
 8   position                517 non-null    object 
 9   height                  517 non-null    float64
 10  weight                  517 non-null    int64  
 11  stars                   517 non-null    int64  
 12  rating                  517 non-null    float64
 13  city                    517 non-null    object 
 14  stateProvince           517 non-null    ob

In [9]:
# Number of distinct ids; compare with the row count to see whether `id` is unique per row.
len(df.id.unique())

517

In [10]:
# Count how many rows share each (name, school) pair; the count is stored as `count_dupes`.
dupe_counts = df.groupby(['name', 'school'])['id'].count().reset_index(name='count_dupes')

In [11]:
# Attach each row's (name, school) group size as the `count_dupes` column.
# NOTE(review): this reassigns `df`, so re-running the cell on the merged frame would
# presumably produce suffixed duplicate count columns - run once per fresh load.
df = pd.merge(df, dupe_counts, how = 'left', on=['name', 'school'])

In [12]:
# Keep the first row of every (name, school) pair that appears more than once.
# The filter is >= 2 rather than == 2: the next step rebuilds `df` from the
# count_dupes == 1 rows plus `df_dupes`, so an == 2 filter would silently drop
# every recruit that appears three or more times.
df_dupes = df[df['count_dupes'] >= 2].drop_duplicates(subset=['name', 'school'], keep='first')

In [13]:
# Rebuild `df` from the rows whose (name, school) pair occurs once plus the
# representative rows retained in `df_dupes`. Original index labels are kept.
df = pd.concat([df[df['count_dupes']==1], df_dupes])

In [14]:
# Simplified position groups belonging to each unit. List order matters:
# a group's index within its list is later used as a sort key.
side = {
    'offense': ['QB', 'RB', 'WR', 'TE', 'OT', 'IOL'],
    'defense': ['ILB', 'OLB', 'CB', 'S', 'EDGE', 'IDL'],
    'special_teams_or_athlete': ['K', 'P', 'LS', 'ATH'],
}


def classify_side(simple_position, side):
    """Return the unit whose position list contains ``simple_position``.

    Parameters
    ----------
    simple_position : simplified position group to look up.
    side : mapping of unit name -> collection of position groups.

    Returns
    -------
    The first unit (in mapping order) that lists the position, or ``None``
    when no unit lists it.
    """
    matching_units = (
        unit for unit, positions in side.items() if simple_position in positions
    )
    return next(matching_units, None)

In [15]:
# Label each row with the unit that lists its simplified position
# (classify_side returns None for a position no unit lists).
df['side'] = df['position_simple'].apply(lambda x: classify_side(x, side))

In [16]:
# Recruit counts per (position group, unit) row and per year column;
# combinations with no recruits are filled with 0.
composition = pd.pivot_table(
    df,
    values='id',
    index=['position_simple', 'side'],
    columns=['year'],
    aggfunc='count',
    fill_value=0,
).reset_index()

In [17]:
# Summed `rating` per (position group, unit) row and per year column.
# NOTE(review): unlike the count table, no fill_value is passed here, so
# combinations with no recruits presumably stay NaN - confirm that is intended.
composition_score = pd.pivot_table(
    df,
    values='rating',
    index=['position_simple', 'side'],
    columns=['year'],
    aggfunc='sum',
).reset_index()

In [18]:
# Display the per-year recruit count table.
composition

year,position_simple,side,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ATH,special_teams_or_athlete,0,0,3,0,4,0,0,0,...,0,2,2,0,0,1,1,1,1,0
1,CB,defense,0,0,3,1,0,2,2,1,...,2,3,2,5,4,1,3,3,3,5
2,EDGE,defense,0,0,4,1,4,2,3,2,...,3,5,2,2,2,3,0,1,3,3
3,IDL,defense,0,0,1,0,0,3,1,0,...,1,4,3,1,2,2,4,2,4,2
4,ILB,defense,0,0,0,3,1,1,1,1,...,1,1,1,2,1,2,0,1,3,3
5,IOL,offense,0,0,2,4,0,1,4,2,...,2,1,0,2,3,1,1,2,2,2
6,K,special_teams_or_athlete,0,0,0,1,0,0,0,0,...,0,1,1,1,0,0,1,0,0,1
7,LS,special_teams_or_athlete,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,OLB,defense,0,1,1,3,1,2,2,2,...,0,2,0,2,3,1,2,3,0,0
9,OT,offense,0,0,4,3,2,0,4,2,...,2,3,3,2,3,2,5,2,3,3


In [19]:
def get_comp_pct_by_class(composition):
    """Convert per-class counts into each row's percentage of its class.

    Parameters
    ----------
    composition : pandas.DataFrame
        Table whose first two columns are labels and whose remaining columns
        each hold the values for one class.

    Returns
    -------
    pandas.DataFrame
        A copy of ``composition`` in which every value column is replaced by
        the row's share of that column's total, rounded to a whole-number
        percentage. The two label columns are left untouched and the input
        frame is not modified.

    Notes
    -----
    A column whose total is 0 yields 0 for every row. Dividing by that total
    would give NaN, which cannot be cast to int.
    """
    composition_pct = composition.copy()
    # The first two columns are labels; everything after them is a class.
    for col in composition_pct.columns[2:]:
        class_size = composition_pct[col].sum()
        if class_size == 0:
            # Empty class: every row's share is defined as 0%.
            composition_pct[col] = 0
            continue
        composition_pct[col] = np.round((composition_pct[col] / class_size) * 100).astype('int')
    return composition_pct

In [20]:
def get_total_comp_pct(composition):
    """Add each row's total across classes and its share of the grand total.

    Parameters
    ----------
    composition : pandas.DataFrame
        Table whose first two columns are labels and whose remaining columns
        each hold the values for one class.

    Returns
    -------
    pandas.DataFrame
        A copy of ``composition`` with two extra columns: ``total`` (row sum
        over the class columns) and ``total_pct`` (the row's share of the sum
        of all totals, rounded to a whole-number percentage). The input frame
        is not modified.

    Notes
    -----
    * Existing ``total`` / ``total_pct`` columns are recomputed and never
      counted as classes, so calling the function on its own output returns
      the same numbers.
    * When the grand total is 0, ``total_pct`` is 0 for every row, because
      the NaN from dividing by zero cannot be cast to int.
    * Missing cells are skipped by the row sum, i.e. treated as zero.
    """
    composition_pct = composition.copy()
    derived_cols = ('total', 'total_pct')
    # Skip the two label columns and anything this function itself produces.
    class_cols = [col for col in composition_pct.columns[2:] if col not in derived_cols]

    composition_pct['total'] = composition_pct[class_cols].sum(axis=1)
    grand_total = composition_pct['total'].sum()
    if grand_total == 0:
        composition_pct['total_pct'] = 0
    else:
        composition_pct['total_pct'] = np.round(
            (composition_pct['total'] / grand_total) * 100
        ).astype('int')
    return composition_pct

## Kirby Smart

In [21]:
# Restrict the count table to the 2021-2023 classes, then add each position
# group's combined total and its percentage share.
kirby = get_total_comp_pct(
    composition[['position_simple', 'side', 2021, 2022, 2023]]
)

In [22]:
def classify_order_side(simple_position, side):
    """Return the index of ``simple_position`` inside its unit's position list.

    Parameters
    ----------
    simple_position : simplified position group to look up.
    side : mapping of unit name -> ordered sequence of position groups.

    Returns
    -------
    The zero-based index within the first unit (in mapping order) that lists
    the position, or ``None`` when no unit lists it.
    """
    for positions in side.values():
        if simple_position in positions:
            return positions.index(simple_position)
    return None
        
# Index of each position group within its unit's list in `side`; used as a sort key.
kirby['order_side'] = kirby['position_simple'].apply(lambda x: classify_order_side(x, side))

In [29]:
# Polar (radar) chart of `total_pct` for the offensive position groups,
# ordered by their index in the `side` lists.
offense_rows = kirby.loc[kirby['side'].isin(['offense'])].sort_values(by='order_side')

# Radial limit is taken over offense AND defense rows, so it does not depend
# on which unit is being plotted.
radial_limit = max(kirby.loc[kirby['side'].isin(['defense', 'offense']), 'total_pct']) + 1

fig = px.line_polar(
    offense_rows,
    r='total_pct',
    theta='position_simple',
    line_close=True,
    range_r=[0, radial_limit],
    color_discrete_sequence=['#BA0C2F'],
)

# Fill the enclosed polygon and use a transparent, fixed-size canvas.
fig.update_traces(fill='toself')
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    'autosize': False,
    'width': 650,
    'height': 650,
    'font_size': 24,
    'font_family': 'Helvetica',
})
fig.write_image('../images/georgia_recruiting/kirby_offense.png')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [30]:
# Polar (radar) chart of `total_pct` for the defensive position groups,
# ordered by their index in the `side` lists.
defense_rows = kirby.loc[kirby['side'].isin(['defense'])].sort_values(by='order_side')

# Radial limit is taken over offense AND defense rows, so it does not depend
# on which unit is being plotted.
radial_limit = max(kirby.loc[kirby['side'].isin(['defense', 'offense']), 'total_pct']) + 1

fig = px.line_polar(
    defense_rows,
    r='total_pct',
    theta='position_simple',
    line_close=True,
    range_r=[0, radial_limit],
    color_discrete_sequence=['#BA0C2F'],
)

# Fill the enclosed polygon and use a transparent, fixed-size canvas.
fig.update_traces(fill='toself')
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    'autosize': False,
    'width': 650,
    'height': 650,
    'font_size': 24,
    'font_family': 'Helvetica',
})
fig.write_image('../images/georgia_recruiting/kirby_defense.png')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [28]:
# Combined `total_pct` of the special-teams / athlete rows (not shown on the radar charts).
kirby.loc[kirby['side'].isin(['special_teams_or_athlete']), 'total_pct'].sum()

5