In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Load the recruiting CSV; the path is relative to the kernel's working directory.
df = pd.read_csv('../data/alabama_recruiting.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,athleteId,recruitType,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,country,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
0,8263,,HighSchool,2006,1,Andre Smith,Huffman,Alabama,OG,76.0,325,5,0.9993,Birmingham,AL,USA,33.520682,-86.802433,1073
1,19572,-1000934.0,HighSchool,2011,2,Cyrus Kouandjio,DeMatha Catholic,Alabama,OT,79.0,315,5,0.9993,Hyattsville,MD,USA,38.952944,-76.940865,24033
2,71459,4431437.0,HighSchool,2021,2,JC Latham,IMG Academy,Alabama,OT,78.0,305,5,0.999,Bradenton,FL,USA,27.498928,-82.574819,12081
3,38628,4241457.0,HighSchool,2017,2,Najee Harris,Antioch,Alabama,RB,75.0,226,5,0.9984,Antioch,CA,USA,38.004921,-121.805789,6013
4,65896,,HighSchool,2001,2,Brodie Croyle,Westbrook Christian School,Alabama,PRO,74.0,185,5,0.9986,Rainbow City,AL,USA,33.897166,-86.10254,1055


In [4]:
# Enumerate the distinct raw position labels present in the data.
df.position.unique()

array(['OG', 'OT', 'RB', 'PRO', 'DUAL', 'WR', 'WDE', 'SDE', 'S', 'ILB',
       'CB', 'EDGE', 'ATH', 'OLB', 'DL', 'TE', 'DT', 'QB', 'APB', 'LB',
       'OC', 'FB', 'IOL', 'K', 'P', 'LS'], dtype=object)

In [5]:
# Raw position label -> simplified position group.
# Written as "group -> raw labels that collapse into it" and then inverted, so
# each group's members can be read on a single line.
position_map = {
    raw_label: group
    for group, raw_labels in {
        'QB': ['QB', 'DUAL', 'PRO'],
        'RB': ['RB', 'APB', 'FB'],
        'WR': ['WR'],
        'TE': ['TE'],
        'IOL': ['IOL', 'OG', 'OC'],
        'OT': ['OT'],
        'EDGE': ['EDGE', 'SDE', 'WDE'],
        'IDL': ['DT', 'DL'],
        'ILB': ['LB', 'ILB'],
        'OLB': ['OLB'],
        'CB': ['CB'],
        'S': ['S'],
        'ATH': ['ATH'],
        'K': ['K'],
        'LS': ['LS'],
        'P': ['P'],
    }.items()
    for raw_label in raw_labels
}

In [6]:
# Labels that appear in the data or in position_map but not in both.
# An empty set means the mapping covers exactly the labels in the data.
set(df['position'].unique()).symmetric_difference(position_map)

set()

In [7]:
# Strict lookup: an unmapped label raises KeyError instead of silently becoming NaN.
df['position_simple'] = df['position'].map(position_map.__getitem__)

In [8]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      520 non-null    int64  
 1   athleteId               319 non-null    float64
 2   recruitType             520 non-null    object 
 3   year                    520 non-null    int64  
 4   ranking                 520 non-null    int64  
 5   name                    520 non-null    object 
 6   school                  520 non-null    object 
 7   committedTo             520 non-null    object 
 8   position                520 non-null    object 
 9   height                  520 non-null    float64
 10  weight                  520 non-null    int64  
 11  stars                   520 non-null    int64  
 12  rating                  520 non-null    float64
 13  city                    520 non-null    object 
 14  stateProvince           520 non-null    ob

In [9]:
# Number of distinct id values; compare with the row count from df.info()
# to see whether id alone identifies a row.
len(df.id.unique())

520

In [10]:
# Count how many rows share each (name, school) pair; the count of non-null ids
# is stored as `count_dupes`.
dupe_counts = (
    df.groupby(['name', 'school'])['id']
    .agg(count_dupes='count')
    .reset_index()
)

In [11]:
# Attach the per-(name, school) row count to every recruit row.
# NOTE: re-running this cell without reloading df merges count_dupes a second
# time, producing suffixed columns instead of a single `count_dupes`.
df = df.merge(dupe_counts, on=['name', 'school'], how='left')

In [12]:
# Keep one row (the first encountered) for every (name, school) pair that occurs
# more than once. The filter is `>= 2` rather than `== 2` because the following
# cell rebuilds df only from the count == 1 rows plus this frame, so a pair
# appearing three or more times would otherwise be dropped from df entirely.
df_dupes = df[df['count_dupes'] >= 2].drop_duplicates(subset=['name', 'school'], keep='first')

In [13]:
# Rebuild df: rows whose (name, school) pair is unique, followed by the single
# retained row for each duplicated pair.
df = pd.concat([df.loc[df['count_dupes'].eq(1)], df_dupes])

In [14]:
# Simplified position groups belonging to each unit. The order inside each list
# is significant: classify_order_side returns a position's index within its list.
side = dict(
    offense=['QB', 'RB', 'WR', 'TE', 'OT', 'IOL'],
    defense=['ILB', 'OLB', 'CB', 'S', 'EDGE', 'IDL'],
    special_teams_or_athlete=['K', 'P', 'LS', 'ATH'],
)
def classify_side(simple_position, side):
    """Return the name of the unit whose position list contains ``simple_position``.

    Parameters
    ----------
    simple_position : str
        Simplified position label to look up.
    side : dict
        Mapping of unit name -> list of simplified position labels.

    Returns
    -------
    The first unit name (in dict order) whose list contains the position,
    or ``None`` when no unit lists it.
    """
    return next(
        (unit for unit, positions in side.items() if simple_position in positions),
        None,
    )

In [15]:
# Tag each recruit with the unit that lists their simplified position;
# positions not listed in `side` get None.
df['side'] = df['position_simple'].apply(classify_side, args=(side,))

In [16]:
# Recruit counts per (position group, side) row and per year column;
# combinations with no recruits are filled with 0.
composition = df.pivot_table(
    values='id',
    index=['position_simple', 'side'],
    columns=['year'],
    aggfunc='count',
    fill_value=0,
).reset_index()

In [17]:
# Summed recruit rating per (position group, side) row and per year column.
# NOTE(review): unlike `composition`, no fill_value is passed here, so
# combinations with no recruits stay NaN rather than 0 - confirm that is intended.
composition_score = df.pivot_table(
    values='rating',
    index=['position_simple', 'side'],
    columns=['year'],
    aggfunc='sum',
).reset_index()

In [18]:
# Display the recruit-count table (rows: position group and side; columns: year).
composition

year,position_simple,side,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ATH,special_teams_or_athlete,0,0,0,3,2,2,1,0,...,2,1,2,1,0,0,2,1,4,1
1,CB,defense,0,0,2,1,1,1,3,1,...,2,2,4,1,4,4,1,2,3,2
2,EDGE,defense,4,0,0,1,1,5,1,2,...,1,3,2,2,4,5,2,3,2,3
3,IDL,defense,1,0,1,0,2,0,0,3,...,4,2,3,1,1,3,3,3,4,4
4,ILB,defense,0,0,2,0,0,2,3,2,...,1,0,2,1,2,2,2,2,1,0
5,IOL,offense,0,0,0,1,2,5,3,3,...,3,4,2,2,2,2,1,3,0,1
6,K,special_teams_or_athlete,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
7,LS,special_teams_or_athlete,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,OLB,defense,1,0,1,1,1,1,2,2,...,3,3,1,3,0,0,1,1,0,0
9,OT,offense,1,0,0,2,0,0,2,0,...,2,1,2,2,0,3,2,2,3,4


In [19]:
def get_comp_pct_by_class(composition):
    """Convert per-year counts into each row's percentage of that year's total.

    Parameters
    ----------
    composition : pandas.DataFrame
        Frame whose first two columns are labels (left untouched) and whose
        remaining columns each hold counts for one class year.

    Returns
    -------
    pandas.DataFrame
        A copy of ``composition`` in which every count column is replaced by
        integer percentages (rounded) of that column's total. A column whose
        total is zero becomes all zeros instead of raising on NaN-to-int
        conversion. The input frame is not modified.
    """
    composition_pct = composition.copy()

    # The first two columns are labels; everything after them is a class year.
    # Totals are taken per count column only, so the string label columns are
    # never summed.
    for col in composition_pct.columns[2:]:
        class_size = composition_pct[col].sum()
        if class_size == 0:
            # An empty class has no meaningful split; report 0% for every row
            # rather than dividing by zero.
            composition_pct[col] = 0
            continue

        composition_pct[col] = np.round((composition_pct[col] / class_size) * 100).astype('int')
    return composition_pct

In [20]:
def get_total_comp_pct(composition):
    """Add each row's total count and its percentage of the overall total.

    Parameters
    ----------
    composition : pandas.DataFrame
        Frame whose first two columns are labels and whose remaining columns
        hold counts. Existing ``total`` / ``total_pct`` columns (for example
        from an earlier call) are treated as derived output and ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of ``composition`` with two columns added or overwritten:
        ``total`` (row sum across the count columns) and ``total_pct``
        (rounded integer percentage of the sum of all counts). When the sum of
        all counts is zero, ``total_pct`` is 0 for every row instead of
        raising on NaN-to-int conversion. The input frame is not modified.
    """
    composition_pct = composition.copy()

    # Count columns: everything after the two label columns, minus the derived
    # columns this function writes. Using the same set for the row totals and
    # for the denominator keeps the two consistent when re-run on its own output.
    count_cols = [
        col for col in composition_pct.columns[2:]
        if col not in ('total', 'total_pct')
    ]

    row_totals = composition_pct[count_cols].sum(axis=1)
    class_sizes = row_totals.sum()

    composition_pct['total'] = row_totals
    if class_sizes == 0:
        # Nothing to apportion; avoid 0/0 -> NaN -> failed integer cast.
        composition_pct['total_pct'] = 0
    else:
        composition_pct['total_pct'] = np.round((row_totals / class_sizes) * 100).astype('int')
    return composition_pct

## Nick Saban

In [21]:
# Restrict the count table to the 2021-2023 classes, then add per-position
# totals and each position's percentage of all recruits in those classes.
saban = get_total_comp_pct(
    composition.loc[:, ['position_simple', 'side', 2021, 2022, 2023]]
)

In [22]:
def classify_order_side(simple_position, side):
    """Return the index of ``simple_position`` within its unit's position list.

    Parameters
    ----------
    simple_position : str
        Simplified position label to look up.
    side : dict
        Mapping of unit name -> list of simplified position labels.

    Returns
    -------
    The zero-based index of the position inside the first unit list (in dict
    order) that contains it, or ``None`` when no unit lists it.
    """
    for positions in side.values():
        if simple_position in positions:
            return positions.index(simple_position)
    return None
        
# Sort key for plotting: each position's index within its unit's list in `side`.
saban['order_side'] = saban['position_simple'].apply(classify_order_side, args=(side,))

In [27]:
# Radar chart of total_pct for each offensive position group, ordered by order_side.
offense_rows = saban[saban['side'] == 'offense'].sort_values(by='order_side')

# The radial axis limit is taken over offense AND defense rows, so this chart
# and the defense chart are drawn on the same scale.
scrimmage_pct = saban[saban['side'].isin(['defense', 'offense'])]['total_pct']

fig = px.line_polar(
    offense_rows,
    r='total_pct',
    theta='position_simple',
    line_close=True,
    range_r=[0, max(scrimmage_pct) + 1],
    color_discrete_sequence=['#9e1b32'],
)

fig.update_traces(fill='toself')
fig.update_layout(
    # Fully transparent plot and paper backgrounds.
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    autosize=False,
    width=650,
    height=650,
    font_size=24,
    font_family='Helvetica',
)
fig.write_image('../images/alabama_recruiting/saban_offense.png')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [28]:
# Radar chart of total_pct for each defensive position group, ordered by order_side.
defense_rows = saban[saban['side'] == 'defense'].sort_values(by='order_side')

# The radial axis limit is taken over offense AND defense rows, so this chart
# and the offense chart are drawn on the same scale.
scrimmage_pct = saban[saban['side'].isin(['defense', 'offense'])]['total_pct']

fig = px.line_polar(
    defense_rows,
    r='total_pct',
    theta='position_simple',
    line_close=True,
    range_r=[0, max(scrimmage_pct) + 1],
    color_discrete_sequence=['#9e1b32'],
)

fig.update_traces(fill='toself')
fig.update_layout(
    # Fully transparent plot and paper backgrounds.
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    autosize=False,
    width=650,
    height=650,
    font_size=24,
    font_family='Helvetica',
)
fig.write_image('../images/alabama_recruiting/saban_defense.png')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [25]:
# Combined total_pct of the special-teams / athlete rows, which appear in
# neither radar chart.
saban.loc[saban['side'] == 'special_teams_or_athlete', 'total_pct'].sum()

9