# Adding combine data to college data
We iterated on this a few times before realizing that a few quarterbacks who went to the NFL Scouting Combine don't show up in the college data we're using from ESPN. Those QBs didn't play at FBS schools, but some of them had productive careers in the NFL and some of them flopped, so we'll add them to the dataset.

In [68]:
import pandas as pd
import numpy as np

In [69]:
main_df = pd.read_csv('dataset_pre_combine.csv')

In [70]:
main_df.head()

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,RAW_4,QBR_Diff,PAA_Diff,PLAYS_Diff,EPA_Diff,PASS_Diff,RUN_Diff,SACK_Diff,PEN_Diff,RAW_Diff
0,A.J. Doyle,0.0,0.0,21.292818,22.0,-26.170233,-32.7,317.964741,321.0,-12.825698,...,25.8,0.707182,-6.529767,3.035259,-1.474302,-0.122754,0.072422,0.091483,0.043697,0.814387
1,A.J. Erdely,0.0,0.0,49.5,46.1,24.4,7.8,486.0,293.0,40.1,...,56.9,-3.4,-16.6,-193.0,-21.8,-16.1,-14.2,8.5,-0.1,-5.9
2,AJ Bush Jr.,1.0,1.0,51.005977,52.7,0.320125,0.4,372.444681,376.0,12.556627,...,50.3,1.694023,0.079875,3.555319,1.443373,0.438406,-0.174606,0.140083,0.067973,1.58774
3,AJ Mayer,0.0,0.0,57.00668,58.9,3.201252,4.0,60.423206,61.0,5.471102,...,66.6,1.89332,0.798748,0.576794,0.628898,0.508551,-0.017857,0.038594,0.063117,2.102256
4,AJ McCarron,1.0,1.0,84.9,82.5,47.3,43.2,387.0,404.0,49.6,...,75.2,-2.4,-4.1,17.0,0.8,-1.8,0.8,2.9,-1.1,-2.9


# Bringing in combine data
We'll add heights, weights, and 40-yard dash times measured at the NFL Scouting Combine for each QB.<br>
Not all QBs are invited to the combine. We'll need to take a look at the players who aren't in the combine data. If any of them played more than a game or two in the NFL, we'll need to keep them in the main dataset, manually fill in their heights and weights, and then probably fill in their missing 40-yard dash values with the mean.<br>
We'll thin out the herd this way. Most of the QBs who played in college weren't invited to the combine and were never candidates for the NFL, and they would have flooded the dataset with low values for the dependent variable.

In [71]:
import nfl_data_py as nfl

In [72]:
# Combine seasons to pull: 2004 through 2025 (np.arange excludes the stop value, 2026).
years = list(np.arange(2004, 2026))

In [73]:
# Load combine results for the seasons in `years`. The ['QB'] argument is
# presumably a position filter (the preview below shows only pos == QB) —
# confirm against the nfl_data_py documentation.
combine_df = nfl.import_combine_data(years, ['QB'])

In [74]:
combine_names = list(combine_df['player_name'])

In [75]:
main_names = list(main_df['Name'])

In [76]:
len(combine_names), len(main_names)

(395, 1124)

In [77]:
combine_df.head()

Unnamed: 0,season,draft_year,draft_team,draft_round,draft_ovr,pfr_id,cfb_id,player_name,pos,school,ht,wt,forty,bench,vertical,broad_jump,cone,shuttle
1328,2004,,,,,,casey-clausen-1,Casey Clausen,QB,Tennessee,6-3,223.0,4.95,,,,,
1368,2004,,,,,,jason-fife-1,Jason Fife,QB,Oregon,6-4,226.0,4.83,,30.5,113.0,7.28,4.18
1394,2004,2004.0,Baltimore Ravens,6.0,187.0,,josh-harris-2,Josh Harris,QB,Bowling Green,6-1,238.0,4.78,,31.0,115.0,,
1429,2004,,,,,,,Robert Kent,QB,Jackson State,6-4,222.0,4.87,,26.5,110.0,7.69,4.22
1435,2004,2004.0,Chicago Bears,5.0,148.0,KrenCr00,craig-krenzel-1,Craig Krenzel,QB,Ohio State,6-3,228.0,4.84,,28.0,112.0,7.22,4.08


In [78]:
# Names that appear in only one of the two sources. Matching is on the exact
# string, so spelling variants (e.g. "AJ" vs "A.J.") count as different names.
main_not_in_combine = list(set(main_names) - set(combine_names))
combine_not_in_main = list(set(combine_names) - set(main_names))

In [79]:
len(main_not_in_combine), len(combine_not_in_main)

(825, 96)

In [80]:
combine_not_in_main

['E.J. Manuel',
 'Jaxson Dart',
 'Chase Holbrook',
 'Shedeur Sanders',
 'Josh Harris',
 'A.J. McCarron',
 'Dillon Gabriel',
 'Ingle Martin',
 'Tom Brandstater',
 'EJ Perry',
 'Josh Woodrum',
 'Ryan Fitzpatrick',
 'Kyle Lauletta',
 'Ell Roberson',
 'Matt Gutierrez',
 'Andre Woodson',
 'Casey Clausen',
 'Seth Henigan',
 'Quinn Ewers',
 'Joe Flacco',
 'Max Brosmer',
 'Nathan Enderle',
 'Adrian McPherson',
 'Luke McCown',
 'Cam Ward',
 'Matt Mauck',
 'Jared Allen',
 'Tarvaris Jackson',
 'David Moore',
 'Brady Davis',
 'Eli Manning',
 'Jalen Milroe',
 'Josh Portis',
 'Dustin Vaughan',
 'B.J. Coleman',
 'Rod Rutherford',
 'J.P. Losman',
 'Jason Fife',
 'Trey Lance',
 'Ryan Colburn',
 'Kyle McCord',
 'Ben Roethlisberger',
 'Easton Stick',
 'Bruce Eugene',
 'Pat Devlin',
 'Josh Haldi',
 'Kevin Davidson',
 'Erik Meyer',
 'Aaron Corp',
 'Brad Sorensen',
 'Cody Pickett',
 'Barrick Nealy',
 'Ben Dougherty',
 'Colby Cameron',
 'Kurtis Rourke',
 'Travis Lulay',
 'Tyler Shough',
 'Nathan Brown',
 'Br

In [81]:
# Combine participants with no row in main_df that are added back in later
# (they become the rows of append_df).
combine_names_to_keep = [
    'Joe Flacco',
    'Ryan Fitzpatrick',
    'Luke McCown',
    'Tarvaris Jackson',
    'Trey Lance',
    'Easton Stick',
    'John Skelton',
    'Colin Kaepernick',
    'Colt Brennan',
    'Jimmy Garoppolo',
    'Josh Johnson',
    'Carson Wentz',
]

In [82]:
# Players in main_df with no matching combine name that are kept anyway: they
# are subtracted from the drop set further down. Spellings follow main_df
# (e.g. 'AJ McCarron'), not the combine data.
names_to_keep = [
    'AJ McCarron',
    'EJ Manuel',
    'Nick Mullens',
    'Trevor Siemian',
    'Kyle Allen',
    'Taysom Hill',
    'Tyler Huntley',
    'Matt Moore',
    'David Blough',
    'Terrelle Pryor Sr.',
    'Taylor Heinicke',
    'John Wolford',
    'PJ Walker',
    'Tommy DeVito',
]

In [83]:
main_not_in_combine = set(main_not_in_combine)
names_to_keep = set(names_to_keep)

In [84]:
drop_players = main_not_in_combine.difference(names_to_keep)

In [85]:
len(drop_players)

811

In [86]:
# Keep only the rows whose Name is not in drop_players. The explicit .copy()
# makes main_df an independent frame rather than a slice of the original, so
# later column assignments (main_df['Name'] = ...) do not raise pandas'
# SettingWithCopyWarning.
main_df = main_df[~main_df['Name'].isin(drop_players)].copy()

In [87]:
main_df['Name'].nunique()

313

In [88]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313 entries, 4 to 1119
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        313 non-null    object 
 1   Power 5_3   313 non-null    float64
 2   Power 5_4   313 non-null    float64
 3   QBR_3       313 non-null    float64
 4   QBR_4       313 non-null    float64
 5   PAA_3       313 non-null    float64
 6   PAA_4       313 non-null    float64
 7   PLAYS_3     313 non-null    float64
 8   PLAYS_4     313 non-null    float64
 9   EPA_3       313 non-null    float64
 10  EPA_4       313 non-null    float64
 11  PASS_3      313 non-null    float64
 12  PASS_4      313 non-null    float64
 13  RUN_3       313 non-null    float64
 14  RUN_4       313 non-null    float64
 15  SACK_3      313 non-null    float64
 16  SACK_4      313 non-null    float64
 17  PEN_3       313 non-null    float64
 18  PEN_4       313 non-null    float64
 19  RAW_3       313 non-null    float

In [89]:
# Respell these two names the way the combine data has them ('A.J. McCarron',
# 'E.J. Manuel') so the later merge on 'Name' can match them.
main_df['Name'] = main_df['Name'].replace({'AJ McCarron': 'A.J. McCarron', 'EJ Manuel': 'E.J. Manuel'})

In [90]:
combine_df = combine_df[['season', 'player_name', 'ht', 'wt', 'forty', 'vertical', 'broad_jump', 'cone', 'shuttle']]

In [91]:
# Rename the player column to 'Name' (the name column used in main_df) and
# capitalize 'season' to 'Season'.
combine_df = combine_df.rename(columns = {'season': 'Season', 'player_name': 'Name'})

# Adding non-FBS QBs who participated in combine
Here is where we need to append these rows to the main dataframe. We'll fill empty college stats with means.

In [92]:
main_df_cols = list(main_df.columns)

In [93]:
append_df = pd.DataFrame()

In [94]:
append_df['Name'] = combine_names_to_keep

In [95]:
# Append the combine-only QBs. append_df has just a 'Name' column, so every
# other column is NaN for these new rows at this point.
main_df = pd.concat([main_df, append_df])

In [96]:
main_df.head()

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,RAW_4,QBR_Diff,PAA_Diff,PLAYS_Diff,EPA_Diff,PASS_Diff,RUN_Diff,SACK_Diff,PEN_Diff,RAW_Diff
4,A.J. McCarron,1.0,1.0,84.9,82.5,47.3,43.2,387.0,404.0,49.6,...,75.2,-2.4,-4.1,17.0,0.8,-1.8,0.8,2.9,-1.1,-2.9
6,Aaron Murray,1.0,1.0,80.9,88.1,51.1,59.1,503.0,443.0,61.3,...,80.0,7.2,8.0,-60.0,9.5,-10.1,11.7,4.8,3.6,5.8
9,Aaron Rodgers,1.0,1.0,72.685937,75.1,23.609231,29.5,418.009721,422.0,37.669882,...,67.4,2.414063,5.890769,3.990279,4.330118,4.340223,-0.063493,0.202977,0.019421,2.127508
17,Aidan O'Connell,1.0,1.0,85.7,71.4,65.7,24.1,496.0,576.0,77.3,...,60.7,-14.3,-41.6,80.0,-31.8,-32.2,-2.1,0.9,1.8,-19.1
20,Alex Brink,1.0,1.0,63.7,70.5,6.5,24.0,506.0,609.0,26.9,...,60.1,6.8,17.5,103.0,17.3,-5.7,14.3,5.8,2.9,6.8


In [97]:
# Left-join the combine measurements onto main_df by player name; players with
# no combine row keep NaN in the combine columns.
# NOTE(review): the join key is the name alone, so two combine entries sharing
# a name would duplicate that player's row — confirm names are unique in
# combine_df.
main_df = pd.merge(main_df, combine_df, on='Name', how='left')

In [98]:
def convert_height(height_str):
    """Convert a 'feet-inches' height string such as '6-3' to total inches.

    Parameters
    ----------
    height_str : str or any
        Height written as '<feet>-<inches>'. Anything that is not a string
        (None, NaN, a float, ...) is treated as missing.

    Returns
    -------
    int or None
        feet * 12 + inches, or None when the value is missing or is not in
        the expected '<feet>-<inches>' form.
    """
    # Missing values and already-numeric values carry no '-' to split on.
    if not isinstance(height_str, str):
        return None
    try:
        feet, inches = height_str.split('-')
        return int(feet) * 12 + int(inches)
    except ValueError:
        # Wrong number of '-' separated parts, or a part that is not an integer.
        return None

In [99]:
main_df['ht'] = main_df['ht'].apply(convert_height)

In [100]:
missing_combine = main_df[main_df['Season'].isna()]

In [101]:
missing_combine_names = list(missing_combine['Name'])

In [102]:
missing_combine_names

['David Blough',
 'John Wolford',
 'Kyle Allen',
 'Matt Moore',
 'Nick Mullens',
 'PJ Walker',
 'Taylor Heinicke',
 'Taysom Hill',
 'Terrelle Pryor Sr.',
 'Tommy DeVito',
 'Trevor Siemian',
 'Tyler Huntley']

In [103]:
# Hand-entered values for the QBs that have no combine row. Each list is
# positional: entry i belongs to missing_combine_names[i], so the order here
# must follow the order of that list.
# Heights are in inches, the same unit convert_height produces for 'ht';
# weights are presumably pounds like the combine 'wt' column — TODO confirm.
missing_combine_seasons = [2019, 2020, 2018, 2007, 2018, 2020, 2017, 2017, 2011, 2023, 2015, 2020]
missing_combine_heights = [73, 73, 75, 75, 72, 71, 73, 74, 76, 74, 75, 73]
missing_combine_weights = [207, 218, 210, 219, 210, 215, 210, 221, 228, 210, 220, 205]

In [104]:
# Build one [season, height, weight] list per player in missing_combine_names.
# The hand-entered lists are matched to the names purely by position, so fail
# loudly if any list is a different length from the names instead of silently
# dropping or mis-pairing values.
_fill_columns = (missing_combine_seasons, missing_combine_heights, missing_combine_weights)
if any(len(column) != len(missing_combine_names) for column in _fill_columns):
    raise ValueError(
        'missing_combine_seasons/heights/weights must each have one entry '
        'per name in missing_combine_names'
    )
fill_lists = [list(values) for values in zip(*_fill_columns)]

In [105]:
fill_dict = dict(zip(missing_combine_names, fill_lists))

In [106]:
fill_dict

{'David Blough': [2019, 73, 207],
 'John Wolford': [2020, 73, 218],
 'Kyle Allen': [2018, 75, 210],
 'Matt Moore': [2007, 75, 219],
 'Nick Mullens': [2018, 72, 210],
 'PJ Walker': [2020, 71, 215],
 'Taylor Heinicke': [2017, 73, 210],
 'Taysom Hill': [2017, 74, 221],
 'Terrelle Pryor Sr.': [2011, 76, 228],
 'Tommy DeVito': [2023, 74, 210],
 'Trevor Siemian': [2015, 75, 220],
 'Tyler Huntley': [2020, 73, 205]}

In [107]:
# Write the hand-entered season, height and weight into each listed player's row(s).
for player, (season, height, weight) in fill_dict.items():
    is_player = main_df['Name'] == player
    main_df.loc[is_player, 'Season'] = season
    main_df.loc[is_player, 'ht'] = height
    main_df.loc[is_player, 'wt'] = weight

In [108]:
# Fill every remaining NaN in a numeric column with that column's mean.
# NOTE(review): this also covers the 'Power 5_*' columns, which look like 0/1
# indicators in the preview; the appended combine-only QBs (all-NaN stats) get
# a fractional value there — confirm that is intended.
main_df = main_df.fillna(main_df.mean(numeric_only=True))

In [109]:
main_df['Season'] = main_df['Season'].astype('int')

In [110]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 38 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        325 non-null    object 
 1   Power 5_3   325 non-null    float64
 2   Power 5_4   325 non-null    float64
 3   QBR_3       325 non-null    float64
 4   QBR_4       325 non-null    float64
 5   PAA_3       325 non-null    float64
 6   PAA_4       325 non-null    float64
 7   PLAYS_3     325 non-null    float64
 8   PLAYS_4     325 non-null    float64
 9   EPA_3       325 non-null    float64
 10  EPA_4       325 non-null    float64
 11  PASS_3      325 non-null    float64
 12  PASS_4      325 non-null    float64
 13  RUN_3       325 non-null    float64
 14  RUN_4       325 non-null    float64
 15  SACK_3      325 non-null    float64
 16  SACK_4      325 non-null    float64
 17  PEN_3       325 non-null    float64
 18  PEN_4       325 non-null    float64
 19  RAW_3       325 non-null    f

In [111]:
main_df.to_csv('dataset_combine_nums.csv', index = False)