In [798]:
import pandas as pd
import numpy as np
import nfl_data_py as nfl

In [799]:
# Load the base QB dataset; a later cell reads its 'Name' column to build the merge key.
main_df = pd.read_csv('dataset_combine_nums.csv')

# Target variable

There are countless ways to measure the value of a quarterback over the course of his career. We can use the nfl_data_py play-by-play data to compute Win Probability Added (WPA) for each QB. This is the total of how much the QB improves his team's win probability with each play.

In [800]:
# Play-by-play columns of interest for the QB win-probability work.
pbp_cols = [
    'season', 'week', 'season_type',
    'home_team', 'away_team', 'posteam',
    'qb_dropback', 'wpa',
    'passer_id', 'passer', 'passer_player_name', 'name',
    'pass', 'result', 'desc',
]

In [801]:
# Load only the columns named in pbp_cols rather than every column in the file.
# The callable form of usecols skips listed columns the file does not contain
# instead of raising.
pbp_data_11 = pd.read_csv('play_by_play_to_11.csv', usecols=lambda col: col in pbp_cols)

In [802]:
# Load only the columns named in pbp_cols rather than every column in the file.
# The callable form of usecols skips listed columns the file does not contain
# instead of raising.
pbp_data_17 = pd.read_csv('play_by_play_to_17.csv', usecols=lambda col: col in pbp_cols)

In [803]:
# Load only the columns named in pbp_cols rather than every column in the file.
# The callable form of usecols skips listed columns the file does not contain
# instead of raising.
pbp_data_24 = pd.read_csv('play_by_play_to_24.csv', usecols=lambda col: col in pbp_cols)

In [804]:
# Stack the three season ranges into one frame. ignore_index gives the result
# a unique 0..n-1 index instead of repeating each file's own row labels.
pbp_data = pd.concat([pbp_data_11, pbp_data_17, pbp_data_24], ignore_index=True)

In [805]:
pbp_data['season'].unique()

array([2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], dtype=int64)

# Filtering for QB dropbacks
Here we filter the play-by-play data to include only plays in which the QB drops back, either to pass or run with the ball.

In [806]:
# Keep QB dropbacks only, then total the WPA for each passer id / name / season.
pbp_data_qb = (
    pbp_data.loc[pbp_data['qb_dropback'].eq(1)]
    .groupby(['passer_id', 'name', 'season'], as_index=False)['wpa']
    .sum()
)

In [807]:
# Rank QB seasons from highest to lowest summed WPA.
pbp_data_qb = pbp_data_qb.sort_values('wpa', ascending=False)

In [808]:
pbp_data_qb.head()

Unnamed: 0,passer_id,name,season,wpa
1733,00-0033873,P.Mahomes,2020,7.348525
1735,00-0033873,P.Mahomes,2022,7.139297
117,00-0010346,P.Manning,2006,6.454853
120,00-0010346,P.Manning,2009,6.199815
1732,00-0033873,P.Mahomes,2019,5.92229


In [809]:
pbp_data_qb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2113 entries, 1733 to 617
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   passer_id  2113 non-null   object 
 1   name       2113 non-null   object 
 2   season     2113 non-null   int64  
 3   wpa        2113 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 82.5+ KB


In [810]:
pbp_data_qb[pbp_data_qb['name'] == 'R.Griffin III']

Unnamed: 0,passer_id,name,season,wpa
1303,00-0029665,R.Griffin III,2012,2.068793
1306,00-0029665,R.Griffin III,2016,1.077459
1345,00-0029857,R.Griffin III,2019,-0.005721
1308,00-0029665,R.Griffin III,2019,-0.05259
1307,00-0029665,R.Griffin III,2018,-0.169395
1304,00-0029665,R.Griffin III,2013,-0.403004
1309,00-0029665,R.Griffin III,2020,-0.455365
1305,00-0029665,R.Griffin III,2014,-0.802814


In [811]:
# The passer id is discarded before seasons are rolled up by name.
pbp_data_qb = pbp_data_qb.drop('passer_id', axis=1)

In [812]:
# Roll the per-season rows up to one row per QB name: career WPA and the
# number of distinct seasons that appear under that name.
# NOTE(review): grouping on the abbreviated name alone pools every passer_id
# that shares it (the R.Griffin III rows displayed earlier carry two ids) —
# confirm this is acceptable for players with the same initial and surname.
qb_summary = pbp_data_qb.groupby(['name']).agg(
    total_wpa=('wpa', 'sum'),
    seasons_played=('season', 'nunique')
).reset_index()


In [813]:
# Order QBs by career WPA, best first.
qb_summary = qb_summary.sort_values('total_wpa', ascending=False)


In [814]:
# Renumber rows 0..n-1. The previous row labels are kept as an 'index' column,
# which a later cell drops after the merge with main_df.
qb_summary = qb_summary.reset_index()

In [815]:
# Average WPA per season in which the QB appears in the play-by-play data.
qb_summary['wpa_per_season'] = qb_summary['total_wpa'].div(qb_summary['seasons_played'])

In [816]:
qb_summary.head()

Unnamed: 0,index,name,total_wpa,seasons_played,wpa_per_season
0,581,T.Brady,49.298219,18,2.73879
1,167,D.Brees,44.183841,16,2.76149
2,27,A.Rodgers,41.700951,20,2.085048
3,495,P.Mahomes,39.447933,8,4.930992
4,496,P.Manning,35.182539,10,3.518254


In [817]:
# # Calculating target variable
# We'll scale the career totals and the per-season rates, then combine them into a weighted score. The per-season rates carry enough weight that QBs like Patrick Mahomes and Jayden Daniels aren't penalized for careers that are still in progress; the exact weights are set in the cell that computes QB_score.

# The name of the target variable will be 'QB_score'

In [818]:
# qb_summary.sort_values(by = 'QB_score', ascending = False)

In [819]:
qb_summary[qb_summary['name'] == 'R.Griffin III']

Unnamed: 0,index,name,total_wpa,seasons_played,wpa_per_season
52,520,R.Griffin III,1.257364,7,0.179623


In [820]:
# qb_summary = qb_summary[['name', 'QB_score']]

In [821]:
# Renames applied to play-by-play QB names before merging with the main dataset.
suffix_names = {
    'G.Minshew II': 'G.Minshew',
    'R.Griffin III': 'R.Griffin',
    'M.Penix': 'M.Penix Jr.',
    'T.Pryor': 'T.Pryor Sr.',
}

In [822]:
# Apply the suffix renames so these names line up with the merge key format.
# NOTE(review): this runs after the per-name aggregation, so if both spellings
# of a name occur in the play-by-play data they remain separate rows — verify.
qb_summary['name'] = qb_summary['name'].replace(suffix_names)

# Name matching
We have more data that we'll use to factor into our target variable. But for now we have all we need from the play-by-play data. So we'll match the names with the main dataframe.<br>

We can get around fuzzy matching by creating a new column in main_df that is formatted as the first initial, followed by a period, followed by the last name with no spaces (for example, `A.Rodgers`), which is how names are formatted in the play-by-play data used for the target variable.

In [823]:
def name_to_initial_format(name):
    """Convert a full name to the ``F.Rest`` form used as the merge key.

    The first whitespace-separated token supplies the initial; everything
    after it is kept verbatim as the remainder, so suffixes and multi-word
    surnames are preserved (``'Michael Penix Jr.'`` -> ``'M.Penix Jr.'``).

    Parameters
    ----------
    name : str
        Full player name. Leading/trailing whitespace and repeated
        whitespace between the first token and the rest are ignored.

    Returns
    -------
    str or None
        The formatted key, or ``None`` when ``name`` is not a string
        (e.g. ``NaN``/``None``) or has fewer than two tokens.
    """
    # Explicit checks replace the former bare ``except``, which also hid
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(name, str):
        return None
    parts = name.strip().split(None, 1)
    if len(parts) < 2:
        return None
    first, last = parts
    return f"{first[0]}.{last}"

# Build the abbreviated-name merge key for every row of the main dataset.
main_df['name'] = main_df['Name'].map(name_to_initial_format)

In [824]:
main_df[['Name', 'name']].head()

Unnamed: 0,Name,name
0,A.J. McCarron,A.McCarron
1,Aaron Murray,A.Murray
2,Aaron Rodgers,A.Rodgers
3,Aidan O'Connell,A.O'Connell
4,Alex Brink,A.Brink


In [825]:
qb_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           642 non-null    int64  
 1   name            642 non-null    object 
 2   total_wpa       642 non-null    float64
 3   seasons_played  642 non-null    int64  
 4   wpa_per_season  642 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 25.2+ KB


In [826]:
# Left-join the career WPA summary onto the main dataset by abbreviated name.
# validate='many_to_one' makes pandas raise MergeError if qb_summary ever
# holds the same name twice, instead of silently duplicating main_df rows.
merged_df = main_df.merge(qb_summary, on='name', how='left', validate='many_to_one')

In [827]:
qb_summary.head()

Unnamed: 0,index,name,total_wpa,seasons_played,wpa_per_season
0,581,T.Brady,49.298219,18,2.73879
1,167,D.Brees,44.183841,16,2.76149
2,27,A.Rodgers,41.700951,20,2.085048
3,495,P.Mahomes,39.447933,8,4.930992
4,496,P.Manning,35.182539,10,3.518254


In [828]:
merged_df.head()

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,forty,vertical,broad_jump,cone,shuttle,name,index,total_wpa,seasons_played,wpa_per_season
0,A.J. McCarron,1.0,1.0,84.9,82.5,47.3,43.2,387.0,404.0,49.6,...,4.94,28.0,99.0,7.18,4.34,A.McCarron,19.0,0.047844,6.0,0.007974
1,Aaron Murray,1.0,1.0,80.9,88.1,51.1,59.1,503.0,443.0,61.3,...,4.84,31.162602,111.786008,7.122902,4.343057,A.Murray,,,,
2,Aaron Rodgers,1.0,1.0,72.685937,75.1,23.609231,29.5,418.009721,422.0,37.669882,...,4.71,34.5,110.0,7.38,4.343057,A.Rodgers,27.0,41.700951,20.0,2.085048
3,Aidan O'Connell,1.0,1.0,85.7,71.4,65.7,24.1,496.0,576.0,77.3,...,4.799668,31.162602,111.786008,7.122902,4.343057,A.O'Connell,22.0,-1.468604,2.0,-0.734302
4,Alex Brink,1.0,1.0,63.7,70.5,6.5,24.0,506.0,609.0,26.9,...,4.96,28.0,112.0,7.22,4.19,A.Brink,,,,


In [829]:
# Rows of the merged frame with no WPA total, i.e. no play-by-play match.
unmatched = merged_df.loc[merged_df['total_wpa'].isna()]
unmatched

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,forty,vertical,broad_jump,cone,shuttle,name,index,total_wpa,seasons_played,wpa_per_season
1,Aaron Murray,1.000000,1.000000,80.900000,88.100000,51.100000,59.100000,503.000000,443.000000,61.300000,...,4.840000,31.162602,111.786008,7.122902,4.343057,A.Murray,,,,
4,Alex Brink,1.000000,1.000000,63.700000,70.500000,6.500000,24.000000,506.000000,609.000000,26.900000,...,4.960000,28.000000,112.000000,7.220000,4.190000,A.Brink,,,,
9,Anthony Boone,1.000000,1.000000,74.800000,61.300000,26.000000,11.400000,412.000000,584.000000,40.100000,...,5.030000,26.500000,100.000000,7.470000,4.620000,A.Boone,,,,
10,Anthony Gordon,1.000000,1.000000,75.686288,78.200000,73.068570,91.300000,776.586781,784.000000,96.058199,...,4.799668,31.162602,111.786008,7.122902,4.343057,A.Gordon,,,,
11,Anthony Morelli,1.000000,1.000000,43.000000,55.700000,-20.800000,1.300000,470.000000,482.000000,1.500000,...,5.070000,26.500000,108.000000,7.430000,4.600000,A.Morelli,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,Zac Robinson,1.000000,1.000000,79.900000,48.300000,54.900000,-11.800000,506.000000,455.000000,63.100000,...,4.680000,35.000000,110.000000,7.240000,4.400000,Z.Robinson,,,,
308,Zac Taylor,1.000000,1.000000,48.400000,63.800000,-21.300000,18.800000,568.000000,493.000000,5.200000,...,5.060000,30.500000,110.000000,7.390000,4.600000,Z.Taylor,,,,
309,Zac Thomas,0.000000,0.000000,69.000000,60.900000,55.000000,36.300000,503.000000,426.000000,67.000000,...,4.590000,30.000000,115.000000,7.060000,4.280000,Z.Thomas,,,,
311,Zach Smith,0.000000,0.000000,51.800000,50.800000,0.100000,-5.700000,554.000000,337.000000,23.100000,...,5.230000,29.500000,111.000000,7.260000,4.520000,Z.Smith,,,,


In [830]:
len(unmatched)

112

In [831]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 43 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            325 non-null    object 
 1   Power 5_3       325 non-null    float64
 2   Power 5_4       325 non-null    float64
 3   QBR_3           325 non-null    float64
 4   QBR_4           325 non-null    float64
 5   PAA_3           325 non-null    float64
 6   PAA_4           325 non-null    float64
 7   PLAYS_3         325 non-null    float64
 8   PLAYS_4         325 non-null    float64
 9   EPA_3           325 non-null    float64
 10  EPA_4           325 non-null    float64
 11  PASS_3          325 non-null    float64
 12  PASS_4          325 non-null    float64
 13  RUN_3           325 non-null    float64
 14  RUN_4           325 non-null    float64
 15  SACK_3          325 non-null    float64
 16  SACK_4          325 non-null    float64
 17  PEN_3           325 non-null    flo

In [832]:
# QBs whose merge produced no WPA total.
missing_qbs = merged_df.loc[merged_df['total_wpa'].isna()]

In [833]:
missing_qbs[['Name', 'name']]

Unnamed: 0,Name,name
1,Aaron Murray,A.Murray
4,Alex Brink,A.Brink
9,Anthony Boone,A.Boone
10,Anthony Gordon,A.Gordon
11,Anthony Morelli,A.Morelli
...,...,...
307,Zac Robinson,Z.Robinson
308,Zac Taylor,Z.Taylor
309,Zac Thomas,Z.Thomas
311,Zach Smith,Z.Smith


In [834]:
list(missing_qbs['Name'])

['Aaron Murray',
 'Alex Brink',
 'Anthony Boone',
 'Anthony Gordon',
 'Anthony Morelli',
 'Austin Allen',
 'Austin Reed',
 'Bernard Morris',
 'Blake Sims',
 'Brad Kaaya',
 'Brady White',
 'Brandon Bridge',
 'Brandon Doughty',
 'Brian Lewerke',
 'Bryan Randall',
 'Bryn Renner',
 'Carson Strong',
 'Chandler Harnish',
 'Chase Litton',
 'Chris Leak',
 'Christian Hackenberg',
 'Clayton Thorson',
 'Cody Fajardo',
 'Cole McDonald',
 'Collin Klein',
 'Connor Halliday',
 'Cullen Harper',
 "D'Eriq King",
 'D.J. Shockley',
 'Dan LeFevour',
 'Danny Etling',
 'Darrell Hackney',
 'Daryll Clark',
 'David Greene',
 'Devin Leary',
 'Drew Olson',
 'Drew Willy',
 'Dustin Crum',
 'Erik Ainge',
 'Garrett Grayson',
 'Gino Guidugli',
 'Graham Mertz',
 'Hunter Cantwell',
 'Isaiah Stanback',
 'J.J. McCarthy',
 'J.T. Barrett',
 'Jack Coan',
 'Jacory Harris',
 'James Pinkney',
 'James Vandenberg',
 'Jamie Newman',
 'Jeff Van Camp',
 'Jerod Evans',
 'Jevan Snead',
 'Joel Stave',
 'John David Booty',
 'John Parker

In [835]:
# Remove the merge key and the leftover row-label column brought in from qb_summary.
merged_df = merged_df.drop(['name', 'index'], axis=1)

In [836]:
# Full names of the QBs without a play-by-play match, as a plain list.
missing_qb_names = missing_qbs['Name'].tolist()

In [837]:
merged_df.head()

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,ht,wt,forty,vertical,broad_jump,cone,shuttle,total_wpa,seasons_played,wpa_per_season
0,A.J. McCarron,1.0,1.0,84.9,82.5,47.3,43.2,387.0,404.0,49.6,...,75.0,220.0,4.94,28.0,99.0,7.18,4.34,0.047844,6.0,0.007974
1,Aaron Murray,1.0,1.0,80.9,88.1,51.1,59.1,503.0,443.0,61.3,...,72.0,207.0,4.84,31.162602,111.786008,7.122902,4.343057,,,
2,Aaron Rodgers,1.0,1.0,72.685937,75.1,23.609231,29.5,418.009721,422.0,37.669882,...,74.0,223.0,4.71,34.5,110.0,7.38,4.343057,41.700951,20.0,2.085048
3,Aidan O'Connell,1.0,1.0,85.7,71.4,65.7,24.1,496.0,576.0,77.3,...,75.0,213.0,4.799668,31.162602,111.786008,7.122902,4.343057,-1.468604,2.0,-0.734302
4,Alex Brink,1.0,1.0,63.7,70.5,6.5,24.0,506.0,609.0,26.9,...,74.0,211.0,4.96,28.0,112.0,7.22,4.19,,,


In [838]:
merged_df[merged_df['Name'] == 'Carson Wentz']

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,ht,wt,forty,vertical,broad_jump,cone,shuttle,total_wpa,seasons_played,wpa_per_season
324,Carson Wentz,0.750799,0.747604,67.72641,69.901917,31.627629,37.591054,500.484545,514.175719,46.220876,...,77.0,237.0,4.77,30.5,118.0,6.86,4.15,0.400901,9.0,0.044545


# More factors for the target variable
We'll bring in a dataset that includes each QB's touchdown passes per season and the number of all-pro seasons in their career.

In [839]:
# Load per-QB career data; the cells below use its pfr_player_name, pass_tds,
# tds_per_season, allpro and career_length columns.
td_df = pd.read_csv('qb_career.csv')

In [840]:
td_df.head()

Unnamed: 0,season,to,round,pick,team,pfr_player_name,pass_tds,allpro,seasons_started,hof,career_length,tds_per_season
0,2017,2024.0,1,10,KAN,Patrick Mahomes,245.0,2,7,False,8.0,30.625
1,2024,2024.0,1,12,DEN,Bo Nix,29.0,0,1,False,1.0,29.0
2,2020,2024.0,1,1,CIN,Joe Burrow,140.0,0,5,False,5.0,28.0
3,2018,2024.0,1,7,BUF,Josh Allen,195.0,0,7,False,7.0,27.857143
4,2020,2024.0,1,6,LAC,Justin Herbert,137.0,0,5,False,5.0,27.4


In [841]:
# All-pro selections per season of career length; rows whose career_length
# is exactly 0 are set to 0 instead of the division result.
td_df['allpro_per_season'] = (
    td_df['allpro']
    .div(td_df['career_length'])
    .mask(td_df['career_length'].eq(0), 0)
)

In [842]:
# Keep the player name plus the four career metrics used for the target.
td_df = td_df.loc[:, ['pfr_player_name', 'pass_tds', 'tds_per_season', 'allpro', 'allpro_per_season']]

In [843]:
# Use the same 'Name' column label as merged_df so the frames can be joined on it.
td_df = td_df.rename({'pfr_player_name': 'Name'}, axis='columns')

In [844]:
merged_df['Name'].nunique()

325

In [845]:
td_df['Name'].nunique()

236

In [846]:
# A handful of td_df spellings that need to be brought in line before merging
td_name_change = {
    'EJ Manuel': 'E.J. Manuel',
    'Gardner Minshew II': 'Gardner Minshew',
    'Michael Penix': 'Michael Penix Jr.',
    'Robert Griffin III': 'Robert Griffin',
}

In [847]:
# Swap in the corrected spellings; names not in the mapping are left as they are.
td_df = td_df.assign(Name=td_df['Name'].replace(td_name_change))

In [848]:
td_df.head()

Unnamed: 0,Name,pass_tds,tds_per_season,allpro,allpro_per_season
0,Patrick Mahomes,245.0,30.625,2,0.25
1,Bo Nix,29.0,29.0,0,0.0
2,Joe Burrow,140.0,28.0,0,0.0
3,Josh Allen,195.0,27.857143,0,0.0
4,Justin Herbert,137.0,27.4,0,0.0


There appear to be 19 players in our main dataset who aren't in the TD dataset we just brought in because they weren't drafted. We might need to fill in the data by hand from Pro Football Reference since the Python package doesn't have data before 2018.

In [849]:
# Column layout for the hand-entered rows; matches td_df's columns.
td_append_cols = [
    'Name',
    'pass_tds',
    'tds_per_season',
    'allpro',
    'allpro_per_season',
]

In [850]:
# Hand-entered rows: [name, career passing TDs, TDs per season, allpro, allpro_per_season].
# Both all-pro fields are 0 for every player listed here.
td_append_data = [
    [player, career_tds, tds_rate, 0, 0]
    for player, career_tds, tds_rate in [
        ('Brett Rypien', 4, 1),
        ('Brian Hoyer', 53, 3.533),
        ('Case Keenum', 79, 7.1818),
        ('Chase Daniel', 9, 0.6923),
        ('Cooper Rush', 20, 2.857),
        ('David Blough', 6, 1.5),
        ('Scott Tolzien', 2, 0.5),
        ('Taylor Heinicke', 39, 5.571),
        ('Jake Browning', 12, 6),
        ('Kellen Moore', 4, 4),
        ('Kyle Allen', 26, 3.714),
        ('Luke McCown', 9, 0.9),
        ('Matt Moore', 49, 4.455),
        ('Nick Mullens', 34, 4.857),
        ('Taysom Hill', 11, 1.375),
        ('Tommy DeVito', 8, 4),
        ('Tyler Bray', 0, 0),
        ('Tyler Huntley', 11, 2.2),
        ('PJ Walker', 6, 1.5),
        ('Marcus Vick', 0, 0),
        ('Terrelle Pryor Sr.', 9, 0.77),
    ]
]

In [851]:
# Turn the hand-entered rows into a frame with the same column layout as td_df.
td_append_df = pd.DataFrame(data=td_append_data, columns=td_append_cols)

In [852]:
# Append the hand-entered rows. ignore_index renumbers the result so the
# appended rows do not reuse row labels that td_df already has.
td_df = pd.concat([td_df, td_append_df], ignore_index=True)

In [853]:
# Distinct player names on each side of the upcoming merge.
merged_names = set(merged_df['Name'])
td_names = set(td_df['Name'])

In [854]:
# Names present on one side only, in each direction.
merged_not_in_td = merged_names - td_names
td_not_in_merged = td_names - merged_names

In [855]:
len(merged_not_in_td), len(td_not_in_merged)

(96, 28)

In [856]:
merged_not_in_td

{'Anthony Boone',
 'Anthony Gordon',
 'Anthony Morelli',
 'Austin Allen',
 'Austin Davis',
 'Austin Reed',
 'Bernard Morris',
 'Blake Sims',
 'Brady White',
 'Brandon Bridge',
 'Brett Basanez',
 'Brian Lewerke',
 'Brock Berlin',
 'Bryan Randall',
 'Bryn Renner',
 'Carson Strong',
 'Chase Litton',
 'Chris Leak',
 'Cody Fajardo',
 'Collin Klein',
 'Connor Halliday',
 'Connor Shaw',
 'Cullen Harper',
 "D'Eriq King",
 'Darrell Hackney',
 'Darron Thomas',
 'Daryll Clark',
 'Drew Olson',
 'Drew Willy',
 'Dustin Crum',
 'Feleipe Franks',
 'Gino Guidugli',
 'Graham Harrell',
 'Graham Mertz',
 'Hunter Cantwell',
 'J.T. Barrett',
 'Jack Coan',
 'Jacory Harris',
 'James Pinkney',
 'James Vandenberg',
 'Jamie Newman',
 'Jarrett Brown',
 'Jason White',
 'Jeff Van Camp',
 'Jerod Evans',
 'Jerrod Johnson',
 'Jevan Snead',
 'Joel Stave',
 'John Parker Wilson',
 'John Stocco',
 'John Wolford',
 'Jordan Jefferson',
 'Jordan Lynch',
 "Jordan Ta'amu",
 'Josh Swogger',
 'Kaleb Eleby',
 'Kedon Slovis',
 'Ke

In [857]:
td_not_in_merged

{'Adrian McPherson',
 'Alex McGough',
 'Andre Woodson',
 'B.J. Coleman',
 'B.J. Daniels',
 'Ben DiNucci',
 'Brad Smith',
 'Brad Sorensen',
 'Chad Kelly',
 'Chris Oladokun',
 'Garrett Gilbert',
 'Ingle Martin',
 'Jake Rudock',
 'James Kilian',
 'Jeff Rowe',
 'Jonathan Crompton',
 'Keith Null',
 'Kellen Mond',
 'Kyle Lauletta',
 'Matt Cassel',
 'Mike Teel',
 'Nathan Enderle',
 'Rusty Smith',
 'Sean Clifford',
 'Stefan Lefors',
 'Tom Brandstater',
 'Tommy Stevens',
 'Tyler Thigpen'}

In [858]:
# Left-join the career TD / all-pro metrics onto the merged dataset by full name.
# validate='many_to_one' makes pandas raise MergeError if td_df holds the same
# Name more than once, instead of silently duplicating rows of merged_df.
qb_career_df = merged_df.merge(td_df, on='Name', how='left', validate='many_to_one')

In [859]:
qb_career_df.head()

Unnamed: 0,Name,Power 5_3,Power 5_4,QBR_3,QBR_4,PAA_3,PAA_4,PLAYS_3,PLAYS_4,EPA_3,...,broad_jump,cone,shuttle,total_wpa,seasons_played,wpa_per_season,pass_tds,tds_per_season,allpro,allpro_per_season
0,A.J. McCarron,1.0,1.0,84.9,82.5,47.3,43.2,387.0,404.0,49.6,...,99.0,7.18,4.34,0.047844,6.0,0.007974,6.0,0.6,0.0,0.0
1,Aaron Murray,1.0,1.0,80.9,88.1,51.1,59.1,503.0,443.0,61.3,...,111.786008,7.122902,4.343057,,,,0.0,0.0,0.0,0.0
2,Aaron Rodgers,1.0,1.0,72.685937,75.1,23.609231,29.5,418.009721,422.0,37.669882,...,110.0,7.38,4.343057,41.700951,20.0,2.085048,503.0,25.15,4.0,0.2
3,Aidan O'Connell,1.0,1.0,85.7,71.4,65.7,24.1,496.0,576.0,77.3,...,111.786008,7.122902,4.343057,-1.468604,2.0,-0.734302,20.0,10.0,0.0,0.0
4,Alex Brink,1.0,1.0,63.7,70.5,6.5,24.0,506.0,609.0,26.9,...,112.0,7.22,4.19,,,,0.0,0.0,0.0,0.0


In [860]:
qb_career_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               325 non-null    object 
 1   Power 5_3          325 non-null    float64
 2   Power 5_4          325 non-null    float64
 3   QBR_3              325 non-null    float64
 4   QBR_4              325 non-null    float64
 5   PAA_3              325 non-null    float64
 6   PAA_4              325 non-null    float64
 7   PLAYS_3            325 non-null    float64
 8   PLAYS_4            325 non-null    float64
 9   EPA_3              325 non-null    float64
 10  EPA_4              325 non-null    float64
 11  PASS_3             325 non-null    float64
 12  PASS_4             325 non-null    float64
 13  RUN_3              325 non-null    float64
 14  RUN_4              325 non-null    float64
 15  SACK_3             325 non-null    float64
 16  SACK_4             325 non

In [861]:
# Fill every remaining NaN in a numeric column with that column's minimum.
# For QBs with no play-by-play match this assigns the lowest observed
# total_wpa / wpa_per_season in the data rather than zero.
qb_career_df = qb_career_df.fillna(qb_career_df.min(numeric_only=True))


In [862]:
qb_career_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               325 non-null    object 
 1   Power 5_3          325 non-null    float64
 2   Power 5_4          325 non-null    float64
 3   QBR_3              325 non-null    float64
 4   QBR_4              325 non-null    float64
 5   PAA_3              325 non-null    float64
 6   PAA_4              325 non-null    float64
 7   PLAYS_3            325 non-null    float64
 8   PLAYS_4            325 non-null    float64
 9   EPA_3              325 non-null    float64
 10  EPA_4              325 non-null    float64
 11  PASS_3             325 non-null    float64
 12  PASS_4             325 non-null    float64
 13  RUN_3              325 non-null    float64
 14  RUN_4              325 non-null    float64
 15  SACK_3             325 non-null    float64
 16  SACK_4             325 non

In [863]:
# Weight of each career metric in the composite target. Keeping the weights
# next to the column names means the scaling loop and the score formula
# cannot drift apart.
qb_score_weights = {
    'total_wpa': 0.1,
    'wpa_per_season': 0.3,
    'pass_tds': 0.2,
    'tds_per_season': 0.1,
    'allpro': 0.2,
    'allpro_per_season': 0.1,
}
assert abs(sum(qb_score_weights.values()) - 1.0) < 1e-9, "QB_score weights must sum to 1"

# Columns to scale
scale_cols = list(qb_score_weights)

# Standardize each metric to a z-score (subtract the mean, divide by the std)
for col in scale_cols:
    scaled_col = f'{col}_scaled'
    qb_career_df[scaled_col] = (
        qb_career_df[col] - qb_career_df[col].mean()
    ) / qb_career_df[col].std()

# Weighted QB score: weighted sum of the scaled metrics, added in the order
# the weights are listed above
qb_career_df['QB_score'] = sum(
    weight * qb_career_df[f'{col}_scaled']
    for col, weight in qb_score_weights.items()
)

In [864]:
# Rank QBs from highest to lowest composite score and renumber the rows from 0.
qb_career_df = qb_career_df.sort_values('QB_score', ascending=False, ignore_index=True)

In [865]:
# Label rows with their 1-based rank. Assigning a fresh RangeIndex (rather than
# adding 1 to whatever the index currently is) keeps this cell idempotent, so
# re-running it does not shift the ranks again.
qb_career_df.index = pd.RangeIndex(1, len(qb_career_df) + 1)

In [866]:
qb_career_df[['Name', 'QB_score']].iloc[30:60, :]

Unnamed: 0,Name,QB_score
31,Alex Smith,0.745698
32,Bo Nix,0.739506
33,Carson Wentz,0.724254
34,Jordan Love,0.611625
35,Colin Kaepernick,0.600857
36,Geno Smith,0.591245
37,Marcus Mariota,0.527205
38,Jake Browning,0.525091
39,Gardner Minshew,0.511692
40,Trevor Lawrence,0.508303


In [867]:
qb_career_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 1 to 325
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      325 non-null    object 
 1   Power 5_3                 325 non-null    float64
 2   Power 5_4                 325 non-null    float64
 3   QBR_3                     325 non-null    float64
 4   QBR_4                     325 non-null    float64
 5   PAA_3                     325 non-null    float64
 6   PAA_4                     325 non-null    float64
 7   PLAYS_3                   325 non-null    float64
 8   PLAYS_4                   325 non-null    float64
 9   EPA_3                     325 non-null    float64
 10  EPA_4                     325 non-null    float64
 11  PASS_3                    325 non-null    float64
 12  PASS_4                    325 non-null    float64
 13  RUN_3                     325 non-null    float64
 14  RUN_4     

In [868]:
# Raw and scaled target-building columns to remove before the export.
cols_to_drop = [
    'total_wpa',
    'seasons_played',
    'wpa_per_season',
    'pass_tds',
    'tds_per_season',
    'allpro',
    'allpro_per_season',
    'total_wpa_scaled',
    'wpa_per_season_scaled',
    'pass_tds_scaled',
    'tds_per_season_scaled',
    'allpro_scaled',
    'allpro_per_season_scaled',
]

In [869]:
# Strip the intermediate target columns listed in cols_to_drop.
qb_career_df = qb_career_df.drop(cols_to_drop, axis=1)

In [870]:
# Write the modelling dataset to disk; index=False leaves the row labels out of the file.
qb_career_df.to_csv('new_dataset_for_model.csv', index = False)