# What Makes a Legendary Running Back Season?
We define "legendary running back season" as a season where a running back scored over 20.0 PPR points per game. We will get the legendary running back seasons from the past 4 seasons and look at those running backs statisitics from the year before their legendary season to look for trends. We will be looking at the stats identified to be relevent to RB production my earlier "Stats That Matter" notebook.

In [88]:
# IMPORT PACKAGES
import pandas as pd
import sqlalchemy as sa
import pymysql
import os
from sqlalchemy import create_engine
import dotenv
from dotenv import load_dotenv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cryptography
from sklearn.linear_model import LinearRegression

In [89]:
# CONNECT TO LOCAL FANTASY SQL DATABASE
# DEFINE THE DATABASE CREDENTIALS
load_dotenv()
HOST = os.environ.get("MYSQL_HOST")
USER = os.environ.get("MYSQL_USER")
PASSWORD = os.environ.get("MYSQL_PASSWORD")
port = 3306
database = 'fantasydb'

# PYTHON FUNCTION TO CONNECT TO THE MYSQL DATABASE AND
# RETURN THE SQLACHEMY ENGINE OBJECT
def get_connection():
	return create_engine(
		url="mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(
			USER, PASSWORD, HOST, port, database
		)
	)

# CONNECT TO SQL DATABASE
try:
    # GET THE CONNECTION OBJECT (ENGINE) FOR THE DATABASE
    engine = get_connection()
    print(
        f"Connection to the {HOST} for user {USER} created successfully.")
except Exception as ex:
    print("Connection could not be made due to the following error: \n", ex)

Connection to the localhost for user root created successfully.


In [90]:
# GET DATAFRAME OF ALL PLAYERS OF *POSITION* IN *YEAR*
def retrieve_players(pos, year):
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT 
                *
            FROM 
                {pos}_stats_{year}
            '''), conn)
        conn.close()
    return df

# GET THE TOP *RANK* *POSITION* PLAYERS OF *YEAR* AND THEIR *POSITION* STATS FROM PRIOR *YEAR*
def top_x_players_prior_stats(pos, year, rank):
    valid_year = ['2019', '2020', '2021', '2022']
    valid_pos = ['qb', 'rb', 'wr', 'te']
    
    if year not in valid_year:
        return print('Not a valid year: 2019-2022')
    if pos not in valid_pos:
        return print('Not a valid position')
    
    prior_year= str(int(year) -1)
    age_change = 2023 - int(prior_year)
    year = int(year)
    
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT
                {pos}_stats_{year}.NAME as NAME1,
                {pos}_stats_{year}.RANK as RANK1,
                ppr_adp_{prior_year}.AGE,
                {pos}_stats_{prior_year}.*
            FROM 
                {pos}_stats_{prior_year}
            INNER JOIN 
                {pos}_stats_{year} 
            ON 
                {pos}_stats_{year}.NAME = {pos}_stats_{prior_year}.NAME
            INNER JOIN
                ppr_adp_{prior_year}
            ON
                {pos}_stats_{prior_year}.NAME = ppr_adp_{prior_year}.NAME
                
            WHERE
                ({pos}_stats_{year}.FPTS / {pos}_stats_{year}.G) >= 20
            ORDER BY
                {pos}_stats_{year}.RANK ASC            
            '''), conn)
        conn.close()
        df = df.drop(columns=['index', 'RANK', 'NAME'])
        df["AGE"] = df["AGE"].subtract(age_change)
        df["YEAR"]=year
                
        df.columns = ['PRIOR_' + str(col) for col in df.columns]
        df = df.rename(columns={'PRIOR_NAME1':'NAME', 'PRIOR_RANK1':'PRIOR_RANK', })
    return df

In [91]:
df1 = top_x_players_prior_stats('rb', '2019', '12')
df2 = top_x_players_prior_stats('rb', '2020', '12')
df3 = top_x_players_prior_stats('rb', '2021', '12')
df4 = top_x_players_prior_stats('rb', '2022', '12')
legendary_rbs = pd.concat([df1, df2, df3, df4], ignore_index=True)

rush_yds_per_game = round(legendary_rbs['PRIOR_RUS_YDS']/legendary_rbs['PRIOR_G'], 1)
ppr_per_game = round(legendary_rbs['PRIOR_FPTS']/legendary_rbs['PRIOR_G'], 1)
rec_yds_per_game = round(legendary_rbs['PRIOR_REC_YDS']/legendary_rbs['PRIOR_G'], 1)

legendary_rbs['PRIOR_RUS_YDS_PER_G'] = rush_yds_per_game
legendary_rbs['PRIOR_REC_YDS_PER_G'] = rec_yds_per_game
legendary_rbs['PRIOR_PPR_POINTS_PER_G'] = ppr_per_game
legendary_rbs['YEAR'] = legendary_rbs['PRIOR_YEAR']



legendary_rbs = legendary_rbs[['NAME', 'YEAR','PRIOR_RANK', 'PRIOR_AGE', 
           'PRIOR_RUS_YDS_PER_G', 'PRIOR_REC_YDS_PER_G', 'PRIOR_PPR_POINTS_PER_G',
           'PRIOR_IMP_ATT', 'PRIOR_SNAPS', 'PRIOR_BRKTKL', 
           'PRIOR_GRZ_ATT', 'PRIOR_EXPLO', ]]

desc_rb= legendary_rbs.describe()
legendary_rbs

Unnamed: 0,NAME,YEAR,PRIOR_RANK,PRIOR_AGE,PRIOR_RUS_YDS_PER_G,PRIOR_REC_YDS_PER_G,PRIOR_PPR_POINTS_PER_G,PRIOR_IMP_ATT,PRIOR_SNAPS,PRIOR_BRKTKL,PRIOR_GRZ_ATT,PRIOR_EXPLO
0,Christian McCaffrey,2019,1.0,22.0,68.6,54.2,24.1,343.0,966.0,15.0,16.0,15.0
1,Dalvin Cook,2019,6.0,22.0,55.9,27.7,13.8,182.0,491.0,10.0,2.0,8.0
2,Alvin Kamara,2020,1.0,24.0,56.9,38.1,17.8,268.0,626.0,29.0,7.0,8.0
3,Dalvin Cook,2020,2.0,23.0,81.1,37.1,20.9,313.0,604.0,20.0,21.0,12.0
4,Derrick Henry,2020,3.0,25.0,102.7,13.7,19.6,327.0,589.0,29.0,8.0,20.0
5,Christian McCaffrey,2020,54.0,23.0,86.7,62.8,29.4,429.0,1039.0,16.0,19.0,18.0
6,Jonathan Taylor,2021,1.0,21.0,77.9,19.9,16.9,272.0,511.0,15.0,15.0,15.0
7,Austin Ekeler,2021,2.0,25.0,53.0,40.3,16.5,181.0,412.0,13.0,1.0,2.0
8,Derrick Henry,2021,22.0,26.0,126.7,7.1,20.8,409.0,705.0,34.0,17.0,30.0
9,Austin Ekeler,2022,1.0,26.0,56.9,40.4,21.5,300.0,731.0,13.0,16.0,3.0


## Legendary Running Backs (2019 - 2022)
In the span of 4 years, there have been 11 legendary running back seasons. What we want to do is look at their per game statistics and look for trends that may help us identify running backs from this past season who have higher chance to enter the legendary running back tier.

In [92]:
desc_rb

Unnamed: 0,YEAR,PRIOR_RANK,PRIOR_AGE,PRIOR_RUS_YDS_PER_G,PRIOR_REC_YDS_PER_G,PRIOR_PPR_POINTS_PER_G,PRIOR_IMP_ATT,PRIOR_SNAPS,PRIOR_BRKTKL,PRIOR_GRZ_ATT,PRIOR_EXPLO
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,2020.454545,8.636364,23.818182,75.409091,35.481818,19.954545,287.636364,631.454545,18.727273,11.363636,11.909091
std,1.035725,16.249755,1.721522,23.037902,17.086826,4.211737,92.239116,225.282651,8.174239,7.311262,8.938171
min,2019.0,1.0,21.0,53.0,7.1,13.8,140.0,272.0,10.0,1.0,0.0
25%,2020.0,1.0,22.5,56.9,23.8,17.35,225.0,501.0,13.0,5.0,5.5
50%,2020.0,2.0,24.0,68.6,38.1,19.6,300.0,604.0,15.0,15.0,12.0
75%,2021.0,4.5,25.0,83.9,44.7,21.2,335.0,718.0,24.5,16.5,16.5
max,2022.0,54.0,26.0,126.7,62.8,29.4,429.0,1039.0,34.0,21.0,30.0


One thing to notice is every single running back was under the age of 27 before their breakout.
- Avg. Prior Rush Yards Per Game - 75.41
- Avg. Rec. Yards Per Game - 35.48
- Avg. Prior Implied Attempts - 287.63
- Avg. Prior Snaps - 631.45
- Avg. Prior Broken Tackles - 18.72
- Avg. Prior Green Zone Attempts -11.36
- Avg. Prior Explosive Plays - 11.91

What I want to use to identify potential breakouts is Rush/Rec Yards per game and Implied Attempts and Snaps, rush/rec per game correlate with production and implied attempts and snaps correlate with volume/opporunity.

In [93]:
# GET DATAFRAME OF RUNNING BACKS FROM *YEAR* WHO HAVE LEGENDARY POTENTIAL
# i.e. STATS BETTER THAN LEGENDARY RUNNING BACK AVERAGE STATS MINUS ONE STANDARD DEVIATION
def retrieve_breakout_rbs(year):
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT
                rb_stats_{year}.*,
                ppr_adp_{str(int(year)+1)}.ADP,
                ppr_adp_{str(int(year)+1)}.AGE
            FROM 
                rb_stats_{year}
            INNER JOIN 
                ppr_adp_{str(int(year)+1)} 
            ON 
                rb_stats_{year}.NAME = ppr_adp_{str(int(year)+1)}.NAME 
            WHERE
                ppr_adp_{str(int(year)+1)}.AGE <= 27
            AND 
                (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-desc_rb['PRIOR_RUS_YDS_PER_G'][2]}
            AND 
                (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-desc_rb['PRIOR_REC_YDS_PER_G'][2]}
            AND 
                (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-desc_rb['PRIOR_IMP_ATT'][2]}
            AND 
                (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-desc_rb['PRIOR_SNAPS'][2]}
         
            ORDER BY
                ppr_adp_{str(int(year)+1)}.ADP
            ASC
            '''), conn)
        conn.close()
        
        df = df[['NAME', 'RANK', 'ADP']]

    return df

# GET DATAFRAME COMPARING THE OVERALL RANK OF RUNNING BACKS FROM *retrieve_breakout_rbs()* 
# FROM *YEAR* AND THEIR OVERALL RANK THE FOLLOWING YEAR
def compare_breakout_rb(year):
    df1 = retrieve_breakout_rbs(year)
    df2 = retrieve_players('rb',str(int(year)+1))
    df3 = df1.merge(df2, on='NAME')
    df3 = df3[['NAME', 'RANK_x', 'ADP', 'RANK_y']].rename(columns={
        'RANK_x': year + '_RANK', 'ADP': year + '_ADP', 'RANK_y': str(int(year)+1)+'_RANK'
    })
    df3['CHANGE'] = df3[ year + '_RANK'] - df3[str(int(year)+1)+'_RANK']
    df3['IMPROVE'] = df3['CHANGE'] >= 0
    df3['TOP_5'] = df3[str(int(year)+1)+'_RANK'] <= 5
    df3 = df3[['NAME', year+'_ADP', year+'_RANK', str(int(year)+1)+'_RANK', 'CHANGE', 'IMPROVE', 'TOP_5']]
    return df3

In [94]:
# COMPARE RBS WITH LEGENDARY POTENTIAL FROM 2018 TO 2021
br_rb_2018 = compare_breakout_rb('2018')
br_rb_2019 = compare_breakout_rb('2019')
br_rb_2020 = compare_breakout_rb('2020')
br_rb_2021 = compare_breakout_rb('2021')

  (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-desc_rb['PRIOR_RUS_YDS_PER_G'][2]}
  (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-desc_rb['PRIOR_REC_YDS_PER_G'][2]}
  (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-desc_rb['PRIOR_IMP_ATT'][2]}
  (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-desc_rb['PRIOR_SNAPS'][2]}
  (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-desc_rb['PRIOR_RUS_YDS_PER_G'][2]}
  (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-desc_rb['PRIOR_REC_YDS_PER_G'][2]}
  (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-desc_rb['PRIOR_IMP_ATT'][2]}
  (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-desc_rb['PRIOR_SNAPS'][2]}
  (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-desc_rb['PRIOR_RUS_YDS_PER_G'][2]}
  (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {des

In [95]:
br_rb_2018

Unnamed: 0,NAME,2018_ADP,2018_RANK,2019_RANK,CHANGE,IMPROVE,TOP_5
0,Saquon Barkley,1.4,1.0,10.0,-9.0,False,False
1,Christian McCaffrey,3.2,2.0,1.0,1.0,True,True
2,Joe Mixon,18.4,10.0,13.0,-3.0,False,False
3,Kareem Hunt,100.8,12.0,47.0,-35.0,False,False


In [96]:
br_rb_2019

Unnamed: 0,NAME,2019_ADP,2019_RANK,2020_RANK,CHANGE,IMPROVE,TOP_5
0,Christian McCaffrey,1.2,1.0,54.0,-53.0,False,False
1,Saquon Barkley,2.5,10.0,120.0,-110.0,False,False
2,Dalvin Cook,6.2,6.0,2.0,4.0,True,True


In [97]:
br_rb_2020

Unnamed: 0,NAME,2020_ADP,2020_RANK,2021_RANK,CHANGE,IMPROVE,TOP_5
0,Dalvin Cook,2.5,2.0,16.0,-14.0,False,False
1,Jonathan Taylor,13.2,6.0,1.0,5.0,True,True
2,Clyde Edwards-Helaire,19.7,22.0,46.0,-24.0,False,False
3,David Montgomery,25.4,4.0,20.0,-16.0,False,False
4,James Robinson,27.1,7.0,24.0,-17.0,False,False
5,Kareem Hunt,59.7,10.0,49.0,-39.0,False,False


In [98]:
# Breakout Candidates for 2022 Season
br_rb_2021

Unnamed: 0,NAME,2021_ADP,2021_RANK,2022_RANK,CHANGE,IMPROVE,TOP_5
0,Jonathan Taylor,1.4,1.0,33.0,-32.0,False,False
1,Najee Harris,7.0,3.0,14.0,-11.0,False,False
2,Joe Mixon,10.0,4.0,10.0,-6.0,False,False
3,Javonte Williams,19.8,17.0,82.0,-65.0,False,False
4,David Montgomery,39.3,20.0,24.0,-4.0,False,False
5,Josh Jacobs,47.3,12.0,3.0,9.0,True,True


## Observations
Every year since 2018, from the list of potential legendary breakout running backs, at least one went on to not only improve their overall rank, but rank into the top 5 as well.

In [99]:
retrieve_breakout_rbs('2022')

  (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-desc_rb['PRIOR_RUS_YDS_PER_G'][2]}
  (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-desc_rb['PRIOR_REC_YDS_PER_G'][2]}
  (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-desc_rb['PRIOR_IMP_ATT'][2]}
  (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-desc_rb['PRIOR_SNAPS'][2]}


Unnamed: 0,NAME,RANK,ADP
0,Christian McCaffrey,2.0,2
1,Saquon Barkley,5.0,8
2,Tony Pollard,8.0,20
3,Josh Jacobs,3.0,21
4,Rhamondre Stevenson,7.0,24
5,Travis Etienne Jr.,17.0,32
6,Joe Mixon,10.0,34
