# What Makes a Legendary Running Back Season?
We define "legendary running back season" as a season where a running back scored over 20.0 PPR points per game. We will get the legendary running back seasons from the past 4 seasons and look at those running backs statisitics from the year before their legendary season to look for trends. We will be looking at the stats identified to be relevent to RB production my earlier "Stats That Matter" notebook.

In [16]:
# IMPORT PACKAGES
import pandas as pd
import sqlalchemy as sa
import pymysql
import os
from sqlalchemy import create_engine
import dotenv
from dotenv import load_dotenv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cryptography
from sklearn.linear_model import LinearRegression

In [17]:
# CONNECT TO LOCAL FANTASY SQL DATABASE
# DEFINE THE DATABASE CREDENTIALS
load_dotenv()
HOST = os.environ.get("MYSQL_HOST")
USER = os.environ.get("MYSQL_USER")
PASSWORD = os.environ.get("MYSQL_PASSWORD")
port = 3306
database = 'fantasydb'

# PYTHON FUNCTION TO CONNECT TO THE MYSQL DATABASE AND
# RETURN THE SQLACHEMY ENGINE OBJECT
def get_connection():
	return create_engine(
		url="mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(
			USER, PASSWORD, HOST, port, database
		)
	)

# CONNECT TO SQL DATABASE
try:
    # GET THE CONNECTION OBJECT (ENGINE) FOR THE DATABASE
    engine = get_connection()
    print(
        f"Connection to the {HOST} for user {USER} created successfully.")
except Exception as ex:
    print("Connection could not be made due to the following error: \n", ex)

Connection to the localhost for user root created successfully.


In [18]:
# GET DATAFRAME OF ALL PLAYERS OF *POSITION* IN *YEAR*
def retrieve_players(pos, year):
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT 
                *
            FROM 
                {pos}_stats_{year}
            '''), conn)
        conn.close()
    return df

# GET THE TOP *RANK* *POSITION* PLAYERS OF *YEAR* AND THEIR *POSITION* STATS FROM PRIOR *YEAR*
def top_x_players_prior_stats(pos, year, rank):
    valid_year = ['2019', '2020', '2021', '2022', '2023']
    valid_pos = ['qb', 'rb', 'wr', 'te']
    
    if year not in valid_year:
        return print('Not a valid year: 2019-2023')
    if pos not in valid_pos:
        return print('Not a valid position')
    
    prior_year= str(int(year) -1)
    age_change = 2023 - int(prior_year)
    year = int(year)
    
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT
                {pos}_stats_{year}.NAME as NAME1,
                {pos}_stats_{year}.RANK as RANK1,
                ppr_adp_{prior_year}.AGE,
                {pos}_stats_{prior_year}.*
            FROM 
                {pos}_stats_{prior_year}
            INNER JOIN 
                {pos}_stats_{year} 
            ON 
                {pos}_stats_{year}.NAME = {pos}_stats_{prior_year}.NAME
            INNER JOIN
                ppr_adp_{prior_year}
            ON
                {pos}_stats_{prior_year}.NAME = ppr_adp_{prior_year}.NAME
                
            WHERE
                ({pos}_stats_{year}.FPTS / {pos}_stats_{year}.G) >= 22.5
            ORDER BY
                {pos}_stats_{year}.RANK ASC            
            '''), conn)
        conn.close()
        df = df.drop(columns=['index', 'RANK', 'NAME'])
        df["AGE"] = df["AGE"].subtract(age_change)
        df["YEAR"]=year
                
        df.columns = ['PRIOR_' + str(col) for col in df.columns]
        df = df.rename(columns={'PRIOR_NAME1':'NAME', 'PRIOR_RANK1':'PRIOR_RANK', })
    return df

In [19]:
df1 = top_x_players_prior_stats('rb', '2019', '12')
df2 = top_x_players_prior_stats('rb', '2020', '12')
df3 = top_x_players_prior_stats('rb', '2021', '12')
df4 = top_x_players_prior_stats('rb', '2022', '12')
df5 = top_x_players_prior_stats('rb', '2023', '12')

legendary_rbs = pd.concat([df1, df2, df3, df4,df5], ignore_index=True)

rush_yds_per_game = round(legendary_rbs['PRIOR_RUS_YDS']/legendary_rbs['PRIOR_G'], 1)
ppr_per_game = round(legendary_rbs['PRIOR_FPTS']/legendary_rbs['PRIOR_G'], 1)
rec_yds_per_game = round(legendary_rbs['PRIOR_REC_YDS']/legendary_rbs['PRIOR_G'], 1)

legendary_rbs['PRIOR_RUS_YDS_PER_G'] = rush_yds_per_game
legendary_rbs['PRIOR_REC_YDS_PER_G'] = rec_yds_per_game
legendary_rbs['PRIOR_PPR_POINTS_PER_G'] = ppr_per_game
legendary_rbs['YEAR'] = legendary_rbs['PRIOR_YEAR']



legendary_rbs = legendary_rbs[['NAME', 'YEAR','PRIOR_RANK', 'PRIOR_AGE', 
           'PRIOR_RUS_YDS_PER_G', 'PRIOR_REC_YDS_PER_G', 'PRIOR_PPR_POINTS_PER_G',
           'PRIOR_IMP_ATT', 'PRIOR_SNAPS', 'PRIOR_BRKTKL', 
           'PRIOR_GRZ_ATT', 'PRIOR_EXPLO', ]]

desc_rb= legendary_rbs.describe()
legendary_rbs

  legendary_rbs = pd.concat([df1, df2, df3, df4,df5], ignore_index=True)


Unnamed: 0,NAME,YEAR,PRIOR_RANK,PRIOR_AGE,PRIOR_RUS_YDS_PER_G,PRIOR_REC_YDS_PER_G,PRIOR_PPR_POINTS_PER_G,PRIOR_IMP_ATT,PRIOR_SNAPS,PRIOR_BRKTKL,PRIOR_GRZ_ATT,PRIOR_EXPLO
0,Christian McCaffrey,2019,1.0,22.0,68.6,54.2,24.1,343.0,966.0,15.0,16.0,15.0
1,Alvin Kamara,2020,1.0,24.0,56.9,38.1,17.8,268.0,626.0,29.0,7.0,8.0
2,Dalvin Cook,2020,2.0,23.0,81.1,37.1,20.9,313.0,604.0,20.0,21.0,12.0
3,Christian McCaffrey,2020,54.0,23.0,86.7,62.8,29.4,429.0,1039.0,16.0,19.0,18.0
4,Derrick Henry,2021,22.0,26.0,126.7,7.1,20.8,409.0,705.0,34.0,17.0,30.0
5,Christian McCaffrey,2023,1.0,26.0,67.0,43.6,21.0,352.0,777.0,10.0,11.0,14.0


## Legendary Running Backs (2019 - 2023)
In the span of 4 years, there have been 13 legendary running back seasons. What we want to do is look at their per game statistics and look for trends that may help us identify running backs from this past season who have higher chance to enter the legendary running back tier.

In [20]:
desc_rb

Unnamed: 0,YEAR,PRIOR_RANK,PRIOR_AGE,PRIOR_RUS_YDS_PER_G,PRIOR_REC_YDS_PER_G,PRIOR_PPR_POINTS_PER_G,PRIOR_IMP_ATT,PRIOR_SNAPS,PRIOR_BRKTKL,PRIOR_GRZ_ATT,PRIOR_EXPLO
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,2020.5,13.5,24.0,81.166667,40.483333,22.333333,352.333333,786.166667,20.666667,15.166667,16.166667
std,1.378405,21.510463,1.67332,24.703819,19.117889,3.99483,59.724925,179.874864,9.114092,5.231316,7.547627
min,2019.0,1.0,22.0,56.9,7.1,17.8,268.0,604.0,10.0,7.0,8.0
25%,2020.0,1.0,23.0,67.4,37.35,20.825,320.5,645.75,15.25,12.25,12.5
50%,2020.0,1.5,23.5,74.85,40.85,20.95,347.5,741.0,18.0,16.5,14.5
75%,2020.75,17.0,25.5,85.3,51.55,23.325,394.75,918.75,26.75,18.5,17.25
max,2023.0,54.0,26.0,126.7,62.8,29.4,429.0,1039.0,34.0,21.0,30.0


One thing to notice is every single running back was under the age of 27 before their breakout.
- Avg. Prior Rush Yards Per Game - 70.3
- Avg. Rec. Yards Per Game - 34.11
- Avg. Prior Implied Attempts - 274.07
- Avg. Prior Snaps - 604.92
- Avg. Prior Broken Tackles - 16.62
- Avg. Prior Green Zone Attempts -10.46
- Avg. Prior Explosive Plays - 11.15

What I want to use to identify potential breakouts is Rush/Rec Yards per game and Implied Attempts and Snaps, rush/rec per game correlate with production and implied attempts and snaps correlate with volume/opporunity.

In [21]:
# GET DATAFRAME OF RUNNING BACKS FROM *YEAR* WHO HAVE LEGENDARY POTENTIAL
# i.e. STATS BETTER THAN LEGENDARY RUNNING BACK AVERAGE STATS MINUS ONE STANDARD DEVIATION
def retrieve_breakout_rbs(year):
    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(
            f'''
            SELECT
                rb_stats_{year}.*,
                ppr_adp_{str(int(year)+1)}.ADP,
                ppr_adp_{str(int(year)+1)}.AGE
            FROM 
                rb_stats_{year}
            INNER JOIN 
                ppr_adp_{str(int(year)+1)} 
            ON 
                rb_stats_{year}.NAME = ppr_adp_{str(int(year)+1)}.NAME 
            WHERE
                ppr_adp_{str(int(year)+1)}.AGE <= 26
            AND 
                (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-(desc_rb['PRIOR_RUS_YDS_PER_G'][2])}
            AND 
                (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-(desc_rb['PRIOR_REC_YDS_PER_G'][2])}
            AND 
                (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-(desc_rb['PRIOR_IMP_ATT'][2])}
            AND 
                (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-(desc_rb['PRIOR_SNAPS'][2])}
         
            ORDER BY
                rb_stats_{year}.RANK
            ASC
            '''), conn)
        conn.close()
        
        df = df[['NAME', 'RANK', 'ADP']]

    return df

# GET DATAFRAME COMPARING THE OVERALL RANK OF RUNNING BACKS FROM *retrieve_breakout_rbs()* 
# FROM *YEAR* AND THEIR OVERALL RANK THE FOLLOWING YEAR
def compare_breakout_rb(year):
    df1 = retrieve_breakout_rbs(year)
    df2 = retrieve_players('rb',str(int(year)+1))
    df3 = df1.merge(df2, on='NAME')
    df3 = df3[['NAME', 'RANK_x', 'ADP', 'RANK_y']].rename(columns={
        'RANK_x': year + '_RANK', 'ADP': year + '_ADP', 'RANK_y': str(int(year)+1)+'_RANK'
    })
    df3['CHANGE'] = df3[ year + '_RANK'] - df3[str(int(year)+1)+'_RANK']
    df3['IMPROVE'] = df3['CHANGE'] >= 0
    df3['TOP_5'] = df3[str(int(year)+1)+'_RANK'] <= 5
    df3 = df3[['NAME', year+'_ADP', year+'_RANK', str(int(year)+1)+'_RANK', 'CHANGE', 'IMPROVE', 'TOP_5']]
    return df3

In [22]:
retrieve_breakout_rbs('2023')

  (rb_stats_{year}.RUS_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_RUS_YDS_PER_G'][1]-(desc_rb['PRIOR_RUS_YDS_PER_G'][2])}
  (rb_stats_{year}.REC_YDS / rb_stats_{year}.G)  > {desc_rb['PRIOR_REC_YDS_PER_G'][1]-(desc_rb['PRIOR_REC_YDS_PER_G'][2])}
  (rb_stats_{year}.IMP_ATT) > {desc_rb['PRIOR_IMP_ATT'][1]-(desc_rb['PRIOR_IMP_ATT'][2])}
  (rb_stats_{year}.SNAPS) > {desc_rb['PRIOR_SNAPS'][1]-(desc_rb['PRIOR_SNAPS'][2])}


Unnamed: 0,NAME,RANK,ADP
0,Breece Hall,2.0,31
1,Travis Etienne Jr.,3.0,32
2,Rachaad White,4.0,62
3,Bijan Robinson,9.0,9
