### Import libraries and create DataFrame from csv 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Function to read, preprocess, and aggregate full season data
def read_data():
    # Read the full season data
    data = pd.read_excel("RB_Data_2024.xlsx")

    # Preprocess full season data, drop irrelevant columns and convert data types
    data.drop(['FL', 'ROST'], axis=1, inplace=True)
    convert = data.select_dtypes('object').columns.difference(['Player'])
    data[convert] = data[convert].apply(lambda x: pd.to_numeric(x.str.replace(',', ''), errors='coerce')).fillna(0)
    data['Rank'] = data['Rank'].astype('Int64')

    return data

# Load the full season data
df = read_data()

# Recalculate 'Rank' based on 'FPTS' in descending order
df['Rank'] = df['FPTS'].rank(ascending=False, method='min').astype('int')

# Sort the DataFrame by 'Rank'
df = df.sort_values(by='Rank')
df.head(20)

Unnamed: 0,Rank,Player,ATT,YDS,Y/A,LG,20+,TD,REC,TGT,YDS.1,Y/R,TD.1,G,FPTS,FPTS/G
0,1,Alvin Kamara (NO),80,362,4.5,17,0,5,17,20,174,10.2,1,4,106.6,26.7
1,2,Saquon Barkley (PHI),73,435,6.0,65,4,4,12,15,85,7.1,1,4,98.0,24.5
2,3,Derrick Henry (BAL),80,480,6.0,87,6,5,5,7,45,9.0,1,4,93.5,23.4
3,4,Kyren Williams (LAR),73,258,3.5,14,0,5,13,14,68,5.2,1,4,81.6,20.4
4,5,Jordan Mason (SF),91,447,4.9,25,4,3,6,7,57,9.5,0,4,74.4,18.6
5,6,Jonathan Taylor (IND),72,349,4.8,29,4,4,6,10,77,12.8,0,4,72.6,18.2
6,7,Aaron Jones (MIN),64,321,5.0,39,1,1,16,19,143,8.9,1,4,72.4,18.1
7,8,Jahmyr Gibbs (DET),54,285,5.3,24,3,3,11,13,76,6.9,1,4,71.1,17.8
8,9,David Montgomery (DET),63,271,4.3,21,1,4,9,9,94,10.4,0,4,69.5,17.4
9,10,De'Von Achane (MIA),53,165,3.1,17,0,1,20,22,187,9.4,1,4,67.2,16.8


### Convert relevant stats to a per-game basis and create the final stats for analysis

In [2]:
# Define columns for per-game calculation
per_game = list(df.columns[2:])

# Define columns to exclude from the per-game calculation
exclude_per_game = ['Y/A', 'LG', 'Y/R', 'G', 'FPTS', 'FPTS/G']

# Create columns and convert stats to a per-game basis for the defined columns, not those in the exclude list
for col in per_game:
    if col not in exclude_per_game:
        df[col + '/game'] = (df[col] / df['G']).round(1)

# Define final columns to be used for analysis, combining excluded and new per-game columns
final_columns = exclude_per_game + [col + '/game' for col in per_game if col not in exclude_per_game]

# Display final columns with 'Rank' and 'Player'
df[['Rank', 'Player'] + final_columns].head(10)

Unnamed: 0,Rank,Player,Y/A,LG,Y/R,G,FPTS,FPTS/G,ATT/game,YDS/game,20+/game,TD/game,REC/game,TGT/game,YDS.1/game,TD.1/game
0,1,Alvin Kamara (NO),4.5,17,10.2,4,106.6,26.7,20.0,90.5,0.0,1.2,4.2,5.0,43.5,0.2
1,2,Saquon Barkley (PHI),6.0,65,7.1,4,98.0,24.5,18.2,108.8,1.0,1.0,3.0,3.8,21.2,0.2
2,3,Derrick Henry (BAL),6.0,87,9.0,4,93.5,23.4,20.0,120.0,1.5,1.2,1.2,1.8,11.2,0.2
3,4,Kyren Williams (LAR),3.5,14,5.2,4,81.6,20.4,18.2,64.5,0.0,1.2,3.2,3.5,17.0,0.2
4,5,Jordan Mason (SF),4.9,25,9.5,4,74.4,18.6,22.8,111.8,1.0,0.8,1.5,1.8,14.2,0.0
5,6,Jonathan Taylor (IND),4.8,29,12.8,4,72.6,18.2,18.0,87.2,1.0,1.0,1.5,2.5,19.2,0.0
6,7,Aaron Jones (MIN),5.0,39,8.9,4,72.4,18.1,16.0,80.2,0.2,0.2,4.0,4.8,35.8,0.2
7,8,Jahmyr Gibbs (DET),5.3,24,6.9,4,71.1,17.8,13.5,71.2,0.8,0.8,2.8,3.2,19.0,0.2
8,9,David Montgomery (DET),4.3,21,10.4,4,69.5,17.4,15.8,67.8,0.2,1.0,2.2,2.2,23.5,0.0
9,10,De'Von Achane (MIA),3.1,17,9.4,4,67.2,16.8,13.2,41.2,0.0,0.2,5.0,5.5,46.8,0.2


### Calculate the correlations for the final stats across different conditions

In [3]:
# Define columns and columns to exclude for correlation calculation, excluding FPTS and FPTS/G
exclude_corr = ['FPTS/G', 'FPTS', 'G']
corr_columns = [col for col in final_columns if col not in exclude_corr]

# Define a function to calculate correlations
def compute_correlations(dataframe, corr_columns):
    return dataframe[corr_columns].corrwith(dataframe['FPTS/G'])

# Compute correlations for various conditions
corr_all = compute_correlations(df, corr_columns)
corr_nonzero = compute_correlations(df[df['FPTS/G'] > 0], corr_columns)
corr_top50 = compute_correlations(df[df['Rank'] <= 50], corr_columns)
corr_top25 = compute_correlations(df[df['Rank'] <= 25], corr_columns)

# Compile all correlations into a DataFrame for comparison
df_corr = pd.DataFrame({
    'All Players': corr_all,
    'FPTS > 0': corr_nonzero,
    'Top 50 Players': corr_top50,
    'Top 25 Players': corr_top25
})

# Calculate the average correlation across the four conditions, adding 'Average' column to DataFrame
df_corr['Correlation'] = df_corr.mean(axis=1)

# Display the correlation DataFrame
df_corr.round(2)

Unnamed: 0,All Players,FPTS > 0,Top 50 Players,Top 25 Players,Correlation
Y/A,0.65,0.37,0.18,0.45,0.41
LG,0.76,0.62,0.43,0.18,0.5
Y/R,0.68,0.47,0.1,0.23,0.37
ATT/game,0.91,0.89,0.79,0.6,0.8
YDS/game,0.92,0.9,0.82,0.67,0.83
20+/game,0.64,0.62,0.55,0.46,0.57
TD/game,0.85,0.84,0.81,0.78,0.82
REC/game,0.84,0.8,0.47,-0.01,0.52
TGT/game,0.84,0.81,0.5,0.02,0.54
YDS.1/game,0.8,0.77,0.44,0.15,0.54


### Assign the weights for the final stats

In [4]:
# Calculate R^2 for the 'Average' correlation, adding 'R^2' column to DataFrame
df_corr['R^2'] = df_corr['Correlation'] ** 2

# Select stats with R^2 above a threshold for higher weight
high_weight_threshold = 0.5
specific_stats = df_corr[df_corr['R^2'] > high_weight_threshold].index.tolist()

# Select stats with R^2 below a threshold to exclude from final score
exclude_threshold = 0.1
exclude_stats = df_corr[df_corr['R^2'] < exclude_threshold].index.tolist()

# Define the calculation to assign weights
def weight_calc(row, specific_stats):
    if row.name in specific_stats:
        return 1 + row['R^2'] * 2 # Assign higher weight to specific stats
    else:
        return 1 + row['R^2'] # Assign weight to all other stats
    
# Assign weights based on the given criteria, adding 'Weight' column to DataFrame
df_corr['Weight'] = df_corr.apply(weight_calc, specific_stats=specific_stats, axis=1)

# Normalize weights
df_corr['Weight'] = df_corr['Weight'] / df_corr['Weight'].sum()

# Display the new columns in the DataFrame
df_corr[['Correlation', 'R^2', 'Weight']].round(2)

Unnamed: 0,Correlation,R^2,Weight
Y/A,0.41,0.17,0.07
LG,0.5,0.25,0.07
Y/R,0.37,0.14,0.07
ATT/game,0.8,0.63,0.13
YDS/game,0.83,0.69,0.14
20+/game,0.57,0.32,0.08
TD/game,0.82,0.67,0.14
REC/game,0.52,0.27,0.08
TGT/game,0.54,0.29,0.08
YDS.1/game,0.54,0.29,0.08


### Multiply the assigned weights to the final stats and calculate the score

In [5]:
# Multiply each relevant column by its corresponding weight
for col in corr_columns:
    if col not in exclude_stats:
        weight = df_corr.loc[col, 'Weight']
        df[col + '_weighted'] = (df[col] * weight).round(1)

# Extract the weighted columns
weight_columns = [col + '_weighted' for col in corr_columns if col not in exclude_stats]

# Display the new weighted stats columns
df_weight = df[['Rank', 'Player', 'FPTS/G'] + weight_columns]

# Define select columns to be used for the average weighted score
avg = weight_columns + (['FPTS/G'] * 2)  # Giving higher weight to FPTS/G

# Calculate the average weighted score for the select columns
df['Score'] = df[avg].mean(axis=1).round(2)

# Normalize the scores to be out of 10
scaler = MinMaxScaler(feature_range=(0, 10))
df['Score'] = scaler.fit_transform(df[['Score']])

df_weight.head(10)

Unnamed: 0,Rank,Player,FPTS/G,Y/A_weighted,LG_weighted,Y/R_weighted,ATT/game_weighted,YDS/game_weighted,20+/game_weighted,TD/game_weighted,REC/game_weighted,TGT/game_weighted,YDS.1/game_weighted,TD.1/game_weighted
0,1,Alvin Kamara (NO),26.7,0.3,1.3,0.7,2.7,12.7,0.0,0.2,0.3,0.4,3.3,0.0
1,2,Saquon Barkley (PHI),24.5,0.4,4.8,0.5,2.4,15.3,0.1,0.1,0.2,0.3,1.6,0.0
2,3,Derrick Henry (BAL),23.4,0.4,6.4,0.6,2.7,16.8,0.1,0.2,0.1,0.1,0.9,0.0
3,4,Kyren Williams (LAR),20.4,0.2,1.0,0.3,2.4,9.1,0.0,0.2,0.2,0.3,1.3,0.0
4,5,Jordan Mason (SF),18.6,0.3,1.8,0.6,3.1,15.7,0.1,0.1,0.1,0.1,1.1,0.0
5,6,Jonathan Taylor (IND),18.2,0.3,2.1,0.9,2.4,12.2,0.1,0.1,0.1,0.2,1.5,0.0
6,7,Aaron Jones (MIN),18.1,0.3,2.9,0.6,2.1,11.3,0.0,0.0,0.3,0.4,2.7,0.0
7,8,Jahmyr Gibbs (DET),17.8,0.4,1.8,0.5,1.8,10.0,0.1,0.1,0.2,0.2,1.4,0.0
8,9,David Montgomery (DET),17.4,0.3,1.6,0.7,2.1,9.5,0.0,0.1,0.2,0.2,1.8,0.0
9,10,De'Von Achane (MIA),16.8,0.2,1.3,0.6,1.8,5.8,0.0,0.0,0.4,0.4,3.6,0.0


### Model training

In [6]:
# Prepare features and target for model training
X = df[weight_columns]
y = df['Score']

# Handle missing values by imputing with mean
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(X, y, df.index, test_size=0.2, random_state=42)

# Initialize Ridge regression model
ridge_model = Ridge()

# Cross-validation to evaluate the model
cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Cross-Validation MSE: {-cv_scores.mean()}')

# Train the model
ridge_model.fit(X_train, y_train)

# Predict and evaluate on the test set
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Test Set MSE: {mse}')

# Incorporate Random Forest as an ensemble method
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
print(f'Random Forest Test Set MSE: {rf_mse}')

# Average predictions from both models for final score
final_pred = (y_pred + rf_pred) / 2

# Create a DataFrame for the test set predictions
test_results = pd.DataFrame({'Final_Score': final_pred}, index=test_idx)

# Merge the test set predictions back into the original DataFrame
df = df.merge(test_results, how='left', left_index=True, right_index=True)

# Fill NaN values in 'Final_Score' column with the original 'Score' to handle missing indices
df['Final_Score'].fillna(df['Score'], inplace=True)

# Normalize final scores to be out of 10
df['Final_Score'] = scaler.fit_transform(df[['Final_Score']]).round(2)

# Rank the final scores
df['Final Rank'] = df['Final_Score'].rank(method='first', ascending=False).astype(int)

# Calculate the variance in ranking both ranks
df['Variance'] = df['Rank'] - df['Final Rank']

Cross-Validation MSE: 0.6155892177456155
Test Set MSE: 0.6373473312330491
Random Forest Test Set MSE: 0.1438676730114685


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Final_Score'].fillna(df['Score'], inplace=True)


### Display final results and export to Excel

In [7]:
# List of columns to be excluded
final_columns_exclude = ['Y/R', 'LG']

# Exclude the specified columns from final_columns
final_columns = [col for col in final_columns if col not in final_columns_exclude]

# Create final analysis columns
analysis = df[['Rank', 'Final Rank', 'Player', 'Final_Score', 'Variance'] + final_columns]
analysis.set_index('Rank', inplace=True)
analysis = analysis.sort_values(by='Final Rank', ascending=True)

# Export to Excel
analysis.to_excel("RB_Analysis.xlsx", index=False)

# Display the top 30 rows
analysis.head(30)

Unnamed: 0_level_0,Final Rank,Player,Final_Score,Variance,Y/A,G,FPTS,FPTS/G,ATT/game,YDS/game,20+/game,TD/game,REC/game,TGT/game,YDS.1/game,TD.1/game
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,Alvin Kamara (NO),10.0,0,4.5,4,106.6,26.7,20.0,90.5,0.0,1.2,4.2,5.0,43.5,0.2
3,2,Derrick Henry (BAL),9.98,1,6.0,4,93.5,23.4,20.0,120.0,1.5,1.2,1.2,1.8,11.2,0.2
2,3,Saquon Barkley (PHI),9.93,-1,6.0,4,98.0,24.5,18.2,108.8,1.0,1.0,3.0,3.8,21.2,0.2
21,4,Kenneth Walker III (SEA),9.72,17,5.7,2,52.5,26.3,16.0,91.5,2.0,2.0,3.0,4.0,21.0,0.0
5,5,Jordan Mason (SF),8.1,0,4.9,4,74.4,18.6,22.8,111.8,1.0,0.8,1.5,1.8,14.2,0.0
7,6,Aaron Jones (MIN),7.68,1,5.0,4,72.4,18.1,16.0,80.2,0.2,0.2,4.0,4.8,35.8,0.2
6,7,Jonathan Taylor (IND),7.61,-1,4.8,4,72.6,18.2,18.0,87.2,1.0,1.0,1.5,2.5,19.2,0.0
4,8,Kyren Williams (LAR),7.55,-4,3.5,4,81.6,20.4,18.2,64.5,0.0,1.2,3.2,3.5,17.0,0.2
37,9,Joe Mixon (HOU),7.4,28,4.7,2,34.8,17.4,19.5,92.0,0.0,0.5,3.0,4.0,22.0,0.0
8,10,Jahmyr Gibbs (DET),7.09,-2,5.3,4,71.1,17.8,13.5,71.2,0.8,0.8,2.8,3.2,19.0,0.2
