### Import libraries and create DataFrame from csv 

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Function to read, preprocess, and aggregate full season data
def read_data():
    # Read the full season data
    data = pd.read_excel("RB_Data_2025.xlsx")

    # Preprocess full season data, drop irrelevant columns and convert data types
    data.drop(['FL', 'ROST'], axis=1, inplace=True)
    convert = data.select_dtypes('object').columns.difference(['Player'])
    data[convert] = data[convert].apply(lambda x: pd.to_numeric(x.str.replace(',', ''), errors='coerce')).fillna(0)
    data['Rank'] = data['Rank'].astype('Int64')

    return data

# Load the full season data
df = read_data()

# Recalculate 'Rank' based on 'FPTS' in descending order
df['Rank'] = df['FPTS'].rank(ascending=False, method='min').astype('int')

# Sort the DataFrame by 'Rank'
df = df.sort_values(by='Rank')
df.head(20)

Unnamed: 0,Rank,Player,ATT,YDS,Y/A,LG,20+,TD,REC,TGT,YDS.1,Y/R,TD.1,G,FPTS,FPTS/G
0,1,Jahmyr Gibbs (DET),250,1412,5.6,70,13,16,52,63,517,9.9,4,17,362.9,21.3
1,2,Saquon Barkley (PHI),345,2005,5.8,72,17,13,33,43,278,8.4,2,16,355.3,22.2
2,3,Bijan Robinson (ATL),304,1456,4.8,37,5,14,61,72,431,7.1,1,17,341.7,20.1
3,4,Derrick Henry (BAL),325,1921,5.9,87,19,16,19,22,193,10.2,2,17,336.4,19.8
4,5,De'Von Achane (MIA),203,907,4.5,61,5,6,78,87,592,7.6,6,17,299.9,17.6
5,6,Josh Jacobs (GB),301,1329,4.4,38,7,15,36,43,342,9.5,1,17,293.1,17.2
6,7,Kyren Williams (LAR),316,1299,4.1,30,2,14,34,40,182,5.4,2,16,272.1,17.0
7,8,James Cook (BUF),207,1009,4.9,65,6,16,32,38,258,8.1,2,16,266.7,16.7
8,9,Alvin Kamara (NO),228,950,4.2,24,3,6,68,89,543,8.0,2,14,265.3,19.0
9,10,Chase Brown (CIN),229,990,4.3,40,8,7,54,65,360,6.7,4,16,255.0,15.9


### Convert relevant stats to a per-game basis and create the final stats for analysis

In [44]:
# Define columns for per-game calculation
per_game = list(df.columns[2:])

# Define columns to exclude from the per-game calculation
exclude_per_game = ['Y/A', 'LG', 'Y/R', 'G', 'FPTS', 'FPTS/G']

# Base columns that will get a /game version
base_cols = [c for c in per_game if c not in exclude_per_game]
per_game_cols = [f'{c}/game' for c in base_cols]

# Create per-game columns (safe divide if G can be 0)
for col in base_cols:
    df[f'{col}/game'] = df[col].div(df['G'].replace(0, np.nan)).round(1).fillna(0)

# Keep using THIS for downstream analysis (unchanged behavior)
final_columns = exclude_per_game + per_game_cols

# Separate list just for showing original next to per-game
display_cols = []
for col in per_game:
    if col in exclude_per_game:
        if col in df.columns:
            display_cols.append(col)                 # keep excluded as-is
    else:
        display_cols.extend([col, f'{col}/game'])    # interleave original + per-game

# Display: originals next to per-game
df[['Rank', 'Player'] + display_cols].head(10)

# # Create columns and convert stats to a per-game basis for the defined columns, not those in the exclude list
# for col in per_game:
#     if col not in exclude_per_game:
#         df[col + '/game'] = (df[col] / df['G']).round(1)

# # Define final columns to be used for analysis, combining excluded and new per-game columns
# final_columns = exclude_per_game + [col + '/game' for col in per_game if col not in exclude_per_game]

# # Display final columns with 'Rank' and 'Player'
# df[['Rank', 'Player'] + final_columns].head(10)

Unnamed: 0,Rank,Player,ATT,ATT/game,YDS,YDS/game,Y/A,LG,20+,20+/game,...,TGT,TGT/game,YDS.1,YDS.1/game,Y/R,TD.1,TD.1/game,G,FPTS,FPTS/G
0,1,Jahmyr Gibbs (DET),250,14.7,1412,83.1,5.6,70,13,0.8,...,63,3.7,517,30.4,9.9,4,0.2,17,362.9,21.3
1,2,Saquon Barkley (PHI),345,21.6,2005,125.3,5.8,72,17,1.1,...,43,2.7,278,17.4,8.4,2,0.1,16,355.3,22.2
2,3,Bijan Robinson (ATL),304,17.9,1456,85.6,4.8,37,5,0.3,...,72,4.2,431,25.4,7.1,1,0.1,17,341.7,20.1
3,4,Derrick Henry (BAL),325,19.1,1921,113.0,5.9,87,19,1.1,...,22,1.3,193,11.4,10.2,2,0.1,17,336.4,19.8
4,5,De'Von Achane (MIA),203,11.9,907,53.4,4.5,61,5,0.3,...,87,5.1,592,34.8,7.6,6,0.4,17,299.9,17.6
5,6,Josh Jacobs (GB),301,17.7,1329,78.2,4.4,38,7,0.4,...,43,2.5,342,20.1,9.5,1,0.1,17,293.1,17.2
6,7,Kyren Williams (LAR),316,19.8,1299,81.2,4.1,30,2,0.1,...,40,2.5,182,11.4,5.4,2,0.1,16,272.1,17.0
7,8,James Cook (BUF),207,12.9,1009,63.1,4.9,65,6,0.4,...,38,2.4,258,16.1,8.1,2,0.1,16,266.7,16.7
8,9,Alvin Kamara (NO),228,16.3,950,67.9,4.2,24,3,0.2,...,89,6.4,543,38.8,8.0,2,0.1,14,265.3,19.0
9,10,Chase Brown (CIN),229,14.3,990,61.9,4.3,40,8,0.5,...,65,4.1,360,22.5,6.7,4,0.2,16,255.0,15.9


### Calculate the correlations for the final stats across different conditions

In [45]:
# Define columns and columns to exclude for correlation calculation, excluding FPTS and FPTS/G
exclude_corr = ['FPTS/G', 'FPTS', 'G', 'LG']
corr_columns = [col for col in final_columns if col not in exclude_corr]

# Define a function to calculate correlations
def compute_correlations(dataframe, corr_columns):
    return dataframe[corr_columns].corrwith(dataframe['FPTS/G'])

# Compute correlations for various conditions
corr_all = compute_correlations(df, corr_columns)
corr_nonzero = compute_correlations(df[df['FPTS/G'] > 0], corr_columns)
corr_top50 = compute_correlations(df[df['Rank'] <= 50], corr_columns)
corr_top25 = compute_correlations(df[df['Rank'] <= 25], corr_columns)

# Compile all correlations into a DataFrame for comparison
df_corr = pd.DataFrame({
    'All Players': corr_all,
    'FPTS > 0': corr_nonzero,
    'Top 50 Players': corr_top50,
    'Top 25 Players': corr_top25
})

# Calculate the average correlation across the four conditions, adding 'Average' column to DataFrame
df_corr['Correlation'] = df_corr.mean(axis=1)

# Display the correlation DataFrame
df_corr.round(2)

Unnamed: 0,All Players,FPTS > 0,Top 50 Players,Top 25 Players,Correlation
Y/A,0.66,0.34,0.24,0.63,0.47
Y/R,0.56,0.22,0.05,0.26,0.27
ATT/game,0.95,0.92,0.85,0.6,0.83
YDS/game,0.96,0.94,0.86,0.75,0.88
20+/game,0.79,0.75,0.65,0.58,0.69
TD/game,0.89,0.87,0.82,0.65,0.81
REC/game,0.89,0.83,0.55,0.07,0.58
TGT/game,0.89,0.82,0.55,0.06,0.58
YDS.1/game,0.87,0.8,0.53,0.17,0.59
TD.1/game,0.59,0.48,0.2,0.11,0.35


### Assign the weights for the final stats

In [46]:
# Calculate R^2 for the 'Average' correlation, adding 'R^2' column to DataFrame
df_corr['R^2'] = df_corr['Correlation'] ** 2

# Select stats with R^2 above a threshold for higher weight
high_weight_threshold = 0.5
specific_stats = df_corr[df_corr['R^2'] > high_weight_threshold].index.tolist()

# Select stats with R^2 below a threshold to exclude from final score
exclude_threshold = 0.1
exclude_stats = df_corr[df_corr['R^2'] < exclude_threshold].index.tolist()

# Define the calculation to assign weights
def weight_calc(row, specific_stats):
    if row.name in specific_stats:
        return 1 + row['R^2'] * 2 # Assign higher weight to specific stats
    else:
        return 1 + row['R^2'] # Assign weight to all other stats
    
# Assign weights based on the given criteria, adding 'Weight' column to DataFrame
df_corr['Weight'] = df_corr.apply(weight_calc, specific_stats=specific_stats, axis=1)

# Normalize weights
df_corr['Weight'] = df_corr['Weight'] / df_corr['Weight'].sum()

# Display the new columns in the DataFrame
df_corr[['Correlation', 'R^2', 'Weight']].round(2)

Unnamed: 0,Correlation,R^2,Weight
Y/A,0.47,0.22,0.08
Y/R,0.27,0.07,0.07
ATT/game,0.83,0.69,0.15
YDS/game,0.88,0.77,0.16
20+/game,0.69,0.48,0.09
TD/game,0.81,0.65,0.14
REC/game,0.58,0.34,0.08
TGT/game,0.58,0.34,0.08
YDS.1/game,0.59,0.35,0.08
TD.1/game,0.35,0.12,0.07


### Multiply the assigned weights to the final stats and calculate the score

In [47]:
# Multiply each relevant column by its corresponding weight
for col in corr_columns:
    if col not in exclude_stats:
        weight = df_corr.loc[col, 'Weight']
        df[col + '_weighted'] = (df[col] * (1 + weight)).round(1)

# Extract the weighted columns
weight_columns = [col + '_weighted' for col in corr_columns if col not in exclude_stats]

# Add FPTS/G and FPTS as weighted columns
# df['FPTS/G_weighted'] = (df['FPTS/G'] * 1).round(1)
df['FPTS_weighted'] = (df['FPTS'] * .25).round(1)

# Define select columns to be used for the average weighted score
avg = weight_columns + ['FPTS_weighted', 'FPTS/G']

# # Display the new weighted stats columns
# df_weight = df[['Rank', 'Player', 'FPTS/G'] + weight_columns]

# # Define select columns to be used for the average weighted score
# avg = weight_columns + (['FPTS/G'] * 2)  # Giving higher weight to FPTS/G

# Calculate the average weighted score for the select columns
df['Score'] = df[avg].mean(axis=1).round(2)

# # Normalize the scores to be out of 10
scaler = MinMaxScaler(feature_range=(0, 10))
# df['Score'] = scaler.fit_transform(df[['Score']])

# Display the weighted pieces + the raw avg and final score
df_weight = df[['Rank', 'Player', 'Score', 'G', 'FPTS', 'FPTS_weighted', 'FPTS/G'] + weight_columns] \
              .sort_values(by='Score', ascending=False)

df_weight.head(30)

Unnamed: 0,Rank,Player,Score,G,FPTS,FPTS_weighted,FPTS/G,Y/A_weighted,ATT/game_weighted,YDS/game_weighted,20+/game_weighted,TD/game_weighted,REC/game_weighted,TGT/game_weighted,YDS.1/game_weighted,TD.1/game_weighted
1,2,Saquon Barkley (PHI),28.48,16,355.3,88.8,22.2,6.2,24.8,145.0,1.2,0.9,2.3,2.9,18.9,0.1
3,4,Derrick Henry (BAL),25.47,17,336.4,84.1,19.8,6.3,21.9,130.8,1.2,1.0,1.2,1.4,12.4,0.1
0,1,Jahmyr Gibbs (DET),24.86,17,362.9,90.7,21.3,6.0,16.9,96.2,0.9,1.0,3.4,4.0,32.9,0.2
2,3,Bijan Robinson (ATL),24.32,17,341.7,85.4,20.1,5.2,20.5,99.1,0.3,0.9,3.9,4.5,27.5,0.1
11,12,Jonathan Taylor (IND),22.1,14,244.7,61.2,17.5,5.1,24.8,118.3,0.9,0.9,1.4,2.4,10.5,0.1
8,9,Alvin Kamara (NO),22.01,14,265.3,66.3,19.0,4.5,18.7,78.6,0.2,0.5,5.3,6.9,42.0,0.1
5,6,Josh Jacobs (GB),21.3,17,293.1,73.3,17.2,4.7,20.3,90.5,0.4,1.0,2.3,2.7,21.8,0.1
6,7,Kyren Williams (LAR),20.43,16,272.1,68.0,17.0,4.4,22.7,94.0,0.1,1.0,2.3,2.7,12.4,0.1
4,5,De'Von Achane (MIA),20.21,17,299.9,75.0,17.6,4.8,13.7,61.8,0.3,0.5,5.0,5.5,37.7,0.4
16,17,Joe Mixon (HOU),19.85,14,240.5,60.1,17.2,4.4,20.1,84.0,0.7,0.9,2.8,4.0,24.0,0.1


### Model training

In [48]:
# # Prepare features and target for model training
# X = df[weight_columns]
# y = df['Score']

# Build leakage-free feature set: raw stats you used to INFORM the score
# (i.e., the unweighted versions), plus availability signals.
base_feature_cols = [c for c in corr_columns if c in df.columns]  # raw per-game (ideally already shrunk)
extra_cols = [c for c in ['FPTS/G', 'FPTS', 'G'] if c in df.columns]
model_cols = base_feature_cols + extra_cols

X = df[model_cols].copy()
y = df['Score']            # or df['Final_Score'] if you prefer the post-ensemble target

# Handle missing values by imputing with mean
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(X, y, df.index, test_size=0.2, random_state=42)

# Initialize Ridge regression model
ridge_model = Ridge()

# Cross-validation to evaluate the model
cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Cross-Validation MSE: {-cv_scores.mean()}')

# Train the model
ridge_model.fit(X_train, y_train)

# Predict and evaluate on the test set
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Test Set MSE: {mse}')

# Incorporate Random Forest as an ensemble method
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
print(f'Random Forest Test Set MSE: {rf_mse}')

# Average predictions from both models for final score
final_pred = (y_pred + rf_pred) / 2

# Create a DataFrame for the test set predictions
test_results = pd.DataFrame({'Final_Score': final_pred}, index=test_idx)

# Merge the test set predictions back into the original DataFrame
df = df.merge(test_results, how='left', left_index=True, right_index=True)

# Fill NaN values in 'Final_Score' column with the original 'Score' to handle missing indices
df['Final_Score'].fillna(df['Score'], inplace=True)

# Normalize final scores to be out of 10
df['Final_Score'] = scaler.fit_transform(df[['Final_Score']]).round(2)

# Rank the final scores
df['Final Rank'] = df['Final_Score'].rank(method='first', ascending=False).astype(int)

# Calculate the variance in ranking both ranks
df['Variance'] = df['Rank'] - df['Final Rank']

Cross-Validation MSE: 8.990060410246948e-05
Test Set MSE: 0.00012440605502097966
Random Forest Test Set MSE: 0.4440845724528306


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Final_Score'].fillna(df['Score'], inplace=True)


### Display final results and export to Excel

In [49]:
# List of columns to be excluded
final_columns_exclude = ['Y/R', 'LG']

# # Exclude the specified columns from final_columns
# final_columns = [col for col in final_columns if col not in final_columns_exclude]

# # Create final analysis columns
# analysis = df[['Rank', 'Final Rank', 'Player', 'Final_Score', 'Variance'] + final_columns]
# analysis.set_index('Rank', inplace=True)
# analysis = analysis.sort_values(by='Final Rank', ascending=True)

# Build export columns by filtering the interleaved list you created earlier
# (display_cols already has [original, original/game] pairs in the right order)
export_cols = [c for c in display_cols if c not in set(final_columns_exclude) and c in df.columns]

# If you want to pin certain stats at the top in pairs, do it here (optional):
pinned = ['G', 'FPTS', 'FPTS/G']
export_cols = pinned + [c for c in export_cols if c not in pinned]
# pinned_pairs = [x for p in pinned for x in (p, f'{p}/game') if x in export_cols]
# export_cols = pinned_pairs + [c for c in export_cols if c not in pinned_pairs]

# Build the final analysis view using the interleaved columns
analysis = df[['Rank', 'Final Rank', 'Player', 'Final_Score', 'Score', 'Variance'] + export_cols] \
            .set_index('Rank') \
            .sort_values(by='Final Rank', ascending=True)

# Export to Excel
analysis.to_excel("RB_Analysis.xlsx", index=True)

# Display the top 30 rows
analysis.head(30)

Unnamed: 0_level_0,Final Rank,Player,Final_Score,Score,Variance,G,FPTS,FPTS/G,ATT,ATT/game,...,TD,TD/game,REC,REC/game,TGT,TGT/game,YDS.1,YDS.1/game,TD.1,TD.1/game
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,Saquon Barkley (PHI),10.0,28.48,1,16,355.3,22.2,345,21.6,...,13,0.8,33,2.1,43,2.7,278,17.4,2,0.1
4,2,Derrick Henry (BAL),8.94,25.47,2,17,336.4,19.8,325,19.1,...,16,0.9,19,1.1,22,1.3,193,11.4,2,0.1
1,3,Jahmyr Gibbs (DET),8.73,24.86,-2,17,362.9,21.3,250,14.7,...,16,0.9,52,3.1,63,3.7,517,30.4,4,0.2
3,4,Bijan Robinson (ATL),8.54,24.32,-1,17,341.7,20.1,304,17.9,...,14,0.8,61,3.6,72,4.2,431,25.4,1,0.1
12,5,Jonathan Taylor (IND),7.76,22.1,7,14,244.7,17.5,303,21.6,...,11,0.8,18,1.3,31,2.2,136,9.7,1,0.1
9,6,Alvin Kamara (NO),7.73,22.01,3,14,265.3,19.0,228,16.3,...,6,0.4,68,4.9,89,6.4,543,38.8,2,0.1
6,7,Josh Jacobs (GB),7.48,21.3,-1,17,293.1,17.2,301,17.7,...,15,0.9,36,2.1,43,2.5,342,20.1,1,0.1
7,8,Kyren Williams (LAR),7.38,20.43,-1,16,272.1,17.0,316,19.8,...,14,0.9,34,2.1,40,2.5,182,11.4,2,0.1
5,9,De'Von Achane (MIA),7.1,20.21,-4,17,299.9,17.6,203,11.9,...,6,0.4,78,4.6,87,5.1,592,34.8,6,0.4
17,10,Joe Mixon (HOU),6.98,19.85,7,14,240.5,17.2,245,17.5,...,11,0.8,36,2.6,52,3.7,309,22.1,1,0.1
