In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Function to read, preprocess, and aggregate full season data
def read_data():
    data = pd.read_excel("WR_Data.xlsx")
    data.drop(['FL', 'ROST'], axis=1, inplace=True)
    convert = data.select_dtypes('object').columns.difference(['Player'])
    data[convert] = data[convert].apply(lambda x: pd.to_numeric(x.str.replace(',', ''), errors='coerce')).fillna(0)
    data['Rank'] = data['Rank'].astype('Int64')
    return data

# Load the full season data
df = read_data()

# Recalculate 'Rank' based on 'FPTS' in descending order
df['Rank'] = df['FPTS'].rank(ascending=False, method='min').astype('int')

# Define columns for per-game calculation
per_game = list(df.columns[2:])
exclude_per_game = ['Y/R', 'LG', 'G', 'FPTS', 'FPTS/G']

# Create per-game stats
for col in per_game:
    if col not in exclude_per_game:
        df[col + '/game'] = (df[col] / df['G']).round(1)

# Define final columns for analysis
final_columns = exclude_per_game + [col + '/game' for col in per_game if col not in exclude_per_game]

# Correlation calculation
exclude_corr = ['FPTS/G', 'FPTS', 'G']
corr_columns = [col for col in final_columns if col not in exclude_corr]

def compute_correlations(dataframe, corr_columns):
    return dataframe[corr_columns].corrwith(dataframe['FPTS/G'])

corr_all = compute_correlations(df, corr_columns)
corr_nonzero = compute_correlations(df[df['FPTS/G'] > 0], corr_columns)
corr_top50 = compute_correlations(df[df['Rank'] <= 50], corr_columns)
corr_top25 = compute_correlations(df[df['Rank'] <= 25], corr_columns)

df_corr = pd.DataFrame({
    'All Players': corr_all,
    'FPTS > 0': corr_nonzero,
    'Top 50 Players': corr_top50,
    'Top 25 Players': corr_top25
})

df_corr['Correlation'] = df_corr.mean(axis=1)
df_corr['R^2'] = df_corr['Correlation'] ** 2

high_weight_threshold = 0.5
specific_stats = df_corr[df_corr['R^2'] > high_weight_threshold].index.tolist()
exclude_threshold = 0.1
exclude_stats = df_corr[df_corr['R^2'] < exclude_threshold].index.tolist()

def weight_calc(row, specific_stats):
    if row.name in specific_stats:
        return 1 + row['R^2'] * 2
    else:
        return 1 + row['R^2']

df_corr['Weight'] = df_corr.apply(weight_calc, specific_stats=specific_stats, axis=1)
df_corr['Weight'] = df_corr['Weight'] / df_corr['Weight'].sum()

# Apply weights
for col in corr_columns:
    if col not in exclude_stats:
        weight = df_corr.loc[col, 'Weight']
        df[col + '_weighted'] = (df[col] * weight).round(1)

weight_columns = [col + '_weighted' for col in corr_columns if col not in exclude_stats]
avg = weight_columns + (['FPTS/G'] * 2)

df['Score'] = df[avg].mean(axis=1).round(2)

scaler = MinMaxScaler(feature_range=(0, 10))
df['Score'] = scaler.fit_transform(df[['Score']])

# Prepare features and target for model training
X = df[weight_columns]
y = df['Score']

imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(X, y, df.index, test_size=0.2, random_state=42)

ridge_model = Ridge()
cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
#print(f'Cross-Validation MSE: {-cv_scores.mean()}')

ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
#print(f'Test Set MSE: {mse}')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
#print(f'Random Forest Test Set MSE: {rf_mse}')

final_pred = (y_pred + rf_pred) / 2

test_results = pd.DataFrame({'Final_Score': final_pred}, index=test_idx)
df = df.merge(test_results, how='left', left_index=True, right_index=True)
df['Final_Score'].fillna(df['Score'], inplace=True)
df['Final_Score'] = scaler.fit_transform(df[['Final_Score']]).round(2)
df['Final Rank'] = df['Final_Score'].rank(method='first', ascending=False).astype(int)
df['Variance'] = df['Rank'] - df['Final Rank']

final_columns_exclude = ['Y/R', 'LG', 'ATT/game', 'YDS.1/game', 'TD.1/game']
final_columns = [col for col in final_columns if col not in final_columns_exclude]

analysis = df[['Rank', 'Final Rank', 'Player', 'Final_Score', 'Variance'] + final_columns]
analysis.set_index('Rank', inplace=True)
analysis = analysis.sort_values(by='Final Rank', ascending=True)
#analysis.to_excel("WR_Analysis.xlsx", index=False)

analysis.head(30)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Final_Score'].fillna(df['Score'], inplace=True)


Unnamed: 0_level_0,Final Rank,Player,Final_Score,Variance,G,FPTS,FPTS/G,REC/game,TGT/game,YDS/game,20+/game,TD/game
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,1,Tyreek Hill (MIA),10.0,1,16,376.4,23.5,7.4,10.7,112.4,1.8,0.8
1,2,CeeDee Lamb (DAL),9.34,-1,17,403.2,23.7,7.9,10.6,102.9,1.7,0.7
3,3,Amon-Ra St. Brown (DET),8.81,0,16,330.9,20.7,7.4,10.2,94.7,1.5,0.6
8,4,Keenan Allen (CHI),8.8,4,13,278.9,21.5,8.3,11.5,95.6,1.5,0.5
33,5,Justin Jefferson (MIN),8.76,28,10,202.2,20.2,6.8,10.0,107.4,2.5,0.5
4,6,Puka Nacua (LAR),7.88,-2,17,298.5,17.6,6.2,9.4,87.4,1.5,0.4
12,7,Nico Collins (HOU),7.73,5,15,260.4,17.4,5.3,7.3,86.5,1.7,0.5
5,8,A.J. Brown (PHI),7.5,-3,17,289.6,17.0,6.2,9.3,85.6,1.2,0.4
6,9,DJ Moore (CHI),7.32,-3,17,286.5,16.9,5.6,8.0,80.2,1.5,0.5
11,10,Ja'Marr Chase (CIN),7.3,1,16,262.7,16.4,6.2,9.1,76.0,0.9,0.4
