### Import libraries, load the .csv file, and create the DataFrame

In [1]:
import pandas as pd
import numpy as np

# Read the statistics export into a DataFrame
df = pd.read_csv(filepath_or_buffer="FantasyPros_Fantasy_Football_Statistics_WR.csv")

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,Rank,Player,REC,TGT,YDS,Y/R,LG,20+,TD,ATT,YDS.1,TD.1,FL,G,FPTS,FPTS/G,ROST
0,1.0,Tyreek Hill (MIA),42.0,59.0,814.0,19.4,69.0,30.0,6.0,1.0,14.0,0.0,0.0,6.0,160.8,26.8,100.0%
1,2.0,Stefon Diggs (BUF),49.0,66.0,620.0,12.7,55.0,13.0,5.0,0.0,0.0,0.0,1.0,6.0,139.0,23.2,100.0%
2,3.0,Adam Thielen (CAR),49.0,59.0,509.0,10.4,30.0,7.0,4.0,1.0,6.0,0.0,0.0,6.0,126.5,21.1,93.4%
3,4.0,Keenan Allen (LAC),42.0,55.0,519.0,12.4,42.0,13.0,4.0,2.0,6.0,0.0,0.0,5.0,124.5,24.9,99.4%
4,5.0,Ja'Marr Chase (CIN),50.0,73.0,556.0,11.1,63.0,10.0,3.0,1.0,2.0,0.0,0.0,6.0,123.8,20.6,100.0%
5,6.0,Puka Nacua (LAR),50.0,70.0,598.0,12.0,37.0,12.0,2.0,2.0,4.0,0.0,0.0,6.0,122.2,20.4,96.5%
6,7.0,A.J. Brown (PHI),42.0,60.0,672.0,16.0,59.0,20.0,2.0,0.0,0.0,0.0,0.0,6.0,121.2,20.2,100.0%
7,8.0,DJ Moore (CHI),32.0,42.0,582.0,18.2,58.0,23.0,5.0,0.0,0.0,0.0,0.0,6.0,120.2,20.0,97.6%
8,9.0,Justin Jefferson (MIN),36.0,53.0,571.0,15.9,52.0,23.0,3.0,0.0,0.0,0.0,1.0,5.0,109.1,21.8,99.4%
9,10.0,Davante Adams (LV),39.0,59.0,471.0,12.1,32.0,6.0,3.0,0.0,0.0,0.0,0.0,6.0,104.1,17.4,100.0%


### Find the column numbers (indices) in the df

In [2]:
# Pair every column name with its zero-based position in the DataFrame
column_numbers = [(position, name) for position, name in enumerate(df.columns)]

column_numbers

[(0, 'Rank'),
 (1, 'Player'),
 (2, 'REC'),
 (3, 'TGT'),
 (4, 'YDS'),
 (5, 'Y/R'),
 (6, 'LG'),
 (7, '20+'),
 (8, 'TD'),
 (9, 'ATT'),
 (10, 'YDS.1'),
 (11, 'TD.1'),
 (12, 'FL'),
 (13, 'G'),
 (14, 'FPTS'),
 (15, 'FPTS/G'),
 (16, 'ROST')]

### Check the data types of the columns

In [3]:
# Show the dtype pandas inferred for each column; 'object' columns hold non-numeric values
df.dtypes

Rank      float64
Player     object
REC       float64
TGT       float64
YDS       float64
Y/R       float64
LG        float64
20+       float64
TD        float64
ATT       float64
YDS.1     float64
TD.1      float64
FL        float64
G         float64
FPTS      float64
FPTS/G    float64
ROST       object
dtype: object

### Convert columns with an 'object' data type to a 'float64' data type

In [4]:
# Object-dtype columns that must stay as text: 'Player' is a name and 'ROST'
# holds percentage strings such as "100.0%", which this cleanup does not parse
non_numeric_columns = {'Player', 'ROST'}

# Remaining object-dtype columns are numeric values stored as text.
# Filtering (instead of list.remove) avoids a ValueError when 'Player' or
# 'ROST' is not reported as object dtype, e.g. when the cell is re-run.
convert = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in non_numeric_columns
]

# Convert each column to float64
for col in convert:
    # Drop thousands separators; regex=False keeps the match literal on every pandas version
    cleaned = df[col].str.replace(',', '', regex=False).str.strip()
    # Only a value that is exactly '-' is a "no data" placeholder; a leading
    # minus sign on a real number (e.g. '-5') must be preserved
    cleaned = cleaned.replace('-', '0')
    # astype(float) still raises on anything unparseable, so bad data is not hidden
    df[col] = cleaned.astype(float)

# Verify the data types
df.dtypes

Rank      float64
Player     object
REC       float64
TGT       float64
YDS       float64
Y/R       float64
LG        float64
20+       float64
TD        float64
ATT       float64
YDS.1     float64
TD.1      float64
FL        float64
G         float64
FPTS      float64
FPTS/G    float64
ROST       object
dtype: object

### Convert the relevant stats to a per game basis

In [5]:
# Define relevant columns: every stat from 'REC' through 'FPTS' (inclusive).
# Selecting by label instead of by position keeps the selection correct even
# if the CSV gains or loses a leading column.
relevant_columns = df.loc[:, 'REC':'FPTS'].columns.tolist()

# List of columns to exclude from the per-game calculation
exclude_from_per_game = ['Y/R', 'LG', 'FL', 'G', 'FPTS', 'FPTS/G']

# Treat zero games played as missing so the division below yields NaN rather
# than inf, which would otherwise distort any later correlation
games_played = df['G'].replace(0, np.nan)

# Convert stats to a per-game basis for only the columns not in the exclude list
per_game_source_columns = [col for col in relevant_columns
                           if col not in exclude_from_per_game]
for col in per_game_source_columns:
    df[col + '_per_game'] = (df[col] / games_played).round(1)

# Update the relevant columns list for correlation:
# the original columns that were excluded from the per-game calculation,
# followed by the new per-game columns for the rest
relevant_columns_for_correlation = exclude_from_per_game + \
                                  [col + '_per_game' for col in per_game_source_columns]

# Preview the first ten players with the original and per-game columns
df_average = df[['Rank', 'Player'] + relevant_columns_for_correlation].head(10)
df_average

Unnamed: 0,Rank,Player,Y/R,LG,FL,G,FPTS,FPTS/G,REC_per_game,TGT_per_game,YDS_per_game,20+_per_game,TD_per_game,ATT_per_game,YDS.1_per_game,TD.1_per_game
0,1.0,Tyreek Hill (MIA),19.4,69.0,0.0,6.0,160.8,26.8,7.0,9.8,135.7,5.0,1.0,0.2,2.3,0.0
1,2.0,Stefon Diggs (BUF),12.7,55.0,1.0,6.0,139.0,23.2,8.2,11.0,103.3,2.2,0.8,0.0,0.0,0.0
2,3.0,Adam Thielen (CAR),10.4,30.0,0.0,6.0,126.5,21.1,8.2,9.8,84.8,1.2,0.7,0.2,1.0,0.0
3,4.0,Keenan Allen (LAC),12.4,42.0,0.0,5.0,124.5,24.9,8.4,11.0,103.8,2.6,0.8,0.4,1.2,0.0
4,5.0,Ja'Marr Chase (CIN),11.1,63.0,0.0,6.0,123.8,20.6,8.3,12.2,92.7,1.7,0.5,0.2,0.3,0.0
5,6.0,Puka Nacua (LAR),12.0,37.0,0.0,6.0,122.2,20.4,8.3,11.7,99.7,2.0,0.3,0.3,0.7,0.0
6,7.0,A.J. Brown (PHI),16.0,59.0,0.0,6.0,121.2,20.2,7.0,10.0,112.0,3.3,0.3,0.0,0.0,0.0
7,8.0,DJ Moore (CHI),18.2,58.0,0.0,6.0,120.2,20.0,5.3,7.0,97.0,3.8,0.8,0.0,0.0,0.0
8,9.0,Justin Jefferson (MIN),15.9,52.0,1.0,5.0,109.1,21.8,7.2,10.6,114.2,4.6,0.6,0.0,0.0,0.0
9,10.0,Davante Adams (LV),12.1,32.0,0.0,6.0,104.1,17.4,6.5,9.8,78.5,1.0,0.5,0.0,0.0,0.0


### Calculate the correlations for relevant stats for different conditions

In [6]:
# Columns that must not be correlated against FPTS/G
columns_to_exclude_from_correlation = ['FPTS/G', 'FPTS', 'FL', 'G']

# Keep every previously selected column that is not on the exclusion list
relevant_columns_for_correlation = list(filter(
    lambda name: name not in columns_to_exclude_from_correlation,
    relevant_columns_for_correlation,
))

# Define a function to calculate correlations for given conditions
def compute_correlations(dataframe, columns=None, target='FPTS/G'):
    """Correlate a set of stat columns with a target column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to include in the calculation (e.g. a filtered subset of players).
    columns : list of str, optional
        Columns to correlate with ``target``. When omitted, the notebook-level
        ``relevant_columns_for_correlation`` list is used, which is the
        original behaviour of this function.
    target : str, default 'FPTS/G'
        Column that every entry of ``columns`` is correlated against.

    Returns
    -------
    pandas.Series
        One correlation value per entry of ``columns``, indexed by column name.
    """
    if columns is None:
        # Looked up at call time so later edits to the notebook-level list are honoured
        columns = relevant_columns_for_correlation
    return dataframe[columns].corrwith(dataframe[target])

# Correlations over four player subsets: everyone, players with a positive
# FPTS/G, and the players ranked in the top 50 and top 25
correlations_all = compute_correlations(df)
correlations_fpts_nonzero = compute_correlations(df.loc[df['FPTS/G'].gt(0)])
correlations_top50 = compute_correlations(df.loc[df['Rank'].le(50)])
correlations_top25 = compute_correlations(df.loc[df['Rank'].le(25)])

# Put the four result series side by side, one column per subset
all_correlations = pd.DataFrame(dict(zip(
    ['All Players', 'FPTS > 0', 'Top 50 Players', 'Top 25 Players'],
    [correlations_all, correlations_fpts_nonzero, correlations_top50, correlations_top25],
)))

# Row-wise mean of the four subset correlations
all_correlations = all_correlations.assign(Average=all_correlations.mean(axis=1))

all_correlations

Unnamed: 0,All Players,FPTS > 0,Top 50 Players,Top 25 Players,Average
Y/R,0.635537,0.282889,0.249528,0.267839,0.358948
LG,0.812363,0.652047,0.324482,0.246237,0.508782
REC_per_game,0.96384,0.953396,0.833826,0.788707,0.884942
TGT_per_game,0.943053,0.931442,0.747154,0.658914,0.820141
YDS_per_game,0.982126,0.978007,0.933963,0.912441,0.951634
20+_per_game,0.851262,0.829627,0.666728,0.581025,0.73216
TD_per_game,0.833102,0.808262,0.671759,0.697967,0.752773
ATT_per_game,0.111855,0.010732,-0.072266,-0.261851,-0.052882
YDS.1_per_game,0.055912,-0.036919,-0.061711,-0.181218,-0.055984
TD.1_per_game,0.078469,0.048686,-0.113648,-0.250473,-0.059242


### Assign the weights for the relevant stats

In [7]:
# Minimum average correlation a stat needs before it receives extra weight
weight_correlation_threshold = 0.7

# Calculate R^2 for the 'Average' correlation
all_correlations['R^2'] = all_correlations['Average'] ** 2

# Assign weights: 1 + R^2 where the average correlation exceeds the threshold,
# otherwise a neutral weight of 1. Series.where is vectorized (no row-wise
# apply) and always yields a float column; a NaN average compares as False
# and therefore also receives the neutral weight.
all_correlations['Weight'] = (1 + all_correlations['R^2']).where(
    all_correlations['Average'] > weight_correlation_threshold, 1.0
)

# Display the R^2 and weights for each column
weights = all_correlations[['Average', 'R^2', 'Weight']]
weights


Unnamed: 0,Average,R^2,Weight
Y/R,0.358948,0.128844,1.0
LG,0.508782,0.258859,1.0
REC_per_game,0.884942,0.783123,1.783123
TGT_per_game,0.820141,0.672631,1.672631
YDS_per_game,0.951634,0.905608,1.905608
20+_per_game,0.73216,0.536059,1.536059
TD_per_game,0.752773,0.566666,1.566666
ATT_per_game,-0.052882,0.002797,1.0
YDS.1_per_game,-0.055984,0.003134,1.0
TD.1_per_game,-0.059242,0.00351,1.0


### Apply the assigned weights to the relevant stats

In [8]:
# Scale every correlated stat by its weight, rounded to one decimal, and
# remember the name of each new '<stat>_weighted' column as it is created
weighted_columns = []
for stat_name, stat_weight in weights.loc[relevant_columns_for_correlation, 'Weight'].items():
    weighted_name = stat_name + '_weighted'
    df[weighted_name] = df[stat_name].mul(stat_weight).round(1)
    weighted_columns.append(weighted_name)

# Collect the player, FPTS/G and the weighted stats into one frame for viewing
df_weighted = df.loc[:, ['Player', 'FPTS/G'] + weighted_columns]

df_weighted.head(10)

Unnamed: 0,Player,FPTS/G,Y/R_weighted,LG_weighted,REC_per_game_weighted,TGT_per_game_weighted,YDS_per_game_weighted,20+_per_game_weighted,TD_per_game_weighted,ATT_per_game_weighted,YDS.1_per_game_weighted,TD.1_per_game_weighted
0,Tyreek Hill (MIA),26.8,19.4,69.0,12.5,16.4,258.6,7.7,1.6,0.2,2.3,0.0
1,Stefon Diggs (BUF),23.2,12.7,55.0,14.6,18.4,196.8,3.4,1.3,0.0,0.0,0.0
2,Adam Thielen (CAR),21.1,10.4,30.0,14.6,16.4,161.6,1.8,1.1,0.2,1.0,0.0
3,Keenan Allen (LAC),24.9,12.4,42.0,15.0,18.4,197.8,4.0,1.3,0.4,1.2,0.0
4,Ja'Marr Chase (CIN),20.6,11.1,63.0,14.8,20.4,176.6,2.6,0.8,0.2,0.3,0.0
5,Puka Nacua (LAR),20.4,12.0,37.0,14.8,19.6,190.0,3.1,0.5,0.3,0.7,0.0
6,A.J. Brown (PHI),20.2,16.0,59.0,12.5,16.7,213.4,5.1,0.5,0.0,0.0,0.0
7,DJ Moore (CHI),20.0,18.2,58.0,9.5,11.7,184.8,5.8,1.3,0.0,0.0,0.0
8,Justin Jefferson (MIN),21.8,15.9,52.0,12.8,17.7,217.6,7.1,0.9,0.0,0.0,0.0
9,Davante Adams (LV),17.4,12.1,32.0,11.6,16.4,149.6,1.5,0.8,0.0,0.0,0.0


### Define the columns to be used for the average weighted score

In [9]:
# Hand-picked inputs for the "average" score
average_columns = [
    'REC_per_game_weighted',
    'TGT_per_game_weighted',
    'YDS_per_game_weighted',
    '20+_per_game_weighted',
    'TD_per_game_weighted',
    'FPTS/G',
]

# Inputs for the "average2" score: the weighted version of every stat whose
# average correlation is above 0.7 (if that weighted column exists), plus FPTS/G
columns_gt_0_7 = weights.loc[weights['Average'] > 0.7].index.tolist()
average2_columns = [
    weighted_name
    for weighted_name in (stat + '_weighted' for stat in columns_gt_0_7)
    if weighted_name in df.columns
] + ['FPTS/G']

# Display the columns selected for the "average2" calculation
average2_columns

['REC_per_game_weighted',
 'TGT_per_game_weighted',
 'YDS_per_game_weighted',
 '20+_per_game_weighted',
 'TD_per_game_weighted',
 'FPTS/G']

### Calculate each player's average weighted score

In [10]:
# Calculate "average": row-wise mean of the hand-picked columns, one decimal
df['average'] = df[average_columns].mean(axis=1).round(1)

# Calculate "average2": row-wise mean of the correlation-selected columns, one decimal
df['average2'] = df[average2_columns].mean(axis=1).round(1)

# Rank each score (1 = highest score), placing NaN scores at the bottom.
# The scores are ranked in the DataFrame's existing row order: method='first'
# breaks ties by order of appearance, so pre-sorting by the score itself (with
# the default, non-stable sort) would leave the order of tied players
# unspecified. rank() already returns floats, so no cast is needed.
for score_column in ('average', 'average2'):
    df[score_column + '_rank'] = df[score_column].rank(
        method='first', ascending=False, na_option='bottom'
    )

# 'variance' is the difference between the original Rank and the 'average' rank
# (positive = the player ranks better by 'average' than by Rank)
df['variance'] = df['Rank'] - df['average_rank']

# 'variance2' is the same difference for the 'average2' rank
df['variance2'] = df['Rank'] - df['average2_rank']

### Display and sort the results of average weighted score in descending order

In [11]:
# Show the 30 best-ranked players by 'average', next to their original Rank
(
    df.loc[:, ['Rank', 'Player', 'average', 'average_rank', 'variance']]
      .sort_values('average_rank')   # ascending: rank 1 first
      .head(30)
)

Unnamed: 0,Rank,Player,average,average_rank,variance
0,1.0,Tyreek Hill (MIA),53.9,1.0,0.0
53,54.0,Cooper Kupp (LAR),52.5,2.0,52.0
8,9.0,Justin Jefferson (MIN),46.3,3.0,6.0
6,7.0,A.J. Brown (PHI),44.7,4.0,3.0
3,4.0,Keenan Allen (LAC),43.6,5.0,-1.0
1,2.0,Stefon Diggs (BUF),43.0,6.0,-4.0
5,6.0,Puka Nacua (LAR),41.4,7.0,-1.0
4,5.0,Ja'Marr Chase (CIN),39.3,8.0,-3.0
7,8.0,DJ Moore (CHI),38.8,9.0,-1.0
11,12.0,Amon-Ra St. Brown (DET),37.8,10.0,2.0


In [12]:
# Show the 30 best-ranked players by 'average2', next to their original Rank
(
    df.loc[:, ['Rank', 'Player', 'average2', 'average2_rank', 'variance2']]
      .sort_values('average2_rank')   # ascending: rank 1 first
      .head(30)
)

Unnamed: 0,Rank,Player,average2,average2_rank,variance2
0,1.0,Tyreek Hill (MIA),53.9,1.0,0.0
53,54.0,Cooper Kupp (LAR),52.5,2.0,52.0
8,9.0,Justin Jefferson (MIN),46.3,3.0,6.0
6,7.0,A.J. Brown (PHI),44.7,4.0,3.0
3,4.0,Keenan Allen (LAC),43.6,5.0,-1.0
1,2.0,Stefon Diggs (BUF),43.0,6.0,-4.0
5,6.0,Puka Nacua (LAR),41.4,7.0,-1.0
4,5.0,Ja'Marr Chase (CIN),39.3,8.0,-3.0
7,8.0,DJ Moore (CHI),38.8,9.0,-1.0
11,12.0,Amon-Ra St. Brown (DET),37.8,10.0,2.0
