### Import libraries, .csv file, and create DataFrame 

In [13]:
import pandas as pd
import numpy as np

df = pd.read_csv("FantasyPros_Fantasy_Football_Statistics_WR.csv")

df.head(10)

Unnamed: 0,Rank,Player,REC,TGT,YDS,Y/R,LG,20+,TD,ATT,YDS.1,TD.1,FL,G,FPTS,FPTS/G,ROST
0,1,Justin Jefferson (MIN),128.0,184.0,1809.0,14.1,64.0,49.0,8.0,4.00,24.00,1.00,-,17.0,368.6,21.7,99%
1,2,Tyreek Hill (MIA),119.0,170.0,1710.0,14.4,64.0,45.0,7.0,7.00,32.00,1.00,-,17.0,341.2,20.1,100%
2,3,Davante Adams (LV),100.0,180.0,1516.0,15.2,60.0,47.0,14.0,3.00,-1.00,-,-,17.0,335.5,19.7,100%
3,4,Stefon Diggs (BUF),110.0,156.0,1455.0,13.2,53.0,35.0,11.0,1.00,-3.00,-,-,17.0,321.2,18.9,100%
4,5,CeeDee Lamb (DAL),107.0,156.0,1359.0,12.7,39.0,30.0,9.0,10.00,47.00,-,-,17.0,301.6,17.7,100%
5,6,A.J. Brown (PHI),88.0,145.0,1496.0,17.0,78.0,45.0,11.0,-,-,-,2.00,17.0,299.6,17.6,100%
6,7,Amon-Ra St. Brown (DET),106.0,146.0,1161.0,11.0,49.0,15.0,6.0,9.00,95.00,-,-,16.0,267.6,16.7,100%
7,8,Jaylen Waddle (MIA),75.0,117.0,1356.0,18.1,84.0,41.0,8.0,3.00,26.00,-,1.00,17.0,259.2,15.2,100%
8,9,DeVonta Smith (PHI),95.0,136.0,1196.0,12.6,45.0,30.0,7.0,-,-,-,1.00,17.0,254.6,15.0,99%
9,10,Amari Cooper (CLE),78.0,132.0,1160.0,14.9,55.0,31.0,9.0,-,-,-,-,17.0,247.0,14.5,97%


### Find the column numbers (indices) in the df

In [14]:
# Enumerate the columns to get their positions (numbers) and names
column_numbers = list(enumerate(df.columns))

column_numbers

[(0, 'Rank'),
 (1, 'Player'),
 (2, 'REC'),
 (3, 'TGT'),
 (4, 'YDS'),
 (5, 'Y/R'),
 (6, 'LG'),
 (7, '20+'),
 (8, 'TD'),
 (9, 'ATT'),
 (10, 'YDS.1'),
 (11, 'TD.1'),
 (12, 'FL'),
 (13, 'G'),
 (14, 'FPTS'),
 (15, 'FPTS/G'),
 (16, 'ROST')]

### Check the data types of the columns

In [15]:
df.dtypes

Rank        int64
Player     object
REC        object
TGT        object
YDS        object
Y/R        object
LG         object
20+        object
TD         object
ATT        object
YDS.1      object
TD.1       object
FL         object
G         float64
FPTS       object
FPTS/G     object
ROST       object
dtype: object

### Convert columns with an 'object' data type to a 'float64' data type

In [16]:
# Columns with object datatype that need conversion
convert = df.select_dtypes(include=['object']).columns.tolist()

# Exclude 'Player' and 'ROST' columns as they are likely non-numeric categorical columns
convert.remove('Player')
convert.remove('ROST')

# Convert each column to float64
for col in convert:
   df[col] = df[col].str.replace(',', '').str.replace('-', '0').astype(float)

# Verify the data types
df.dtypes

Rank        int64
Player     object
REC       float64
TGT       float64
YDS       float64
Y/R       float64
LG        float64
20+       float64
TD        float64
ATT       float64
YDS.1     float64
TD.1      float64
FL        float64
G         float64
FPTS      float64
FPTS/G    float64
ROST       object
dtype: object

### Convert the relevant stats to a per game basis

In [17]:
# Define relevant columns
relevant_columns = df.columns[2:15].tolist()

# List of columns to exclude from the per-game calculation
exclude_from_per_game = ['Y/R', 'LG', 'FL', 'G', 'FPTS', 'FPTS/G']

# Convert stats to a per-game basis for only the columns not in the exclude list
for col in relevant_columns:
    if col not in exclude_from_per_game:
        df[col + '_per_game'] = (df[col] / df['G']).round(1)

# Update the relevant columns list for correlation
# It will contain original columns that were excluded from per game calculation
# and the new per game columns for the rest
relevant_columns_for_correlation = exclude_from_per_game + \
                                  [col + '_per_game' for col in relevant_columns 
                                        if col not in exclude_from_per_game]

df_average = df[['Rank', 'Player'] + relevant_columns_for_correlation].head(10)
df_average

Unnamed: 0,Rank,Player,Y/R,LG,FL,G,FPTS,FPTS/G,REC_per_game,TGT_per_game,YDS_per_game,20+_per_game,TD_per_game,ATT_per_game,YDS.1_per_game,TD.1_per_game
0,1,Justin Jefferson (MIN),14.1,64.0,0.0,17.0,368.6,21.7,7.5,10.8,106.4,2.9,0.5,0.2,1.4,0.1
1,2,Tyreek Hill (MIA),14.4,64.0,0.0,17.0,341.2,20.1,7.0,10.0,100.6,2.6,0.4,0.4,1.9,0.1
2,3,Davante Adams (LV),15.2,60.0,0.0,17.0,335.5,19.7,5.9,10.6,89.2,2.8,0.8,0.2,0.1,0.0
3,4,Stefon Diggs (BUF),13.2,53.0,0.0,17.0,321.2,18.9,6.5,9.2,85.6,2.1,0.6,0.1,0.2,0.0
4,5,CeeDee Lamb (DAL),12.7,39.0,0.0,17.0,301.6,17.7,6.3,9.2,79.9,1.8,0.5,0.6,2.8,0.0
5,6,A.J. Brown (PHI),17.0,78.0,2.0,17.0,299.6,17.6,5.2,8.5,88.0,2.6,0.6,0.0,0.0,0.0
6,7,Amon-Ra St. Brown (DET),11.0,49.0,0.0,16.0,267.6,16.7,6.6,9.1,72.6,0.9,0.4,0.6,5.9,0.0
7,8,Jaylen Waddle (MIA),18.1,84.0,1.0,17.0,259.2,15.2,4.4,6.9,79.8,2.4,0.5,0.2,1.5,0.0
8,9,DeVonta Smith (PHI),12.6,45.0,1.0,17.0,254.6,15.0,5.6,8.0,70.4,1.8,0.4,0.0,0.0,0.0
9,10,Amari Cooper (CLE),14.9,55.0,0.0,17.0,247.0,14.5,4.6,7.8,68.2,1.8,0.5,0.0,0.0,0.0


### Calculate the correlations for relevant stats for different conditions

In [18]:
# Exclude the columns from correlation calculation
columns_to_exclude_from_correlation = ['FPTS/G', 'FPTS', 'FL', 'G']
relevant_columns_for_correlation = [col for col in relevant_columns_for_correlation 
                                        if col not in columns_to_exclude_from_correlation]

# Define a function to calculate correlations for given conditions
def compute_correlations(dataframe):
    return dataframe[relevant_columns_for_correlation].corrwith(dataframe['FPTS/G'])

# Compute correlations for various conditions
correlations_all = compute_correlations(df)
correlations_fpts_nonzero = compute_correlations(df[df['FPTS/G'] > 0])
correlations_top50 = compute_correlations(df[df['Rank'] <= 50])
correlations_top25 = compute_correlations(df[df['Rank'] <= 25])

# Compile all correlations into a DataFrame for comparison
all_correlations = pd.DataFrame({
    'All Players': correlations_all,
    'FPTS > 0': correlations_fpts_nonzero,
    'Top 50 Players': correlations_top50,
    'Top 25 Players': correlations_top25
})

# Calculate the average correlation across the four conditions
all_correlations['Average'] = all_correlations.mean(axis=1)

all_correlations

Unnamed: 0,All Players,FPTS > 0,Top 50 Players,Top 25 Players,Average
Y/R,0.322589,0.099219,0.218844,-0.013598,0.156764
LG,0.682347,0.597701,0.257052,0.315828,0.463232
REC_per_game,0.968539,0.968048,0.867441,0.826356,0.907596
TGT_per_game,0.960021,0.957288,0.862712,0.832919,0.903235
YDS_per_game,0.982938,0.98043,0.950158,0.918581,0.958027
20+_per_game,0.836478,0.811394,0.713411,0.557833,0.729779
TD_per_game,0.803841,0.850717,0.701568,0.631172,0.746824
ATT_per_game,0.21104,0.201606,0.026833,0.380173,0.204913
YDS.1_per_game,0.1823,0.129997,0.005348,0.273682,0.147832
TD.1_per_game,0.22494,0.198209,0.186788,0.710844,0.330195


### Assign the weights for the relevant stats

In [19]:
# Calculate R^2 for the 'Average' correlation
all_correlations['R^2'] = all_correlations['Average'] ** 2

# Assign weights based on the given criteria
all_correlations['Weight'] = all_correlations.apply(lambda row: 1 + row['R^2'] if row['Average'] > 0.7 else 1, axis=1)

# Display the R^2 and weights for each column
weights = all_correlations[['Average', 'R^2', 'Weight']]
weights


Unnamed: 0,Average,R^2,Weight
Y/R,0.156764,0.024575,1.0
LG,0.463232,0.214584,1.0
REC_per_game,0.907596,0.82373,1.82373
TGT_per_game,0.903235,0.815834,1.815834
YDS_per_game,0.958027,0.917815,1.917815
20+_per_game,0.729779,0.532578,1.532578
TD_per_game,0.746824,0.557747,1.557747
ATT_per_game,0.204913,0.041989,1.0
YDS.1_per_game,0.147832,0.021854,1.0
TD.1_per_game,0.330195,0.109029,1.0


### Add the assigned weights to the relevant per-game stats

In [20]:
# Multiply each relevant column by its corresponding weight
for col in relevant_columns_for_correlation:
    weight = weights.loc[col, 'Weight']
    df[col + '_weighted'] = (df[col] * weight).round(1)

# Extract the weighted columns to view the results
weighted_columns = [col + '_weighted' for col in relevant_columns_for_correlation]
df_weighted = df[['Player', 'FPTS/G'] + weighted_columns]

df_weighted.head(10)

Unnamed: 0,Player,FPTS/G,Y/R_weighted,LG_weighted,REC_per_game_weighted,TGT_per_game_weighted,YDS_per_game_weighted,20+_per_game_weighted,TD_per_game_weighted,ATT_per_game_weighted,YDS.1_per_game_weighted,TD.1_per_game_weighted
0,Justin Jefferson (MIN),21.7,14.1,64.0,13.7,19.6,204.1,4.4,0.8,0.2,1.4,0.1
1,Tyreek Hill (MIA),20.1,14.4,64.0,12.8,18.2,192.9,4.0,0.6,0.4,1.9,0.1
2,Davante Adams (LV),19.7,15.2,60.0,10.8,19.2,171.1,4.3,1.2,0.2,0.1,0.0
3,Stefon Diggs (BUF),18.9,13.2,53.0,11.9,16.7,164.2,3.2,0.9,0.1,0.2,0.0
4,CeeDee Lamb (DAL),17.7,12.7,39.0,11.5,16.7,153.2,2.8,0.8,0.6,2.8,0.0
5,A.J. Brown (PHI),17.6,17.0,78.0,9.5,15.4,168.8,4.0,0.9,0.0,0.0,0.0
6,Amon-Ra St. Brown (DET),16.7,11.0,49.0,12.0,16.5,139.2,1.4,0.6,0.6,5.9,0.0
7,Jaylen Waddle (MIA),15.2,18.1,84.0,8.0,12.5,153.0,3.7,0.8,0.2,1.5,0.0
8,DeVonta Smith (PHI),15.0,12.6,45.0,10.2,14.5,135.0,2.8,0.6,0.0,0.0,0.0
9,Amari Cooper (CLE),14.5,14.9,55.0,8.4,14.2,130.8,2.8,0.8,0.0,0.0,0.0


### Define the columns to be used for the average weighted score

In [21]:
# Columns for the "average" calculation
average_columns = [
    'REC_per_game_weighted', 'TGT_per_game_weighted', 'YDS_per_game_weighted', 
    '20+_per_game_weighted','TD_per_game_weighted', 'FPTS/G'
]

# Columns for the "average2" calculation (correlation > 0.7)
columns_gt_0_7 = weights[weights['Average'] > 0.7].index.tolist()
average2_columns = [col + '_weighted' for col in columns_gt_0_7 if col + '_weighted' in df.columns]
average2_columns.append('FPTS/G')

# Display the columns used in the 'average' calculation (where correlation is > 0.7)
average2_columns

['REC_per_game_weighted',
 'TGT_per_game_weighted',
 'YDS_per_game_weighted',
 '20+_per_game_weighted',
 'TD_per_game_weighted',
 'FPTS/G']

### Calculate each player's average weighted score

In [22]:
# Calculate "average"
df['average'] = df[average_columns].mean(axis=1).round(1)

# Calculate "average2"
df['average2'] = df[average2_columns].mean(axis=1).round(1)

# Rank the 'average' and 'average2' columns with NaN handling
df['average_rank'] = df.sort_values('average', ascending=False)\
                ['average'].rank(method='first', ascending=False, na_option='bottom').astype(float)
df['average2_rank'] = df.sort_values('average2', ascending=False)\
                ['average2'].rank(method='first', ascending=False, na_option='bottom').astype(float)

# Calculate the variance in 'average'
df['variance'] = df['Rank'] - df['average_rank']

# Calculate the variance in 'average2'
df['variance2'] = df['Rank'] - df['average2_rank']

### Display and sort the results of average weighted score in descending order

In [23]:
# Display the results for 'average'
df[['Rank', 'Player', 'average', 'average_rank', 'variance']]\
    .sort_values(by='average_rank', ascending=True).head(30)    # Define the sort on this line

Unnamed: 0,Rank,Player,average,average_rank,variance
0,1,Justin Jefferson (MIN),44.1,1.0,0.0
1,2,Tyreek Hill (MIA),41.4,2.0,0.0
22,23,Cooper Kupp (LAR),39.1,3.0,20.0
2,3,Davante Adams (LV),37.7,4.0,-1.0
3,4,Stefon Diggs (BUF),36.0,5.0,-1.0
5,6,A.J. Brown (PHI),36.0,6.0,0.0
10,11,Ja'Marr Chase (CIN),34.6,7.0,4.0
46,47,DeAndre Hopkins (TEN),34.0,8.0,39.0
4,5,CeeDee Lamb (DAL),33.8,9.0,-4.0
7,8,Jaylen Waddle (MIA),32.2,10.0,-2.0


In [24]:
# Display the results for 'average2'
df[['Rank', 'Player', 'average2', 'average2_rank', 'variance2']]\
    .sort_values(by='average2_rank', ascending=True).head(30)

Unnamed: 0,Rank,Player,average2,average2_rank,variance2
0,1,Justin Jefferson (MIN),44.1,1.0,0.0
1,2,Tyreek Hill (MIA),41.4,2.0,0.0
22,23,Cooper Kupp (LAR),39.1,3.0,20.0
2,3,Davante Adams (LV),37.7,4.0,-1.0
3,4,Stefon Diggs (BUF),36.0,5.0,-1.0
5,6,A.J. Brown (PHI),36.0,6.0,0.0
10,11,Ja'Marr Chase (CIN),34.6,7.0,4.0
46,47,DeAndre Hopkins (TEN),34.0,8.0,39.0
4,5,CeeDee Lamb (DAL),33.8,9.0,-4.0
7,8,Jaylen Waddle (MIA),32.2,10.0,-2.0
