In [42]:
import pandas as pd
import requests as re
from bs4 import BeautifulSoup
from sklearn import metrics

In [34]:
# Load Mike Clay's 2019 projections. Passing `names` without `header` makes
# pandas treat the first line of the file as data, not as a header row.
clay_df = pd.read_csv(
    'mike clay projections 2019.csv',
    names=['Player', 'Clay_Projection'],
)

# Keep only letters, digits and spaces in each name; get_stats() cleans the
# scraped names the same way, so the two sources can be joined on 'Player'.
clay_df['Player'] = clay_df['Player'].map(
    lambda name: ''.join(filter(lambda ch: ch.isalnum() or ch == ' ', name))
)

In [35]:
def get_stats(year):
    """Scrape one season of fantasy totals and return the top PPR scorers.

    Downloads the Pro-Football-Reference fantasy page for ``year``, parses the
    first HTML table on it, and reduces it to a player-name / PPR-points frame.

    Parameters
    ----------
    year : int or str
        Season to fetch; interpolated directly into the page URL.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Player`` (name with every character other than
        letters, digits and spaces removed) and ``PPR`` (float, missing
        values counted as 0), sorted by ``PPR`` descending. At most 300 rows:
        the cut to 300 happens *before* duplicate names are dropped, so the
        result can be shorter than 300.

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status (e.g. 404 or 429).
    requests.Timeout
        If the site does not answer within the timeout.
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    # Local import: recent pandas versions deprecate passing literal HTML to
    # read_html, so the markup is wrapped in a file-like object below.
    from io import StringIO

    # NOTE: `re` is this notebook's alias for `requests`, not the stdlib
    # regular-expression module.
    r = re.get(
        f'https://www.pro-football-reference.com/years/{year}/fantasy.htm',
        timeout=30,
    )
    # Fail here with a clear HTTP error instead of a confusing parsing error
    # further down when the site returns an error page.
    r.raise_for_status()

    soup = BeautifulSoup(r.content, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f'no <table> found on the fantasy page for {year}')
    raw = pd.read_html(StringIO(str(tables[0])))[0]

    # Drop the top-level column names (there are 2 rows of column headers).
    raw.columns = raw.columns.droplevel(0)

    # Remove the header rows repeated inside the table body (Player == 'Player')
    # and keep only the two columns we need. The explicit copy makes the
    # assignments below act on an independent frame rather than on a slice.
    stats = raw.loc[raw['Player'] != 'Player', ['Player', 'PPR']].copy()

    # Replace nulls with 0 and convert the scraped values to floats.
    stats['PPR'] = stats['PPR'].fillna(0).astype(float)

    # Remove every character that is not a letter, digit or space from names.
    stats['Player'] = stats['Player'].map(
        lambda x: ''.join(i for i in x if i.isalnum() or i == ' ')
    )

    # Take the top 300 fantasy players by PPR, then drop repeated names,
    # keeping the highest-scoring row for each name.
    return (
        stats.sort_values(by='PPR', ascending=False)
        .head(300)
        .drop_duplicates(subset=['Player'])
    )

In [45]:
# Attach Clay's projection to each scraped 2019 player, then discard every
# row that contains a missing value (e.g. players with no projection).
actual_2019 = get_stats(2019)
df = actual_2019.merge(clay_df, on='Player', how='left').dropna()

In [46]:
# Preview the first ten rows of the merged frame: actual PPR points next to
# Clay's projection for each player.
df.head(10)

Unnamed: 0,Player,PPR,Clay_Projection
0,Christian McCaffrey,471.2,339.0
1,Lamar Jackson,415.7,269.0
2,Michael Thomas,374.6,292.0
3,Dak Prescott,337.8,298.0
4,Russell Wilson,328.6,279.0
5,Deshaun Watson,321.0,329.0
6,Aaron Jones,314.8,230.0
7,Ezekiel Elliott,311.7,340.0
8,Austin Ekeler,309.0,167.0
9,Jameis Winston,305.4,272.0


In [43]:
# Root-mean-squared error between actual PPR points and Clay's projections.
# The `squared=False` keyword was deprecated and later removed from
# scikit-learn's mean_squared_error, so take the square root of the plain MSE
# instead; this works on every scikit-learn version.
metrics.mean_squared_error(df['PPR'], df['Clay_Projection']) ** 0.5

59.764724570856195

**Mike Clay's RMSE was 59.76 PPR points for the 2019 season** (computed over the top-300 PPR scorers that have a matching Clay projection).