In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### Access Site

Input credentials

In [2]:
# One shared session, so cookies set by the login POST are sent with every later request.
session = requests.Session()

# Read the login fields from environment variables instead of pasting them into
# the notebook, so the secrets never land in the saved file or its outputs.
# The variable names are this notebook's own convention: export GIGYA_UID,
# GIGYA_SIGNATURE and GIGYA_SIGNATURE_TIMESTAMP before starting the kernel.
# A missing variable raises KeyError naming the variable.
cred = {'gigya_UID': os.environ['GIGYA_UID'],
        'gigya_signature': os.environ['GIGYA_SIGNATURE'],
        'gigya_signature_timestamp': os.environ['GIGYA_SIGNATURE_TIMESTAMP']
        }

loginUrl = 'https://fantasy.nfl.com/auth/gigyalogin'
s = session.post(loginUrl, data=cred)

Test access

In [3]:
# Fetch the "my leagues" page with the logged-in session.
s = session.get('https://fantasy.nfl.com/myleagues')
soup = BeautifulSoup(s.text, 'html.parser')

# At least one `.leagueName` element is treated as proof that the login worked.
# Fail loudly otherwise, so the scraping cells below never run against
# logged-out pages.
if soup.select(".leagueName"):
    display('okay')
else:
    raise RuntimeError("Login check failed: no '.leagueName' elements found on /myleagues")

'okay'

### Scrape Data

We'll need to walk through several pages on the site, collecting every team's data for each year.

In [4]:
def getTeamScheduleUrl(year, teamId):
    """Build the URL of one team's schedule page in the league history.

    Args:
        year: Season to look up; converted with ``str()`` and placed in the path.
        teamId: Team identifier; converted with ``str()`` and used as the
            ``scheduleDetail`` query value.

    Returns:
        The schedule page URL as a string.
    """
    pieces = (
        "https://fantasy.nfl.com/league/.../history/",
        str(year),
        "/schedule?standingsTab=schedule&scheduleType=team&leagueId=...&scheduleDetail=",
        str(teamId),
    )
    return "".join(pieces)

Run through the pages. This will make up the body of our dataset

In [5]:
def _parseScheduleRow(row, year, teamId):
    """Flatten one schedule-table row into a list of field values.

    Args:
        row: A ``<tr>`` element taken from a team's schedule page.
        year: Season the row belongs to.
        teamId: Team whose schedule page the row came from.

    Returns:
        A list starting with ``[year, teamId]`` followed by the row's cell
        values in page order. A cell containing a ``.result`` element
        contributes the text of each ``<em>`` inside it; a cell containing a
        ``teamId`` element contributes the opponent id and then the cell text;
        any other cell contributes its stripped text.
    """
    week = [year, teamId]
    for cell in row.select('td'):
        resultTags = cell.select('.result')
        if resultTags:
            # Scores/Result are stored in one string in 'result' class. Would rather split them
            week.extend(em.text for em in resultTags[0].select('em'))
            continue

        # opponentId is stored in the element's class list.
        # NOTE(review): assumes the 'teamId-<n>' token is the second class - confirm against the page markup.
        opponentTags = cell.select('[class*=teamId]')
        if opponentTags:
            opponentAttr = opponentTags[0].attrs['class'][1]
            week.append(opponentAttr.replace('teamId-', ''))
        week.append(cell.text.strip())
    return week


# Walk every season (2013-2020) and every possible team slot (1-20), collecting
# one record per schedule row.
WeeklyResults = []
for year in range(2013, 2021):
    for teamId in range(1, 21):
        url = getTeamScheduleUrl(year, teamId)

        #get table data from page
        s = session.get(url)
        soup = BeautifulSoup(s.text, 'html.parser')

        #if there's no data to return, skip (due to years where it wasn't a full 20-team league)
        # `string=` is the current name of Beautiful Soup's deprecated `text=` filter.
        if soup.find_all(string="The team is not valid."):
            continue

        #Each row is a week of data for one team
        for row in soup.select('tbody>tr'):
            WeeklyResults.append(_parseScheduleRow(row, year, teamId))

### Create Field Names + Data Frame

Normally I'd scrape the column names too, but since we're adding a few of our own and renaming the existing ones, I decided to define them from scratch.

In [6]:
# Field names for each scraped record: Year and Team come from the scraping
# loop, the remaining fields from the cells of each schedule row.
columns = [
    'Year', 'Team', 'Week', 'OpponentId',
    'Opponent', 'PointsFor', 'PointsAgainst', 'Result',
]
df = pd.DataFrame(data=WeeklyResults, columns=columns)
df

Unnamed: 0,Year,Team,Week,OpponentId,Opponent,PointsFor,PointsAgainst,Result
0,2013,1,1,11,I have no team name,116.28,68.68,Win
1,2013,1,2,12,AP All Day,110.68,70.54,Win
2,2013,1,3,14,Put The Team On Ma' Back,103.40,72.66,Win
3,2013,1,4,16,Team Silverstein,113.98,86.48,Win
4,2013,1,5,18,THE Squad,126.42,73.30,Win
...,...,...,...,...,...,...,...,...
2599,2020,19,12,7,Mark Alexander I,92.00,82.48,Win
2600,2020,19,13,3,Moose Will Moss You,162.22,71.06,Win
2601,2020,19,14,Playoffs,,,,
2602,2020,19,15,13,So Hollywood,123.14,120.22,Win


### Limit Data set

The data needs a little cleaning. Weeks 15 and later are playoff weeks (in 2020 the playoffs start in week 14), and we only want the regular season.
The cleanest approach would probably be a table of regular-season game counts per year joined to the data frame, but I'd rather just remove the playoff records from the df.

In [7]:
# First playoff week: 15 in every season except 2020, where it is 14.
PLAYOFF_START_WEEK = 15
PLAYOFF_START_WEEK_2020 = 14

# Week is scraped as text; make it numeric so it can be compared.
df['Week'] = pd.to_numeric(df['Week'])
isPlayoffs = (
    (df['Week'] >= PLAYOFF_START_WEEK)
    | ((df['Year'] == 2020) & (df['Week'] >= PLAYOFF_START_WEEK_2020))
)

# Keep the regular season only. The non-inplace reset_index returns a new,
# independent frame, so later column assignments don't act on a slice of the
# original (which can raise SettingWithCopyWarning).
df = df.loc[~isPlayoffs].reset_index(drop=True)
df

Unnamed: 0,Year,Team,Week,OpponentId,Opponent,PointsFor,PointsAgainst,Result
0,2013,1,1,11,I have no team name,116.28,68.68,Win
1,2013,1,2,12,AP All Day,110.68,70.54,Win
2,2013,1,3,14,Put The Team On Ma' Back,103.40,72.66,Win
3,2013,1,4,16,Team Silverstein,113.98,86.48,Win
4,2013,1,5,18,THE Squad,126.42,73.30,Win
...,...,...,...,...,...,...,...,...
2137,2020,19,9,2,AWON BOYS,93.40,73.98,Win
2138,2020,19,10,15,Team on ma back,65.92,86.22,Loss
2139,2020,19,11,18,SUDS,136.08,99.72,Win
2140,2020,19,12,7,Mark Alexander I,92.00,82.48,Win


### Convert types

In [8]:
# Convert the scraped text columns in one step. astype returns a new frame, so
# nothing is assigned onto the filtered slice from the previous cell (which can
# raise SettingWithCopyWarning).
df = df.astype({
    'OpponentId': int,
    'PointsFor': float,
    'PointsAgainst': float,
})

### Recreate Breakdown Stat
##### For a given week, if each team played everyone else in the league, what would be their record?

In [9]:
# For each (Year, Week), compare every team's PointsFor against every other
# team's PointsFor in that week and record how many it would have beaten,
# lost to, and tied.
all_weekly_results = []

# groupby only yields (Year, Week) pairs that actually have rows, in ascending
# order, so a season or week with no data is skipped instead of breaking a
# range() built from NaN bounds.
for (year, week), weekly_results in df.groupby(['Year', 'Week']):
    teams = weekly_results['Team'].to_numpy()
    points = weekly_results['PointsFor'].to_numpy()

    # Each team must appear once per week, otherwise "its score" is ambiguous.
    if len(set(teams)) != len(teams):
        raise ValueError(
            'Duplicate team rows for year %s, week %s' % (year, week)
        )

    # Pairwise comparison: entry [i, j] compares team i (row) with team j (column).
    is_other_team = teams[:, None] != teams[None, :]
    own_points = points[:, None]
    other_points = points[None, :]

    # assign() returns a new frame, so the new columns are never written onto a
    # slice of df (which raises SettingWithCopyWarning).
    weekly_results = weekly_results.assign(
        win_vs_league=((own_points > other_points) & is_other_team).sum(axis=1),
        loss_vs_league=((own_points < other_points) & is_other_team).sum(axis=1),
        tie_vs_league=((own_points == other_points) & is_other_team).sum(axis=1),
    )

    all_weekly_results.append(weekly_results)

### Combine results into new dataframe

In [10]:
# Stack the per-week frames in one call. DataFrame.append was removed in
# pandas 2.0, and appending inside a loop re-copies the growing frame on every
# iteration. concat keeps each row's original index label, as append did.
final_df = pd.concat(all_weekly_results)

final_df.head()

Unnamed: 0,Year,Team,Week,OpponentId,Opponent,PointsFor,PointsAgainst,Result,win_vs_league,loss_vs_league,tie_vs_league
0,2013,1,1,11,I have no team name,116.28,68.68,Win,17,2,0
14,2013,2,1,4,Kukere Kings,96.22,100.66,Loss,14,5,0
28,2013,3,1,7,Defending Champions,144.6,105.38,Win,19,0,0
42,2013,4,1,2,AWON BOYS,100.66,96.22,Win,15,4,0
56,2013,5,1,20,grown up Cubs,67.38,46.72,Win,2,17,0


In [11]:
# Write the combined results to disk; index=False omits the index column.
final_df.to_csv(path_or_buf='WTTData.csv', index=False)