In [2]:
import requests
from bs4 import BeautifulSoup

# Accessing Webpage #

The following snippet uses requests to grab the HTML page from the internet

In [7]:
# https://realpython.com/python-web-scraping-practical-introduction/
from requests import get
from requests.exceptions import RequestException
from contextlib import closing

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to fetch.
        timeout: seconds to wait on the server before giving up; passed
            straight to `requests.get`. Without it a stalled server would
            block the notebook indefinitely.

    Returns:
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned,
        after logging through `log_error`, when the request raises a
        `RequestException` (which includes timeouts).
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    with no Content-Type header is treated as not HTML instead of
    raising KeyError.
    """
    # Servers are not required to send Content-Type; .get() plus the
    # `or ''` fallback keeps a missing/empty header from blowing up.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error message.

    The current implementation simply writes `e` to standard output
    via print; swap the body out to route errors somewhere else.
    """
    message = e
    print(message)

Gain access to daily fantasy on yahoo

We expected the webpage in MLB to appear as follows

<img src="mlbsample.png">


In [8]:
# Download the Yahoo daily-fantasy MLB landing page.
raw_html = simple_get('https://sports.yahoo.com/dailyfantasy/mlb')
# simple_get returns None when the request fails or the response is rejected;
# stop here with a clear message instead of failing later while parsing None.
assert raw_html is not None, 'Could not download the Yahoo daily fantasy MLB page (request failed, non-200 status, or non-HTML response)'

In [9]:
# Display the value returned by simple_get for the daily-fantasy page.
raw_html



Let's try to get the list of current contests

Looking at the webpage, we will assume the following:

* Data lives in a table
* We currently only expect to see one table
* We want to access information in the column: contest. This column has the attribute data-tst with value contest-row-contest-name
* The link to the contest should be inside an `<a>` tag element

In [23]:
# Search for the table containing the contest data.
html = BeautifulSoup(raw_html, 'html.parser')
tables = html.select('table')
total_tables = len(tables)
found_table = None
for table in tables:
    # we expect table to be as follows:
    # <table><caption id='contestsList-caption'/>[DATA]</table>
    # select_one returns None when the table has no <caption>, so an
    # uncaptioned table is skipped here and reported by the assert below
    # instead of raising IndexError.
    cap = table.select_one('caption')
    if cap is not None and cap.get('id') == 'contestsList-caption':
        found_table = table

assert total_tables == 1, 'Currently ony expect a single table'
assert found_table is not None, 'Error, the structure of the yahoo table might have changed. Is the caption still present?'

In [55]:
# Collect the contest title from every contest row of the table.
data = []
for i, row in enumerate(found_table.select('tr')):
    row_label = 'Row #' + str(i)
    print(row_label)

    # Rows not tagged as contest rows are reported and skipped.
    if row.get('data-tst') != 'contest-row':
        print('skipping row in table. did not equal contest-row')
        print(row_label)
        continue

    # Take the first <th> that is tagged as the contest-name column.
    name_cell = next(
        (th for th in row.select('th')
         if th.get('data-tst') == 'contest-row-contest-name'),
        None,
    )
    if name_cell is None:
        print('Error, we found a contest row but we could not find a th column that referred to the contest-name')
        print(row_label)
        continue

    # The contest title is the first <span> in that cell with non-empty text.
    title = next((sp.text for sp in name_cell.select('span') if sp.text), None)
    if title is not None:
        data.append({'row': i, 'text': title})

Row #0
skipping row in table. did not equal contest-row
Row #0
Row #1
Row #2
Row #3
Row #4
Row #5
Row #6
Row #7
Row #8
Row #9
Row #10
Row #11
Row #12
Row #13
Row #14
Row #15


In [54]:
data

[{'row': 1, 'text': 'MLB $6K Guaranteed [$1K to 1st]'},
 {'row': 2, 'text': 'MLB $2K Guaranteed [$200 to 1st]'},
 {'row': 3, 'text': 'MLB $500 Guaranteed [$100 to 1st]'},
 {'row': 4, 'text': 'MLB $300 Guaranteed [Single Entry]'},
 {'row': 5, 'text': 'MLB $300 Guaranteed [Single Entry]'},
 {'row': 6, 'text': 'MLB $300 Guaranteed [Single Entry]'},
 {'row': 7, 'text': 'MLB $300 Guaranteed [Single Entry]'},
 {'row': 8, 'text': 'MLB $2.5K Guaranteed [Single Entry]'},
 {'row': 9, 'text': 'MLB $1.5K Guaranteed [Single Entry]'},
 {'row': 10, 'text': 'MLB $1K Guaranteed [Single Entry]'},
 {'row': 11, 'text': 'MLB $1K Guaranteed [Single Entry]'},
 {'row': 12, 'text': 'MLB $100 Guaranteed [$10 to 1st]'},
 {'row': 13, 'text': 'MLB $100 Guaranteed [No Veterans]V'},
 {'row': 14, 'text': 'MLB $200 Guaranteed [Single Entry]'},
 {'row': 15, 'text': 'MLB $200 Guaranteed [Single Entry]'}]

We are able to extract information from the first 15 rows above, but we cannot access any more rows. This must be due to infinite scrolling. How do we account for this?

**The main challenge here is identifying the XHR/AJAX/JavaScript call that queries an API and uses the response to automatically fill in the table**

The following website, [infinite scroll](https://blog.michaelyin.info/how-crawl-infinite-scrolling-pages-using-python/) helps to explain how we can identify the proper script

Essentially we did the following:

1) Analyzed the webpage in yahoo

2) Selected the 'network' tab

3) Selected XHR within the network tab

4) In the right pane we saw the JSON output that was returned from each XHR request

5) We selected the XHR tab that returned the MLB table

<img src='mblXHRRequest.png'/>

# Generating TABLE from JAVASCRIPT SOURCE 

Using the above table, we know that the URL request source is: https://dfyql-ro.sports.yahoo.com/v2/contestsFilteredWeb?lang=en-US&region=US&device=desktop&sport=mlb&sortAsc=false

In [79]:
# Fetch the contest list directly from the endpoint behind the page's XHR
# call; the response body is expected to be JSON.
import json  # kept: other cells may rely on this import being made here
import pandas as pd

table_source_url = 'https://dfyql-ro.sports.yahoo.com/v2/contestsFilteredWeb?lang=en-US&region=US&device=desktop&sport=mlb&sortAsc=false'
# A timeout keeps a stalled server from hanging the notebook.
r = requests.get(table_source_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
r.raise_for_status()
table_response = r.json()
table_data = table_response['contests']['result']
all_data_table = pd.DataFrame(table_data)
all_data_table.head()

Unnamed: 0,batchContestCount,earnableRewardPoints,entryCount,entryLimit,guaranteed,iconUrl,id,multipleEntry,multipleEntryLimit,opponentExperience,...,salaryCap,scope,seriesId,sportCode,startTime,state,subleague,subleagueDisplayName,title,type
0,1,5,48,1364,True,,2719291,True,10,,...,200,guaranteed,6499,mlb,1532559900000,upcoming,NONE,,MLB $6K Guaranteed [$1K to 1st],league
1,1,1,132,2272,True,,2719290,True,10,,...,200,guaranteed,6499,mlb,1532559900000,upcoming,NONE,,MLB $2K Guaranteed [$200 to 1st],league
2,1,0,143,2274,True,,2719298,True,10,,...,200,guaranteed,6499,mlb,1532559900000,upcoming,NONE,,MLB $500 Guaranteed [$100 to 1st],league
3,1,1,12,341,True,,2719292,False,1,,...,200,guaranteed,6499,mlb,1532559900000,upcoming,NONE,,MLB $300 Guaranteed [Single Entry],league
4,1,2,4,171,True,,2719293,False,1,,...,200,guaranteed,6499,mlb,1532559900000,upcoming,NONE,,MLB $300 Guaranteed [Single Entry],league


# Contest info

Next we need to access the contest for each row. The link to the contest should be identifiable from the [id] column

Enter contest button = '/dailyfantasy/contest/[id]/setlineup'

In order to get the list of players and their info in a specific contest, we will do the following:

* Get the contest id for each row in the contest table
* for each id, generate the url that can request all player info within a contest of interest
* using the player info url in each contest, query for all player info and return a dataframe

In [96]:
def get_contest_info(id):
    """
    Build the Yahoo "set lineup" page URL for a contest.

    `id` is the contest id, interpolated into the URL path.
    Returns the URL as a string; no request is made.
    """
    lineup_url_template = 'https://sports.yahoo.com/dailyfantasy/contest/{id}/setlineup'
    return lineup_url_template.format(id=id)

def get_players_info_url(id):
    """
    Build the URL of the request that lists all players in a contest.

    `id` is the contest id, interpolated into the `contestId` query
    parameter. The endpoint is expected to answer with a JSON structure
    describing every player in that contest. Returns the URL as a
    string; no request is made.
    """
    players_url_template = 'https://dfyql-ro.sports.yahoo.com/v2/contestPlayers?lang=en-US&region=US&device=desktop&contestId={id}'
    return players_url_template.format(id=id)

def get_player_info(url, timeout=30):
    """
    Query `url` and return a DataFrame of all players in a contest.

    Parameters:
        url: player-list request URL (see `get_players_info_url`).
        timeout: seconds to wait on the server before giving up; passed
            straight to `requests.get`.

    Returns:
        A pandas DataFrame built from the `['players']['result']` entry
        of the JSON response.

    Raises:
        requests.HTTPError: the server answered with an error status.
        KeyError: the JSON response lacks the `players` / `result` keys.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    table_response_p = r.json()
    player_table_data = table_response_p['players']['result']
    return pd.DataFrame(player_table_data)

# Derive both per-contest URLs from the contest id column.
contest_ids = all_data_table['id']
all_data_table['url'] = contest_ids.apply(get_contest_info)
all_data_table['player_in_contest_url'] = contest_ids.apply(get_players_info_url)

In [99]:
# Load the players of the first contest, then look one up by last name.
first_contest_players_url = all_data_table.iloc[0]['player_in_contest_url']
first_contest_players = get_player_info(first_contest_players_url)
first_contest_players.set_index('lastName').loc['Sale']

code                                                           mlb.p.8780
eligiblePositions                                                     [P]
fantasyPointsHistory    [37.3, 42.7, 38.6, 44.3, 46.5, 34.5, 31.7, 35....
fantasyPointsPerGame                                              31.6095
fantasyPointsStdDev                                                 10.02
firstName                                                           Chris
fppgHistory             [37.3, 42.7, 38.6, 44.3, 46.5, 34.5, 31.7, 35....
game                    {'rawStatus': '7:05 pm ET', 'startTime': 15325...
imageUrl                https://s.yimg.com/bt/api/res/1.2/bsvAHx6mumdw...
jerseyNumber                                                           41
largeImageUrl           https://s.yimg.com/bt/api/res/1.2/hEETTfrxbURT...
lineupOrder                                                          None
locked                                                               True
noteFreshness                         