# Compiling different sources of sports data

In [58]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import datetime

In [1]:
#sort out team entries

## Football: Chicago Bears

In [7]:
# URL of the webpage containing the NFL 2018 season data
football = 'https://www.pro-football-reference.com/years/2018/games.htm'
response_football = requests.get(football)
if response_football.status_code == 200:
    soup = BeautifulSoup(response_football.content, 'html.parser')
    table = soup.find('table', {'id': 'games'})
    headers = [th.getText() for th in table.find('thead').findAll('th')]
    rows = []
    for row in table.find('tbody').findAll('tr'):
        rows.append([td.getText() for td in row.findAll('td')])
    nfldf = pd.DataFrame(rows, columns=headers[1:])  # Skipping the first header ('Week') which is an index
    nfldf.dropna(how='all', inplace=True)
    nfldf.reset_index(drop=True, inplace=True)
    
    # Filter rows where Chicago Bears played
    nfldf = nfldf[nfldf['Winner/tie'].str.contains('Chicago Bears', na=False) | nfldf['Loser/tie'].str.contains('Chicago Bears', na=False)]    
else:
    print(f'Failed to retrieve football data: {response.status_code}')

nfldf.head()

Unnamed: 0,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,Pts,Pts.1,YdsW,TOW,YdsL,TOL
13,Sun,2018-09-09,8:20PM,Green Bay Packers,,Chicago Bears,boxscore,24,23,370,2,294,1
31,Mon,2018-09-17,8:15PM,Chicago Bears,,Seattle Seahawks,boxscore,24,17,271,2,276,2
44,Sun,2018-09-23,4:25PM,Chicago Bears,@,Arizona Cardinals,boxscore,16,14,316,2,221,4
51,Sun,2018-09-30,1:00PM,Chicago Bears,,Tampa Bay Buccaneers,boxscore,48,10,483,0,311,3
80,Sun,2018-10-14,1:00PM,Miami Dolphins,,Chicago Bears,boxscore,31,28,541,3,467,3


## Baseball: Chicago Cubs & Chicago White Sox

In [39]:
#This takes a minute unfortunately. We have to send a seperate request each time we want to know the game duration.

data = requests.get("http://statsapi.mlb.com/api/v1/schedule/games/?sportId=1&startDate=2018-03-29&endDate=2018-09-30").json()

dict_cubs = []
dict_whitesox = []
counter = 0

for datum in data["dates"]:
    for game in datum["games"]:
        name = game["teams"]["home"]["team"]["name"]
        if(name == "Chicago Cubs"):
                print("Found Cubs home game")
                gamedata_cubs = requests.get("http://statsapi.mlb.com" + game["link"]).json()    
                temp1 = [game["gameDate"],gamedata_cubs["gameData"]["gameInfo"]["gameDurationMinutes"],gamedata_cubs["gameData"]["venue"]["timeZone"]["offset"]]
                dict_cubs.append(temp1)
        if(name == "Chicago White Sox"):
            print("Found White Sox home game")
            gamedata_whitesox = requests.get("http://statsapi.mlb.com" + game["link"]).json()
            temp2 = [game["gameDate"],gamedata_whitesox["gameData"]["gameInfo"]["gameDurationMinutes"],gamedata_whitesox["gameData"]["venue"]["timeZone"]["offset"]]
            dict_whitesox.append(temp2)
            
Chicago_Cubs_temp = pd.DataFrame(dict_cubs)
Chicago_White_Sox_temp = pd.DataFrame(dict_whitesox) 

Found White Sox home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found Cubs home game
Found White Sox home game
Found Cubs home game
Found White Sox home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found White Sox home game
Found Cubs home game
Found Cubs home game
Found White Sox home game
Found White Sox home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found Cubs home game
Found W

In [36]:
#Chicago has an offset of GMT-5. We don't have to mess around with daylight saving time, because the switch does not occur during the season.
Chicago_Cubs_df = pd.DataFrame()
Chicago_Cubs_df["Game_Start"] = pd.to_datetime(Chicago_Cubs_temp[0]) - pd.to_timedelta(5, unit="h")
Chicago_Cubs_df["Game_End"] = Chicago_Cubs_df["Game_Start"] + pd.to_timedelta(Chicago_Cubs_temp[1], unit="m")

Chicago_White_Sox_df = pd.DataFrame()
Chicago_White_Sox_df["Game_Start"] = pd.to_datetime(Chicago_White_Sox_temp[0]) - pd.to_timedelta(5, unit="h")
Chicago_White_Sox_df["Game_End"] = Chicago_White_Sox_df["Game_Start"] + pd.to_timedelta(Chicago_White_Sox_temp[1], unit="m")

In [76]:
Chicago_Cubs_df

Unnamed: 0,Game_Start,Game_End
0,2018-04-09 14:20:00+00:00,2018-04-09 17:25:00+00:00
1,2018-04-10 13:20:00+00:00,2018-04-10 16:25:00+00:00
2,2018-04-11 19:05:00+00:00,2018-04-11 22:04:00+00:00
3,2018-04-12 13:20:00+00:00,2018-04-12 16:17:00+00:00
4,2018-04-13 13:20:00+00:00,2018-04-13 16:29:00+00:00
...,...,...
82,2018-09-26 19:05:00+00:00,2018-09-26 22:47:00+00:00
83,2018-09-27 19:05:00+00:00,2018-09-27 21:58:00+00:00
84,2018-09-28 13:20:00+00:00,2018-09-28 16:27:00+00:00
85,2018-09-29 12:05:00+00:00,2018-09-29 14:30:00+00:00


## Chicago Blackhawks

In [54]:
# URL of the API containing the NHL 2017/2018 season data for the Chicago Blackhawks
hockey = "https://api-web.nhle.com/v1/club-schedule-season/CHI/20172018"

response_hockey = requests.get(hockey)
data = response_hockey.json()

nhldf_temp = pd.DataFrame(data["games"])
nhldf_temp
#ES MUSS BEI ALLEN ZEITEN NOCH DER OFFSET ABGEZOGEN WERDEN !!! 

Unnamed: 0,id,season,gameType,gameDate,venue,neutralSite,startTimeUTC,easternUTCOffset,venueUTCOffset,venueTimezone,gameState,gameScheduleState,tvBroadcasts,awayTeam,homeTeam,periodDescriptor,gameOutcome,gameCenterLink
0,2017010015,20172018,1,2017-09-19,{'default': 'Nationwide Arena'},False,2017-09-19T23:00:00Z,-04:00,-04:00,US/Eastern,FINAL,OK,"[{'id': 107, 'market': 'N', 'countryCode': 'US...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'id': 29, 'placeName': {'default': 'Columbus'...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/chi-vs-cbj/2017/09/19/2017010015
1,2017010034,20172018,1,2017-09-21,{'default': 'United Center'},False,2017-09-22T00:30:00Z,-04:00,-05:00,America/Chicago,FINAL,OK,"[{'id': 312, 'market': 'N', 'countryCode': 'US...","{'id': 17, 'placeName': {'default': 'Detroit'}...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/det-vs-chi/2017/09/21/2017010034
2,2017010051,20172018,1,2017-09-23,{'default': 'United Center'},False,2017-09-24T00:30:00Z,-04:00,-05:00,America/Chicago,FINAL,OK,"[{'id': 312, 'market': 'N', 'countryCode': 'US...","{'id': 29, 'placeName': {'default': 'Columbus'...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/cbj-vs-chi/2017/09/23/2017010051
3,2017010067,20172018,1,2017-09-25,{'default': 'TD Garden'},False,2017-09-25T23:00:00Z,-04:00,-04:00,US/Eastern,FINAL,OK,"[{'id': 128, 'market': 'H', 'countryCode': 'US...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'id': 6, 'placeName': {'default': 'Boston'}, ...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/chi-vs-bos/2017/09/25/2017010067
4,2017010087,20172018,1,2017-09-28,{'default': 'Little Caesars Arena'},False,2017-09-28T23:30:00Z,-04:00,-04:00,America/Detroit,FINAL,OK,"[{'id': 107, 'market': 'N', 'countryCode': 'US...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'id': 17, 'placeName': {'default': 'Detroit'}...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/chi-vs-det/2017/09/28/2017010087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,2017021196,20172018,2,2018-03-29,{'default': 'United Center'},False,2018-03-30T00:30:00Z,-04:00,-05:00,America/Chicago,OFF,OK,"[{'id': 292, 'market': 'A', 'countryCode': 'CA...","{'id': 52, 'placeName': {'default': 'Winnipeg'...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/wpg-vs-chi/2018/03/29/2017021196
84,2017021203,20172018,2,2018-03-30,{'default': 'Pepsi Center'},False,2018-03-31T01:00:00Z,-04:00,-06:00,America/Denver,OFF,OK,"[{'id': 47, 'market': 'H', 'countryCode': 'US'...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'id': 21, 'placeName': {'default': 'Colorado'...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/chi-vs-col/2018/03/30/2017021203
85,2017021239,20172018,2,2018-04-04,{'default': 'Scottrade Center'},False,2018-04-05T00:00:00Z,-04:00,-05:00,US/Central,OFF,OK,"[{'id': 241, 'market': 'N', 'countryCode': 'US...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'id': 19, 'placeName': {'default': 'St. Louis...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/chi-vs-stl/2018/04/04/2017021239
86,2017021255,20172018,2,2018-04-06,{'default': 'United Center'},False,2018-04-07T00:30:00Z,-04:00,-05:00,America/Chicago,OFF,OK,"[{'id': 13, 'market': 'A', 'countryCode': 'US'...","{'id': 19, 'placeName': {'default': 'St. Louis...","{'id': 16, 'placeName': {'default': 'Chicago'}...","{'periodType': 'REG', 'maxRegulationPeriods': 3}",{'lastPeriodType': 'REG'},/gamecenter/stl-vs-chi/2018/04/06/2017021255


In [116]:
# Create mask for homeTeam ID
mask_home_team = nhldf_temp["homeTeam"].apply(lambda x: x['id']) == 16

# Create mask for gameDate
mask_game_date = pd.to_datetime(nhldf_temp["gameDate"]) >= pd.to_datetime("2018-01-01")

mask_combined = mask_home_team & mask_game_date

# Filter dataframe using mask
nhldf = nhldf_temp[mask_combined].copy()

#Unfortunately the API does not offer information about the duration of the game. We will use 2 hours and 19 minutes as average value.
nhldf.loc[:, 'venueUTCOffset'] = nhldf['venueUTCOffset'].apply(lambda x: int(x[:3]) + int(x[4:6])/60)
nhldf.loc[:, 'Game_Start'] = pd.to_datetime(nhldf["startTimeUTC"]) + pd.to_timedelta(nhldf["venueUTCOffset"], unit="h")
nhldf["Game_End"] = nhldf["Game_Start"] + pd.Timedelta(hours=2, minutes=19) 
nhldf[["Game_Start","Game_End"]] 

Unnamed: 0,Game_Start,Game_End
45,2018-01-05 19:30:00+00:00,2018-01-05 21:49:00+00:00
46,2018-01-07 14:00:00+00:00,2018-01-07 16:19:00+00:00
48,2018-01-10 19:00:00+00:00,2018-01-10 21:19:00+00:00
49,2018-01-12 19:30:00+00:00,2018-01-12 21:49:00+00:00
50,2018-01-14 11:30:00+00:00,2018-01-14 13:49:00+00:00
51,2018-01-20 19:30:00+00:00,2018-01-20 21:49:00+00:00
52,2018-01-22 19:30:00+00:00,2018-01-22 21:49:00+00:00
53,2018-01-24 19:00:00+00:00,2018-01-24 21:19:00+00:00
58,2018-02-06 19:30:00+00:00,2018-02-06 21:49:00+00:00
59,2018-02-08 19:30:00+00:00,2018-02-08 21:49:00+00:00


## Chicago Fire FC

In [None]:
#TODO: Scrape from https://en.wikipedia.org/wiki/2018_Chicago_Fire_season

## Chicago Bulls

In [9]:
chicago_bulls_raw = pd.read_csv(r'Chicago_Bulls.csv', index_col=None)
chicago_bulls_raw = chicago_bulls_raw.loc[chicago_bulls_raw['Unnamed: 5'] =='@']

def extract_date_basketball(row):
    date = pd.to_datetime(row['Date'] + ',' +
                          row['Start (ET)'].replace("p", "PM"),
                          format='%a %b %d %Y,%I:%M%p')
    return pd.Series(data={'date': date})
chicago_bulls = pd.DataFrame(
    chicago_bulls_raw.apply(lambda row: extract_date_basketball(row), axis=1))
chicago_bulls.head()

Unnamed: 0,date
0,2018-10-18 20:00:00
2,2018-10-22 20:30:00
4,2018-10-26 19:00:00
5,2018-10-27 19:30:00
10,2018-11-05 19:30:00
