# Data Extraction from the Data Golf API

This notebook pulls data from the [Data Golf](https://datagolf.com/) API and writes it to CSV files for analysis and model building.

It creates two files:
1. `{year}_{event_name}_pre_tourney_snapshot.csv` for each weekly event
2. `historical_round_scores.csv` for past scoring data
    
For now we will be focusing on `PGA` tour events, but the classes below are built to accept other tours.

## Table of Contents

* [Current Tournament Field](#currentTournamentField)
    - Pulls the golfers entered in the upcoming tournament
* [Historical Tournament Data](#historicalTournamentData)
    - Pulls historical round scoring and betting odds per golfer

In [3]:
import pandas as pd
import numpy as np
import requests
import json
import os
from tqdm.notebook import tqdm
from functools import reduce
from datetime import datetime
import warnings 
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
%load_ext dotenv
%dotenv

from config import API_PATH, tours, config
from utils import unpack_json_from_api, create_rolling_agg_features_by_golfer

# Data Golf API key, read from the environment (the %dotenv magic above loads a .env file).
# os.getenv returns None when the variable is unset.
MY_API_KEY = os.getenv('MY_API_KEY')

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Current Tournament Field  <a class="anchor" id="currentTournamentField"></a>
Extract the current tournament field and relevant details for each golfer

In [4]:
class CurrentTourneyDataGolfExtractor:
    """
    Extracts and merges Data Golf API data related to the current tournament.
    https://datagolf.com/api-access

    Use:
    dg_current_extractor = CurrentTourneyDataGolfExtractor('pga')
    dg_current_extractor.pull_and_merge_dfs()
    """

    def __init__(self,
                 tour,
                 config=config,
                 api_path=API_PATH,
                 api_key=MY_API_KEY,
                 odds_format='percent',
                ):
        """
        Args:
            tour (str): tour code sent as the `tour` query parameter, e.g. 'pga'
                (other tours such as 'opp', 'eur', 'kft' -- TODO confirm accepted values)
            config (dict, optional): maps a config key to a dict holding the
                end point 'path' and the list of 'fields' (columns) to keep
            api_path (str, optional): base path for the api end point
            api_key (str, optional): key to access the api
            odds_format (str, optional): format in which the odds are returned
        """
        self.tour = tour
        self.config = config
        self.api_path = api_path
        self.api_key = api_key
        self.odds_format = odds_format

    def get_field_updates(self, config_key):
        """
        Returns the golfers that are in the field along with their
        respective daily fantasy salaries

        Data corresponds to: https://datagolf.com/field-updates

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'

        Returns:
            DataFrame restricted to the configured 'fields'
        """
        path = self.config[config_key].get('path')
        end_point = f'{self.api_path}{path}?tour={self.tour}&key={self.api_key}'
        data, df = unpack_json_from_api(end_point, 'field')
        # Copy every top-level value of the payload (everything except the
        # 'field' records themselves) onto each row as its own column
        for col in data.keys():
            if col != 'field':
                df[col] = data[col]

        return df[self.config[config_key]['fields']]

    def get_rankings(self, config_key):
        """
        Returns the top 500 players in the current DG rankings,
        along with each player's skill estimate and respective OWGR rank

        Data corresponds to: https://datagolf.com/datagolf-rankings

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'

        Returns:
            DataFrame restricted to the configured 'fields'
        """
        path = self.config[config_key].get('path')
        end_point = f'{self.api_path}{path}?&key={self.api_key}'
        df = unpack_json_from_api(end_point, 'rankings')[1]

        # Use the instance config so a config passed to __init__ is honoured
        return df[self.config[config_key]['fields']]

    def get_pre_tourney_preds(self, config_key):
        """
        Returns full-field probabilistic forecasts for the upcoming tournament.
        Probabilities provided for various finish positions (make cut, top 20, top 5, win).

        Data corresponds to: https://datagolf.com/pga-tour-predictions

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'

        Returns:
            DataFrame restricted to the configured 'fields'; prediction columns
            are suffixed with the model name (e.g. 'win_baseline')
        """
        path = self.config[config_key].get('path')
        end_point = f'{self.api_path}{path}?tour={self.tour}&odds_format={self.odds_format}&key={self.api_key}'
        response = requests.get(end_point)
        data = response.json()

        pred_cols = ['make_cut', 'top_10', 'top_20', 'top_5', 'win']
        model_dict = {}
        for model in data['models_available']:
            # Suffix the prediction columns so the models can sit side by side
            model_dict[model] = (
                pd.DataFrame(data[model])
                .rename(columns={x: x + f'_{model}' for x in pred_cols})
            )

        # Only merge the history-fit model when the API actually returned it
        if 'baseline_history_fit' in model_dict:
            history_fit = model_dict['baseline_history_fit']
            cols_to_use = [col for col in history_fit.columns if 'history' in col] + ['dg_id']

            df = (model_dict['baseline']
                  .merge(history_fit[cols_to_use],
                         how='left',
                         on='dg_id'
                        )
                 )
        else:
            df = model_dict['baseline']

        return df[self.config[config_key]['fields']]

    def get_player_skill_decomps(self, config_key):
        """
        Returns a detailed breakdown of every player's strokes-gained prediction.

        Data corresponds to: https://datagolf.com/player-skill-decomposition

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'

        Returns:
            DataFrame restricted to the configured 'fields'
        """
        path = self.config[config_key].get('path')
        end_point = f'{self.api_path}{path}?tour={self.tour}&key={self.api_key}'
        df = unpack_json_from_api(end_point, 'players')[1]

        return df[self.config[config_key]['fields']]

    def get_current_odds(self, config_key, market, book_lst):
        """
        Returns the most recent win, top 5, top 10, top 20, make/miss cut, and first round leader odds
        for different sportsbooks

        Data corresponds to: https://datagolf.com/betting-tool-finish

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'
            market (str): value sent as the `market` query parameter, e.g. 'win'
            book_lst (iterable of str): sportsbook columns to keep, e.g. ['bet365']

        Returns:
            DataFrame with the configured 'fields' followed by the requested books
        """
        path = self.config[config_key].get('path')
        # NOTE(review): the host is hard-coded here while the other end points
        # use self.api_path -- confirm whether this should be self.api_path too
        end_point = f'https://feeds.datagolf.com/{path}?tour={self.tour}&market={market}&key={self.api_key}'
        df = unpack_json_from_api(end_point, 'odds')[1]

        # list() lets callers pass any iterable of book names, not only a list
        return df[self.config[config_key]['fields'] + list(book_lst)]

    def pull_and_merge_dfs(self):
        """
        Pulls data using the methods defined above
        Merges them together into a single DataFrame
        Returns that DataFrame

        The field updates are the left-most frame, and every other frame is
        left-joined onto it on 'dg_id'.
        """
        field_updates = self.get_field_updates('field_updates')
        rankings = self.get_rankings('rankings')
        pre_tourney_preds = self.get_pre_tourney_preds('pre_tourney_preds')
        skill_decomps = self.get_player_skill_decomps('skill_decomps')
        odds = self.get_current_odds('odds', 'win', ['bet365'])

        dfs = [field_updates,
               rankings,
               pre_tourney_preds,
               skill_decomps,
               odds,
              ]

        merged_dfs = reduce(lambda left, right: pd.merge(left, right, on='dg_id', how='left'), dfs)

        return merged_dfs

In [5]:
# Build the merged pre-tournament snapshot for the PGA tour
dg_current_extractor = CurrentTourneyDataGolfExtractor('pga')
dg_current = dg_current_extractor.pull_and_merge_dfs()

# Each tournament gets its own file: <year>_<event name in snake case>_pre_tourney_snapshot.csv
first_event_name = dg_current['event_name'][0]
event_name = '_'.join(first_event_name.lower().split())
year = f'{datetime.now():%Y}'
fn = f'{year}_{event_name}_pre_tourney_snapshot'

# Persist the snapshot without the DataFrame index
dg_current.to_csv(f'../data/{fn}.csv', index=False)

## Historical Tournament Data  <a class="anchor" id="historicalTournamentData"></a>
Extract historical round scoring data and betting odds for each golfer

In [7]:
class HistoricalDataGolfExtractor:
    """
    Class for extracting historical round scoring data
    https://datagolf.com/api-access

    Use:
    ```
    dg_hist_extractor = HistoricalDataGolfExtractor(tour='pga')

    #Pull list of events
    events = dg_hist_extractor.get_historical_event_ids('historical_event_lst')

    #Then use the list of events to pull round data
    dg_hist_extractor.get_historical_round_data('historical_round_data', events)

    #and odds data
    dg_hist_extractor.get_historical_odds('historical_odds', events, 'win', 'bet365')
    ```

    Args:
        tour (str): tour code sent as the `tour` query parameter, e.g. 'pga'
            (other tours such as 'opp', 'eur', 'kft' -- TODO confirm accepted values)
        config (dict, optional): maps a config key to a dict holding the
            end point 'path' and the list of 'fields' (columns) to keep
        api_path (str, optional): base path for the api end point
        api_key (str, optional): key to access the api
        odds_format (str, optional): format in which the odds are returned
            (stored on the instance; not read by the methods of this class)
    """

    def __init__(self,
                 tour,
                 config=config,
                 api_path=API_PATH,
                 api_key=MY_API_KEY,
                 odds_format='percent',
                ):
        self.tour = tour
        self.config = config
        self.api_path = api_path
        self.api_key = api_key
        self.odds_format = odds_format

    def get_historical_event_ids(self, config_key):
        """
        Returns the list of tournaments (and corresponding IDs) that are available through
        the historical raw data API endpoint

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'

        Returns:
            list of (calendar_year, event_id) tuples for events of `self.tour`
        """
        path = self.config[config_key].get('path')
        end_point = f'{self.api_path}{path}?&key={self.api_key}'
        df = (unpack_json_from_api(end_point)[1]
              .query(f"tour == '{self.tour}'")
              .reset_index(drop=True)
             )

        # Use the instance config so a config passed to __init__ is honoured
        df = df[self.config[config_key]['fields']]
        return list(zip(df.calendar_year, df.event_id))

    def get_historical_round_data(self, config_key, events):
        """
        Returns round-level scoring and strokes gained data

        Data corresponds to: https://datagolf.com/raw-data-archive

        Args:
            config_key (str): key in `self.config` holding the end point 'path'
            events (iterable): (year, event_id) pairs, as returned by
                `get_historical_event_ids`

        Returns:
            DataFrame with one row per player per round for which the API
            returned data (rounds 'round_1' to 'round_4')

        Raises:
            ValueError: if no round data was collected for any event
        """
        path = self.config[config_key].get('path')
        round_cols = [f'round_{round_n}' for round_n in range(1, 5)]
        round_lst = []
        for event in tqdm(list(events)):
            year, event_id = event[0], event[1]
            end_point = f'https://feeds.datagolf.com/{path}?tour={self.tour}&event_id={event_id}&year={year}&key={self.api_key}'
            df = unpack_json_from_api(end_point)[1]
            for player_scores in df['scores']:
                # Player-level values are everything that is not a per-round dict
                player_info = pd.DataFrame(
                    {k: v for k, v in player_scores.items() if k not in round_cols},
                    index=[0],
                )

                for r in round_cols:
                    round_data = player_scores.get(r)
                    # Rounds the player did not play are absent/None -- skip them
                    if round_data is not None:
                        round_df = pd.DataFrame(round_data, index=[0])
                        round_df['round'] = r
                        round_df['year'] = year
                        round_df['event_id'] = event_id
                        round_df['event_completed'] = df['event_completed'].iloc[0]
                        round_df['event_name'] = df['event_name'].iloc[0]

                        round_lst.append(pd.concat([player_info, round_df], axis=1))

        if not round_lst:
            # pd.concat([]) would raise an opaque ValueError; say what happened
            raise ValueError('No round data was returned for the requested events')

        return pd.concat(round_lst).reset_index(drop=True)

    def get_historical_odds(self, config_key, events, market, book, min_year=2019):
        """
        Returns opening and closing lines in various markets (win, top 5, make cut, etc.) at 11 sportsbooks

        Data corresponds to: https://datagolf.com/outright-odds-archive

        Args:
            config_key (str): key in `self.config` holding 'path' and 'fields'
            events (iterable): (year, event_id) pairs, as returned by
                `get_historical_event_ids`
            market (str): value sent as the `market` query parameter, e.g. 'win'
            book (str): value sent as the `book` query parameter, e.g. 'bet365'
            min_year (int, optional): events before this year are skipped;
                the archive doesn't have odds data prior to 2019

        Returns:
            DataFrame restricted to the configured 'fields', one row per
            player per event with a successful response

        Raises:
            ValueError: if no odds were collected for any event
        """
        path = self.config[config_key].get('path')
        odds_lst = []
        for event in tqdm([x for x in events if x[0] >= min_year]):
            year, event_id = event[0], event[1]
            end_point = f'https://feeds.datagolf.com/{path}?tour={self.tour}&event_id={event_id}&year={year}&market={market}&book={book}&key={self.api_key}'
            response = requests.get(end_point)
            if response.status_code == 200:
                df = pd.DataFrame(response.json())
                for player_odds_raw in df['odds']:
                    player_odds = pd.DataFrame(player_odds_raw, index=[0])
                    player_odds['year'] = year
                    player_odds['event_id'] = event_id

                    odds_lst.append(player_odds)
            else:
                # Not every event has odds data, so report it and carry on
                print(f"Bad response for {event}")

        if not odds_lst:
            # pd.concat([]) would raise an opaque ValueError; say what happened
            raise ValueError('No odds data was returned for the requested events')

        hist_odd_df = pd.concat(odds_lst).reset_index(drop=True)
        return hist_odd_df[self.config[config_key]['fields']]

In [8]:
# Extractor for PGA tour history
dg_hist_extractor = HistoricalDataGolfExtractor(tour='pga')

# (year, event_id) pairs available in the historical archive
events = dg_hist_extractor.get_historical_event_ids('historical_event_lst')

# Round-by-round scores for every player in those events
round_data = dg_hist_extractor.get_historical_round_data('historical_round_data', events)

# Betting odds per tournament
# Some events have no odds data, so a few "Bad response" messages are expected
odds = dg_hist_extractor.get_historical_odds('historical_odds', events, 'win', 'bet365')

# Attach the odds to each round row, keeping rounds that have no odds
historical_round_scores = pd.merge(
    round_data,
    odds,
    how='left',
    on=['dg_id', 'year', 'event_id'],
)

# Persist the combined history without the DataFrame index
historical_round_scores.to_csv('../data/historical_round_scores.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=213.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=121.0), HTML(value='')))

Bad response for (2020, 472)
Bad response for (2020, 483)
Bad response for (2019, 478)
Bad response for (2019, 528)
Bad response for (2019, 472)
Bad response for (2019, 518)
Bad response for (2019, 483)
Bad response for (2019, 2)
Bad response for (2019, 6)
Bad response for (2019, 16)

