In [2]:
import requests
import pandas as pd
from datetime import datetime
import os
import time

# CSV that main() resumes from (when it already exists) and appends every fetched page to.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
file_path = r"D:\WORKSPACE\dotabet\data\simple_matches_less2022.csv"

def fetch_matches(last_match_id=None, max_retries=2, timeout=30):
    """Fetch one page of results from the OpenDota ``proMatches`` endpoint.

    Args:
        last_match_id: When truthy, sent as the ``less_than_match_id`` query
            parameter; when falsy, the endpoint is called without a query.
        max_retries: Maximum number of HTTP attempts made before giving up.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking the scrape forever.

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        Exception: If the server answers with a status other than 200, 429 or
            502, or if every attempt ended in 429/502.
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (for example when the timeout expires).
    """
    url = "https://api.opendota.com/api/proMatches"
    if last_match_id:
        url += f"?less_than_match_id={last_match_id}"

    for attempt in range(max_retries):
        # Without an explicit timeout, requests waits indefinitely on a hung socket.
        response = requests.get(url, timeout=timeout)
        status = response.status_code

        if status == 200:
            return response.json()

        if status == 429:  # Too Many Requests: fixed one-minute pause
            print(f"Too Many Requests: {response.status_code} {response.text} Wait for 60 secs")
            time.sleep(60)
        elif status == 502:  # Bad Gateway: pause grows with each failed attempt
            print(f"Bad Gateway Error: {response.status_code} {response.text} Wait for MORE secs")
            time.sleep(60 * (attempt + 1))
        else:
            # Any other status is treated as non-retryable.
            raise Exception(f"Failed to fetch data: {response.status_code} {response.text}")

    raise Exception("API request failed after maximum retries.")

def parse_matches(matches):
    """Flatten raw match dicts into rows holding only the columns kept in the CSV.

    Args:
        matches: Iterable of dicts, each read with ``.get`` so absent keys do
            not raise.

    Returns:
        A list with one dict per input match. ``start_time`` is rendered as a
        ``'%d-%m-%Y'`` string using the machine's local timezone
        (``datetime.fromtimestamp`` without ``tz``), or ``None`` when the
        match carries no ``start_time``. ``series_id``, ``series_type`` and
        ``version`` fall back to the string ``"NaN"`` when the key is absent;
        every other absent field becomes ``None``.
    """
    parsed_data = []
    for match in matches:
        # A missing/null timestamp would make fromtimestamp raise TypeError,
        # so keep it as None like the other optional fields.
        start_ts = match.get("start_time")
        if start_ts is None:
            start_time = None
        else:
            start_time = datetime.fromtimestamp(start_ts).strftime('%d-%m-%Y')

        match_data = {
            "match_id": match.get("match_id"),
            "duration": match.get("duration"),
            "start_time": start_time,
            "radiant_team_id": match.get("radiant_team_id"),
            "radiant_name": match.get("radiant_name"),
            "dire_team_id": match.get("dire_team_id"),
            "dire_name": match.get("dire_name"),
            "leagueid": match.get("leagueid"),
            "league_name": match.get("league_name"),
            "series_id": match.get("series_id", "NaN"),
            "series_type": match.get("series_type", "NaN"),
            "radiant_score": match.get("radiant_score"),
            "dire_score": match.get("dire_score"),
            "radiant_win": match.get("radiant_win"),
            "version": match.get("version", "NaN")
        }
        parsed_data.append(match_data)
    return parsed_data

def main():
    """Page through ``fetch_matches`` and append every page to ``file_path``.

    If the CSV already holds rows, fetching resumes from the ``match_id`` of
    its last row; otherwise a header-only CSV is (re)created and fetching
    starts without a ``last_match_id``. Paging stops when a page comes back
    empty or when the last match of a page started in 2013 or earlier.
    """
    columns = ["match_id", "duration", "start_time", "radiant_team_id", "radiant_name",
               "dire_team_id", "dire_name", "leagueid", "league_name", "series_id",
               "series_type", "radiant_score", "dire_score", "radiant_win", "version"]

    # Determine the starting last_match_id
    last_match_id = None
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        existing = pd.read_csv(file_path)
        # A header-only file (left behind when an earlier run failed before its
        # first append) has no last row to resume from, so start from the top.
        if not existing.empty:
            # NOTE(review): resumes from the LAST row, which presumes the rows are in
            # the order this function appends them — confirm for pre-existing files.
            last_match_id = existing['match_id'].iloc[-1]
    else:
        # Missing or zero-byte file: write the header row so later appends line up.
        pd.DataFrame(columns=columns).to_csv(file_path, index=False)

    while True:
        matches = fetch_matches(last_match_id)
        if not matches:
            break

        # Pin the column order to the header instead of relying on dict key order.
        new_df = pd.DataFrame(parse_matches(matches), columns=columns)
        # Append without a header; the header row is already in the file.
        new_df.to_csv(file_path, mode='a', header=False, index=False)

        # Update the last_match_id for the next API request
        last_match_id = matches[-1]["match_id"]

        # Stop once the last match of the page started in 2013 or earlier.
        last_match_date = datetime.fromtimestamp(matches[-1]["start_time"])
        print(f"Succesfully appended data up to {last_match_date} into csv")
        if last_match_date.year <= 2013:
            break

# Entry point: start the scrape when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Succesfully appended data up to 2014-12-09 12:07:02 into csv
Succesfully appended data up to 2014-12-06 20:31:00 into csv
Succesfully appended data up to 2014-12-04 16:01:28 into csv
Succesfully appended data up to 2014-12-02 16:14:05 into csv
Succesfully appended data up to 2014-11-30 06:09:37 into csv
Succesfully appended data up to 2014-11-28 17:55:03 into csv
Succesfully appended data up to 2014-11-26 19:13:17 into csv
Succesfully appended data up to 2014-11-24 16:20:18 into csv
Succesfully appended data up to 2014-11-23 11:56:36 into csv
Succesfully appended data up to 2014-11-22 11:03:28 into csv
Succesfully appended data up to 2014-11-20 17:02:10 into csv
Succesfully appended data up to 2014-11-18 23:17:01 into csv
Succesfully appended data up to 2014-11-17 13:40:22 into csv
Succesfully appended data up to 2014-11-16 03:29:46 into csv
Succesfully appended data up to 2014-11-14 21:05:37 into csv
Succesfully appended data up to 2014-11-13 03:10:25 into csv
Succesfully appended dat

In [20]:
# Load the CSV at file_path (the file main() appends to) for inspection.
nik = pd.read_csv(file_path)

In [21]:
# Smallest match_id present in the loaded CSV.
nik['match_id'].min()

6912090798

In [17]:
# match_id stored in the last row of the loaded CSV (positional lookup).
nik['match_id'].iloc[-1]

6945774688