# Valorant Match Scraper Prototype
This script does the following:
1. Retrieves match URLs from the VLR website
2. Creates the headers for two CSV files
3. Exports match and game details into those two separate CSV files

### Import Dependencies

In [None]:
from bs4 import BeautifulSoup
import requests
from csv import writer
import re
import pandas

### Champions Tour North America Stage 1: Challengers
The URL used in this script was retrieved manually from the VLR website.

In [None]:
# Listing page of the event's completed matches; the match links are read from it.
url = "https://www.vlr.gg/event/matches/799/champions-tour-north-america-stage-1-challengers/?group=completed"
# Without a timeout, requests waits indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty list of match links.
page.raise_for_status()

Parsing the HTML tree to get each match URL:

In [None]:
soup = BeautifulSoup(page.content, 'html.parser')


def _is_plain_card(tag):
    """Return True for a <div> whose class list is exactly ['wf-card']."""
    return tag.name == 'div' and tag.get('class') == ['wf-card']


# Collect the href of every anchor found inside those cards, in document order.
match_urls = [
    anchor.get('href')
    for card in soup.find_all(_is_plain_card)
    for anchor in card.find_all('a')
]

# Each entry of match_urls is the sub-link (site-relative path) of one match page.

### Prepare the headers for the CSV files
This cell writes the header row of each CSV file that the scraper will produce.

In [None]:
# Per-player stat columns, in the order they appear for every player.
STAT_NAMES = ['ACS', 'K', 'D', 'A', 'KD', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'FKFD']


def _team_columns(first_player, with_agents):
    """Return the header columns for five consecutive players.

    The columns cover players ``first_player`` .. ``first_player + 4`` and are
    laid out as: the five ``playerN_agent`` columns (only when *with_agents*
    is true), the five ``playerN_ID`` columns, then every stat in
    ``STAT_NAMES`` for each player in turn.
    """
    players = range(first_player, first_player + 5)
    columns = []
    if with_agents:
        columns += ['player{}_agent'.format(p) for p in players]
    columns += ['player{}_ID'.format(p) for p in players]
    columns += ['player{}_{}'.format(p, stat) for p in players for stat in STAT_NAMES]
    return columns


# newline='' is required by the csv module; without it every row is followed
# by a blank line on platforms that translate '\n' on write.
with open('matches1_NA.csv', 'w', newline='') as f:
    thewriter = writer(f)
    header = ['match_id', 'date', 'time', 'patch_number', 'team0_ID', 'team1_ID',
              'team0_score', 'team1_score', 'winner_ID', 'loser_ID']
    # Players 0-4 first, then players 5-9.
    header += _team_columns(0, with_agents=False)
    header += _team_columns(5, with_agents=False)
    thewriter.writerow(header)

with open('games1_NA.csv', 'w', newline='') as f:
    thewriter = writer(f)
    header = ['match_id', 'map', 'team0_score', 'team1_score', 'winner_ID', 'loser_ID']
    # Same layout as the match file, with the agent columns ahead of each
    # group's player IDs.
    header += _team_columns(0, with_agents=True)
    header += _team_columns(5, with_agents=True)
    thewriter.writerow(header)

### Get data for each match
This cell iterates over the matches in the `match_urls` list (the loop below starts at index 232) and retrieves the relevant details of each match.

In [None]:
# Index of the first entry of `match_urls` to scrape.
# NOTE(review): 232 skips the first 232 links -- this looks like a leftover
# resume offset from an interrupted run; set it to 0 to scrape every match.
START_INDEX = 232
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Output files. These are the files the header cell creates, so the appended
# rows end up underneath their header row.
MATCHES_CSV = 'matches1_NA.csv'
GAMES_CSV = 'games1_NA.csv'


def _divs_with_exact_class(soup, class_name):
    """Return every <div> in *soup* whose class list is exactly [class_name]."""
    return soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == [class_name])


for match_index in range(START_INDEX, len(match_urls)):
    url = "https://www.vlr.gg" + match_urls[match_index]
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page should not abort the whole scrape.
        print("Request failed at ", url, err)
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    # The first column (match_id) is the match URL itself.
    data = [url]

    # Get Date, Time and Patch Version contained in the Match Header
    for div in _divs_with_exact_class(soup, 'match-header-date'):
        # The data-utc-ts attribute is split on whitespace: first part is
        # stored as the date, second part as the time.
        timestamp = div.find('div').attrs['data-utc-ts']
        date = timestamp.split()[0]
        time = timestamp.split()[1]

        # Not every header is guaranteed to contain a 'Patch' string; keep
        # the column (empty) instead of crashing on None.
        patch_text = div.find(string=re.compile('Patch'))
        patchnum = patch_text.strip() if patch_text is not None else ''

        data.append(date)
        data.append(time)
        data.append(patchnum)

    # Get Team ID's (the hrefs of the links in the versus header)
    teams = []
    for div in _divs_with_exact_class(soup, 'match-header-vs'):
        for link in div.find_all('a', href=True):
            teams.append(link['href'])
    if len(teams) < 2:
        print("Error at ", url)
        continue
    data.append(teams[0])
    data.append(teams[1])

    # Get the match score of both teams
    scores = []
    for div in _divs_with_exact_class(soup, 'js-spoiler'):
        for span in div.find_all('span'):
            # Skip the span that holds the separator between the two scores.
            if (span.attrs['class'])[0] != 'match-header-vs-score-colon':
                scores.append((span.text).strip())

    # Validate before use: missing, non-numeric or tied scores cannot
    # produce a winner, so the match is skipped.
    try:
        team0_score = int(scores[0])
        team1_score = int(scores[1])
    except (IndexError, ValueError):
        print("Invalid scores at ", url)
        continue
    if team0_score == team1_score:
        print("Invalid scores at ", url)
        continue
    data.append(scores[0])
    data.append(scores[1])

    # Get Winner and Loser
    if team0_score > team1_score:
        data.append(teams[0])
        data.append(teams[1])
    else:
        data.append(teams[1])
        data.append(teams[0])

    # Get MAPS: keep only the letters of each navigation item's text
    maps = []
    for div in soup.find_all('div', class_='vm-stats-gamesnav-item js-map-switch'):
        letters = re.findall(r"[A-Za-z]", div.text.strip())
        maps.append(''.join(letters))

    # Get all Game Tables. 1 Game Table = 1 set of 5 players.
    # One Game consists of one pair of Game Tables.
    # The number of games in a match is determined by the (number of Game Tables/2) - 1
    match_stats = []
    match_agents = []
    for table in soup.find_all('table', class_='wf-table-inset mod-overview'):
        game_stats = []
        game_agents = []

        # Player IDs come first in each stats list ...
        for player_id in table.find_all('a', href=True):
            game_stats.append(player_id['href'])

        # ... followed by the text of every stat cell of the table.
        for stats in table.find_all('td', class_='mod-stat'):
            game_stats.append(stats.text.strip())

        # Agent names are taken from the title of the agent images.
        for cell in table.find_all('td', class_='mod-agents'):
            for img in cell.find_all('img'):
                game_agents.append((img.attrs)['title'])

        match_stats.append(game_stats)
        match_agents.append(game_agents)

    # The swap below needs two pairs of tables (first game + match overview).
    if len(match_stats) < 4:
        print("Error at ", url)
        continue

    # The second pair of tables is the Match Overview. Swap it with the first
    # pair so that tables 0-1 are the overview and tables 2-3 are the first
    # game. Both lists must be swapped the same way; a one-directional copy
    # would overwrite the first game's stats with the overview.
    match_stats[0:2], match_stats[2:4] = match_stats[2:4], match_stats[0:2]
    match_agents[0:2], match_agents[2:4] = match_agents[2:4], match_agents[0:2]

    # Get the scores of each game in the match (two entries per game)
    game_scores = []
    for div in soup.find_all('div', class_='vm-stats-game-header'):
        for subdiv in div.find_all('div', class_='score'):
            game_scores.append(subdiv.text.strip())

    # Export match data to CSV File.
    # newline='' is required by the csv module to avoid blank rows.
    with open(MATCHES_CSV, 'a', newline='') as f:
        thewriter = writer(f)
        data = data + match_stats[0] + match_stats[1]
        thewriter.writerow(data)

    # Export each game in the match to the CSV file.
    # Game number g uses tables 2g and 2g+1, map g-1 and score entries
    # 2(g-1) and 2(g-1)+1.
    with open(GAMES_CSV, 'a', newline='') as f:
        thewriter = writer(f)
        for game_no in range(1, len(match_stats) // 2):
            try:
                map_name = maps[game_no - 1]
                team0_game_score = int(game_scores[(game_no - 1) * 2])
                team1_game_score = int(game_scores[(game_no - 1) * 2 + 1])
            except (IndexError, ValueError) as err:
                # Later games would hit the same gap, so stop this match
                # here -- but say so instead of failing silently.
                print("Incomplete game data at ", url, err)
                break

            # Error correction for invalid game scores
            if team0_game_score == team1_game_score:
                print("Invalid game scores at ", url)
                continue
            elif team0_game_score > team1_game_score:
                game_winner, game_loser = teams[0], teams[1]
            else:
                game_winner, game_loser = teams[1], teams[0]

            matchdata = [url, map_name, team0_game_score, team1_game_score,
                         game_winner, game_loser]
            matchdata = (matchdata
                         + match_agents[game_no * 2] + match_stats[game_no * 2]
                         + match_agents[game_no * 2 + 1] + match_stats[game_no * 2 + 1])
            thewriter.writerow(matchdata)