In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by export() live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

import re
import pandas as pd
from bs4 import BeautifulSoup

Mounted at /content/drive


## Selenium Installation and Configuration

In [None]:
# install chromium, its driver, and selenium
# Refresh the apt package index before installing anything.
!apt-get update
# Install the chromium-chromedriver package.
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Install the Selenium Python package (no version pin, so the latest release is pulled).
!pip install selenium
from selenium import webdriver


In [None]:
def config_wd():
  """Create a headless Chrome WebDriver.

  The driver is started with the ``--headless``, ``--no-sandbox`` and
  ``--disable-dev-shm-usage`` Chrome flags.

  Returns:
    The object returned by ``webdriver.Chrome``.
  """
  options = webdriver.ChromeOptions()
  for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(flag)
  # Do not pass the driver path positionally: Selenium is installed unpinned,
  # and releases from 4.10 on no longer accept an executable path here (the
  # first positional parameter is ``options``). The install cell copies
  # chromedriver into /usr/bin, so it is looked up by name, which is also
  # what older releases did by default.
  return webdriver.Chrome(options=options)

# Module-level driver instance. parse_leaderboards() and get_player_stats() each call config_wd() for their own driver.
wd = config_wd()



## Scraping Functions

### Pulling players from leaderboards

In [None]:
def get_leaderboard_info(wd, url):
  """Scrape one leaderboard page into a dict keyed by leaderboard rank.

  Args:
    wd: Driver object exposing ``get(url)`` and ``page_source``.
    url: Address of the leaderboard page to load.

  Returns:
    dict mapping each row's rank text to a dict with the keys
    ``profile_href``, ``gaming_system``, ``handle`` and ``cod_level``.
    Links to ``/profile/...`` that do not contain both a level element and a
    rank element are skipped.
  """
  # The rank cell is located by its exact inline style string.
  leaderboard_rank_attrs = {'style': 'float:left;min-width:30px;text-align:center;font-weight:bold;font-size:16px;max-width:100px;width:calc(20%);'}

  # Load the page in the browser and parse the rendered HTML.
  wd.get(url)
  soup = BeautifulSoup(wd.page_source, 'html.parser')

  page_data = {}
  for profile in soup.find_all('a', {'href': re.compile(r'^/profile/')}):
    level_div = profile.find('div', attrs={'class': 'shad0'})
    rank_div = profile.find('div', attrs=leaderboard_rank_attrs)
    if level_div is None or rank_div is None:
      # A profile link that is not a leaderboard row; skipping it keeps one
      # unexpected link from aborting the whole page with an IndexError.
      continue

    profile_href = profile['href']
    # The last two path segments of the href are used as platform and handle.
    gaming_system, handle = profile_href.split('/')[-2:]

    page_data[rank_div.text] = {'profile_href': profile_href
                               ,'gaming_system': gaming_system
                               ,'handle': handle
                               ,'cod_level': level_div.text}
  return page_data


def parse_leaderboards(start_page, end_page, base_url):
  """Scrape a range of leaderboard pages.

  Args:
    start_page: First page number to fetch (inclusive).
    end_page: Last page number to fetch (inclusive).
    base_url: URL prefix; the page number is appended to it as text.

  Returns:
    list with one ``get_leaderboard_info`` result per page, in page order.
  """
  wd = config_wd()
  try:
    return [get_leaderboard_info(wd, base_url + str(page_number))
            for page_number in range(start_page, end_page + 1)]
  finally:
    # Close the browser even if a page fails, so repeated calls do not
    # leave driver processes running.
    wd.quit()

### Retrieving Player Stats

In [None]:
def get_match_stats(soup, stats_per_match=10):
  """Extract per-match stats from a parsed player profile page.

  Each ``match_stats_adv`` element's text is split with a regex; the first
  surviving token is used as the label and the last as the value, unless the
  last token is ``'m'``, in which case the token before it is the value.
  Consecutive groups of ``stats_per_match`` entries form one match, and the
  i-th group is labeled with the i-th match-date text.

  Args:
    soup: Parsed page exposing ``find_all(name, attrs=...)``.
    stats_per_match: Positive number of stat entries that make up one match.

  Returns:
    dict mapping match-date text to a list of ``(label, value)`` tuples.
    A trailing incomplete group is dropped, as are groups with no matching
    date. Groups sharing the same date text overwrite each other (last wins).
  """
  match_date_style = "float:left;color:#fff;padding:5 3px;font-size:10px;line-height:14px;width:34px;text-align:center;"
  match_dates = [div.text for div in soup.find_all('div', attrs={'style': match_date_style})]

  stat_pattern = re.compile(r'(\s)(\d+)|[%]|[a-z](\d+)')
  match_stats = []
  for div in soup.find_all('div', attrs={'class': "match_stats_adv"}):
    # re.split yields None/'' for groups that did not take part; drop those.
    stat_arr = list(filter(None, stat_pattern.split(div.text)))
    value = stat_arr[-2] if stat_arr[-1] == 'm' else stat_arr[-1]
    match_stats.append((stat_arr[0], value))

  # Count whole groups so the final complete match is included; the previous
  # range(10, len, 10) stopped one group short.
  complete_matches = len(match_stats) // stats_per_match
  match_stats_labeled = {}
  for match_idx, match_date in zip(range(complete_matches), match_dates):
    start_idx = match_idx * stats_per_match
    match_stats_labeled[match_date] = match_stats[start_idx:start_idx + stats_per_match]
  return match_stats_labeled


def get_general_stats(soup):
  """Return label -> value stats from the first content block that has labels.

  Args:
    soup: Parsed page exposing ``find_all(name, attrs=...)``.

  Returns:
    dict pairing each label text with the value text at the same position,
    taken from the first ``content_block`` element containing at least one
    label; ``None`` when no block has labels.
  """
  label_attrs = {"style": "color:#fff;font-size:18px;float:left;"}
  value_attrs = {"style": "color:#52bafe;font-size:20px;float:right;font-weight:bold;"}

  for block in soup.find_all('div', attrs={"class": "content_block"}):
    labels = [node.text for node in block.find_all('div', attrs=label_attrs)]
    values = [node.text for node in block.find_all('div', attrs=value_attrs)]
    if not labels:
      continue
    return dict(zip(labels, values))
  return None

def get_player_stats(leaderboard_handles, match_stats=True):
  """Visit every player's profile page and collect their stats.

  Args:
    leaderboard_handles: Iterable of page dicts; each value is a dict with
      at least the keys ``profile_href`` and ``handle``.
    match_stats: When true, per-match stats are collected in addition to
      the general stats.

  Returns:
    Tuple ``(match_stats_all, general_stats_all)``, both dicts keyed by
    player handle. ``match_stats_all`` stays empty when ``match_stats`` is
    false. Players whose profile page fails to load are absent from both.
  """
  base_path = 'https://codstats.net'

  match_stats_all = {}
  general_stats_all = {}
  wd = config_wd()
  try:
    for page in leaderboard_handles:
      for player_info in page.values():
        profile_url = base_path + player_info['profile_href']
        try:
          wd.get(profile_url)
        except Exception:
          # Report and skip only this player. ``Exception`` (not a bare
          # except) lets KeyboardInterrupt still stop a long scrape.
          print('error: ', profile_url)
          continue
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        handle = player_info['handle']
        if match_stats:
          match_stats_all[handle] = get_match_stats(soup)
        general_stats_all[handle] = get_general_stats(soup)
  finally:
    # Always shut the browser down so the driver process is not leaked.
    wd.quit()
  return match_stats_all, general_stats_all

## Reformatting Functions

In [None]:
def reformat_leaderboard_dict(leaderboard_dict):
  """Flatten scraped leaderboard pages into a single DataFrame.

  Args:
    leaderboard_dict: Iterable of page dicts mapping rank -> player info dict.

  Returns:
    DataFrame with a ``rank`` column followed by one column per player-info
    key; the column names come from the first non-empty player-info dict.
  """
  field_names = []
  rows = []
  for page in leaderboard_dict:
    for rank, player_info in page.items():
      # Take the header from the first player that provides any keys.
      field_names = field_names or player_info.keys()
      rows.append([rank] + list(player_info.values()))
  header = ['rank'] + list(field_names)
  return pd.DataFrame(rows, columns=header)

In [None]:
def reformat_matches_dict(match_dict):
  """Flatten per-player match stats into one row per (player, match date).

  Args:
    match_dict: dict mapping handle -> {match date -> list of
      (label, value) pairs}.

  Returns:
    DataFrame with ``handle`` and ``match_date`` columns followed by one
    column per stat label; labels are taken from the first non-empty match.
  """
  stat_labels = []
  rows = []
  for player, match_info in match_dict.items():
    for date, stats_raw in match_info.items():
      # Header labels come from the first match that has any stats.
      stat_labels = stat_labels or [pair[0] for pair in stats_raw]
      rows.append([player, date] + [pair[1] for pair in stats_raw])

  header = ['handle', 'match_date'] + stat_labels
  return pd.DataFrame(rows, columns=header)

In [None]:
def reformat_general_stats_dict(stat_dict):
  """Turn per-player general stats into a DataFrame, one row per player.

  Players whose stats are missing or empty are reported with a printed
  message and left out of the result.

  Args:
    stat_dict: dict mapping handle -> stats dict (label -> value), or a
      falsy value such as ``None`` when no stats were retrieved.

  Returns:
    DataFrame with a ``handle`` column followed by one column per stat
    label; labels come from the first player that has stats. When no player
    has stats the frame is empty but still has the ``handle`` column.
  """
  stat_arr = []
  cols = None
  for handle, stats in stat_dict.items():
    if not stats:
      print(f'unable to retrieve stats for {handle}')
      continue
    if cols is None:
      # Read the labels only after confirming stats exist; doing it first
      # raised AttributeError when the first player had no stats.
      cols = ['handle'] + list(stats.keys())
    stat_arr.append([handle] + list(stats.values()))

  return pd.DataFrame(stat_arr, columns=cols or ['handle'])

Jandrew7


## Dispatch and Export Functions

In [None]:
def export(general_stats_df, match_df, leaderboard_df, first_batch,
           output_dir='/content/drive/MyDrive/capstone/data'):
  """Write the scraped stats to two CSV files.

  ``averaged_stats.csv`` receives ``general_stats_df`` as is.
  ``match_stats.csv`` receives the leaderboard left-joined on ``handle``
  with the outer join of general and match stats.

  Args:
    general_stats_df: Per-player general stats; must have a ``handle`` column.
    match_df: Per-match stats; must have a ``handle`` column.
    leaderboard_df: Leaderboard rows; must have a ``handle`` column.
    first_batch: True to overwrite the files and write a header row;
      False to append rows without a header.
    output_dir: Directory the two CSV files are written to.
  """
  stats_merged = general_stats_df.merge(match_df, how='outer', on=['handle'])
  full_df = leaderboard_df.merge(stats_merged, how='left', on='handle')

  if first_batch:
    to_csv_kwargs = {'mode': 'w', 'header': True}
  else:
    # header must be the boolean False; the string 'False' is truthy and
    # would not suppress the header on appended batches.
    to_csv_kwargs = {'mode': 'a', 'header': False}

  general_stats_df.to_csv(f'{output_dir}/averaged_stats.csv', **to_csv_kwargs)
  full_df.to_csv(f'{output_dir}/match_stats.csv', **to_csv_kwargs)

def dispatch(page_start, page_stop, url, first_batch = False):
  """Run the full pipeline for a range of leaderboard pages.

  Scrapes the leaderboard pages, fetches every listed player's stats,
  converts the results to DataFrames and exports them to CSV.

  Args:
    page_start: First leaderboard page number (inclusive).
    page_stop: Last leaderboard page number (inclusive).
    url: Leaderboard URL prefix to which the page number is appended.
    first_batch: Passed through to ``export``; True overwrites the CSV
      files with a header, False appends to them.
  """
  leaderboard_dict = parse_leaderboards(page_start, page_stop, url)
  match_stats_dict, general_stats_dict = get_player_stats(leaderboard_dict)

  # Use the reformat helpers under the names they are defined with (*_dict).
  leaderboard_df = reformat_leaderboard_dict(leaderboard_dict)
  match_stats_df = reformat_matches_dict(match_stats_dict)
  general_stats_df = reformat_general_stats_dict(general_stats_dict)

  export(general_stats_df, match_stats_df, leaderboard_df, first_batch)