In [43]:
import urllib2, sys
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from IPython.core.display import HTML
import requests
import time
import datetime
import urlparse
import os

In [4]:
def get_teamID(team_name):
    """Look up the team id stored for ``team_name`` in the scraped team table.

    Reads ``../../scrapped_data/team_info.csv`` (first column used as the
    index) on every call and returns the ``team_id`` value of the first row
    whose ``team_name`` column equals ``team_name`` exactly.

    Args:
        team_name: team name to search for; compared with ``==`` against the
            ``team_name`` column, so the match is case-sensitive.

    Returns:
        The ``team_id`` of the first matching row.

    Raises:
        IndexError: if no row matches. This is the same exception type a bare
            ``[0]`` on the empty result would raise, but the message names the
            team that could not be found.
    """
    team_info = pd.read_csv('../../scrapped_data/team_info.csv', index_col=0)
    team_ids = team_info.loc[team_info.team_name == team_name, 'team_id'].tolist()
    if not team_ids:
        raise IndexError(
            "team {0!r} not found in team_info.csv".format(team_name))
    return team_ids[0]

In [5]:
def get_soup(url):
    """Download ``url`` and return the response parsed as a BeautifulSoup tree.

    The request is sent with a ``User-Agent: Mozilla/5.0`` header. The HTTP
    response object is closed as soon as parsing has finished, whether or not
    parsing succeeded, so connections are not left open while a long scrape
    is running.

    Args:
        url: address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the response.
    """
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(url, headers=hdr)
    page = urllib2.urlopen(req)
    try:
        # NOTE(review): no parser is named here, so the parser used depends on
        # what is installed alongside bs4 - consider pinning one explicitly.
        soup = BeautifulSoup(page)
    finally:
        page.close()
    return soup

In [45]:
def get_match_urls(params):
    """Collect the match URLs listed on the HLTV results pages for one query.

    The results listing is requested page by page (``offset`` advanced by 100
    each time). From the first ``div.results-all`` on each page, every
    ``a.a-reset`` link is turned into an absolute URL by prefixing
    ``https://www.hltv.org`` to its ``href``.

    Paging stops when any of the following holds:
      * the page has no ``div.results-all`` at all,
      * the page contributed no links (otherwise the running total would stay
        a multiple of 100 and the loop would never end),
      * the running total of links is not a multiple of 100, i.e. the last
        page was not full.

    Args:
        params: mapping providing ``teamID``, ``startDate`` and ``endDate``
            for the query string. It is copied, never modified, so the caller
            can keep using it even if a request raises part-way through.

    Returns:
        List of absolute match URLs in the order they were encountered.
    """
    page_size = 100  # step used for the ``offset`` query parameter
    query = dict(params)  # private copy: 'offset' must not leak to the caller
    query['offset'] = 0
    urls = []
    while True:
        match_page = "https://www.hltv.org/results?offset={offset}&content=demo&team={teamID}&startDate={startDate}&endDate={endDate}".format(**query)
        soup = get_soup(match_page)
        matches = soup.find_all("div", class_='results-all')

        if len(matches) == 0:
            break

        results = matches[0].find_all("a", class_="a-reset")
        page_urls = ['https://www.hltv.org' + result['href'] for result in results]
        urls += page_urls

        # An empty page must end the loop explicitly: it leaves len(urls)
        # unchanged, so the modulo test alone could spin forever.
        if not page_urls or len(urls) % page_size != 0:
            break
        query['offset'] += page_size
    return urls

In [46]:
def parse_match(site):
    """Scrape one HLTV match page plus its two linked statistics pages.

    Steps, in order:
      1. Parse ``site`` with ``get_soup`` and read the demo link (first
         ``a.flexbox.left-right-padding``), the veto lines and the
         "Detailed stats" link.
      2. Fetch ``site`` again with ``requests`` so ``pd.read_html`` can parse
         the raw HTML; the first two tables are kept as the team tables.
      3. Build the map-stats and performance URLs from the 6th and 7th
         ``/``-separated pieces of the detailed-stats URL and merge the
         results of ``parse_stats_page`` and
         ``parse_stats_performance_page`` into the returned dict.

    Args:
        site: absolute URL of the match page.

    Returns:
        dict with the keys ``url``, ``demo_url``, ``stats_url``, ``vetos``
        (list of strings, or ``None`` when the veto box could not be read)
        and ``teams`` (two tables), plus every key returned by the two
        stats-page parsers.

    Raises:
        IndexError: if the demo link, the "Detailed stats" link or the second
            table is missing from the page.
    """
    soup = get_soup(site)

    demo_url = 'https://www.hltv.org' + soup.find_all("a", class_="flexbox left-right-padding")[0]['href']

    # Vetos are best-effort: a page without the expected veto box yields None.
    # ``Exception`` (not a bare except) so Ctrl-C still interrupts a scrape.
    try:
        veto_box = soup.find_all("div", class_="standard-box veto-box")[1]
        vetos = [veto.text for veto in veto_box.find_all("div")[0].find_all("div")]
    except Exception:
        vetos = None

    stats_links = [a_element['href'] for a_element in soup.find_all("a") if a_element.text == "Detailed stats"]
    stats_url = 'https://www.hltv.org' + stats_links[0]

    # NOTE(review): this downloads the match page a second time; get_soup()
    # does not expose the raw HTML that read_html is given here.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(site, headers=hdr)
    tables = pd.read_html(r.text, header=0)
    team_a, team_b = tables[0], tables[1]

    match_data = {
        'url': site,
        'demo_url': demo_url,
        'stats_url': stats_url,
        'vetos': vetos,
        'teams': [team_a, team_b]
    }

    # For 'https://www.hltv.org/stats/matches/<id>/<slug>' pieces 5 and 6 of
    # the '/'-split are '<id>' and '<slug>'.
    stats_parts = stats_url.split('/')
    stats_path = stats_parts[5] + '/' + stats_parts[6]
    map_stats_url = 'https://www.hltv.org/stats/matches/mapstatsid/' + stats_path
    performance_url = 'https://www.hltv.org/stats/matches/performance/mapstatsid/' + stats_path

    match_data.update(parse_stats_page(map_stats_url))
    match_data.update(parse_stats_performance_page(performance_url))

    return match_data

In [14]:
def scrape_match_data(team_name, startDate, endDate, delay=5):
    """Scrape every listed match of one team within a date range.

    Resolves the team id with ``get_teamID``, collects the match URLs with
    ``get_match_urls`` and runs ``parse_match`` on each of them in order,
    pausing after every match and printing a progress line.

    Args:
        team_name: team name as stored in the team table used by
            ``get_teamID``.
        startDate: start of the range, passed through unchanged as the
            ``startDate`` query parameter.
        endDate: end of the range, passed through unchanged as the
            ``endDate`` query parameter.
        delay: seconds to sleep after each parsed match. Defaults to 5.

    Returns:
        List with one ``parse_match`` result per match URL, in listing order.
    """
    teamID = get_teamID(team_name)
    params = {
        'teamID': teamID,
        'startDate': startDate,
        'endDate': endDate
    }
    urls = get_match_urls(params)
    matches = []
    for idx, url in enumerate(urls):
        matches.append(parse_match(url))
        time.sleep(delay)
        # Call form with a single argument prints the same text on Python 2
        # and Python 3.
        print('match {0} done'.format(idx))
    return matches

In [40]:
def parse_stats_performance_page(url):
    """Read the first four HTML tables of a stats "performance" page.

    Example page:
    "https://www.hltv.org/stats/matches/performance/mapstatsid/52325/immortals-vs-cloud9"

    The page is fetched with a ``User-Agent: Mozilla/5.0`` header and handed
    to ``pd.read_html`` with the first row of each table used as its header.

    Returns:
        dict mapping, by table position on the page:
          * ``total_team_kda`` - table 0 (total kills, deaths and assists of
            the team)
          * ``who_kill_who``   - table 1 (who killed who)
          * ``first_kills``    - table 2 (first kill of the round)
          * ``awp_kills``      - table 3 (AWP kills)

    Raises:
        IndexError: if the page yields fewer than four tables.
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    parsed_tables = pd.read_html(response.text, header=0)

    # Result key for each table, listed in the order the tables appear.
    table_names = ('total_team_kda', 'who_kill_who', 'first_kills', 'awp_kills')
    return dict(
        (name, parsed_tables[position])
        for position, name in enumerate(table_names)
    )

In [41]:
def parse_stats_page(url):
    """Extract the match time and the round history from a map stats page.

    Example page:
    "https://www.hltv.org/stats/matches/mapstatsid/52325/immortals-vs-cloud9"

    The page is downloaded once, through ``get_soup``.

    Args:
        url: absolute URL of the map stats page.

    Returns:
        dict with:
          * ``match_time``   - ``datetime.datetime`` parsed from the text of
            the last ``div.small-text`` on the page.
          * ``team_scores``  - ``[team_a, team_b]``; each entry is a list of
            one-element lists holding the ``title`` attribute of every image
            in that team's round-history row (per the original notes: the
            rounds that team won).
          * ``team_endings`` - ``[team_a, team_b]``; same layout, holding the
            image file name without extension (per the original notes: how
            the round was won).

    Raises:
        ValueError: if the page has no ``div.small-text`` or its text does
            not match the expected time format.
        IndexError: if fewer than two ``div.round-history-team-row`` elements
            are present.
    """
    soup = get_soup(url)

    time_divs = soup.find_all("div", {"class": "small-text"})
    if not time_divs:
        raise ValueError(
            "no 'small-text' div holding the match time on {0}".format(url))
    # When several such divs exist the last one is the one that is parsed.
    # The literal 'Map' is part of the format - presumably the div's text
    # runs straight into a following "Map" label; confirm against the page.
    match_time = datetime.datetime.strptime(time_divs[-1].text, '%Y-%m-%d  %H:%MMap')

    # Only the first two round-history rows are read: team A, then team B.
    round_history_team = soup.find_all("div", class_="round-history-team-row")
    round_history_team_a = round_history_team[0].find_all("img")
    round_history_team_b = round_history_team[1].find_all("img")

    return {
        'match_time': match_time,  # match date and time
        'team_scores': [_round_titles(round_history_team_a),
                        _round_titles(round_history_team_b)],  # per team: image titles
        'team_endings': [_round_endings(round_history_team_a),
                         _round_endings(round_history_team_b)]  # per team: image names
    }


def _round_titles(images):
    """Return ``[[title], ...]`` with the ``title`` attribute of each image."""
    return [[image.get('title')] for image in images]


def _round_endings(images):
    """Return ``[[name], ...]``: each image's ``src`` file name, extension removed."""
    endings = []
    for image in images:
        src_path = urlparse.urlparse(image.get('src')).path
        file_name = os.path.basename(src_path)
        endings.append([os.path.splitext(file_name)[0]])
    return endings

In [47]:
# Smoke test of the whole pipeline: scrape the matches of one team in a
# fixed date range. This hits the network and, inside scrape_match_data,
# sleeps after every parsed match, so the cell is slow to re-run.

# Team name must equal a `team_name` entry in the table get_teamID reads.
team_name = 'TyLoo'
# Dates are inserted unchanged into the results URL as startDate / endDate.
startDate ='2017-08-01'
endDate ='2017-10-01'

matches = scrape_match_data(team_name, startDate, endDate)

['https://www.hltv.org/matches/2314604/tyloo-vs-flash-wesg-2017-china-finals']
match 0 done


In [20]:
# Display the list returned by scrape_match_data (one dict per match).
matches

[{'stats_url': 'https://www.hltv.org/stats/matches/47764/tyloo-vs-flash',
  'teams': [                            TyLoo    K-D  +/-   ADR   KAST  Rating2.0
   0  HaoWen 'somebody' Xu  somebody  70-52   18  88.8  74.0%       1.28
   1             Zhen 'HZ' Huang  HZ  53-52    1  78.0  76.6%       1.09
   2                 Ke 'Mo' Liu  Mo  55-53    2  75.8  67.5%       1.08
   3        ZhenDong 'Not7' Mo  Not7  42-57  -15  60.4  67.5%       0.83
   4                 Hui 'DD' Wu  DD  38-59  -21  66.3  57.1%       0.75,
                                     Flash    K-D  +/-   ADR   KAST  Rating2.0
   0  YuanZhang 'AttackeR' Sheng  AttackeR  69-46   23  92.3  76.6%       1.41
   1            YuLun 'fancy1' Cai  fancy1  60-54    6  84.5  76.6%       1.23
   2           KunHua 'LOVEYY' Bai  LOVEYY  51-49    2  74.8  68.8%       1.08
   3              QiFang 'Karsa' Su  Karsa  52-54   -2  79.5  76.6%       1.05
   4           WeiJia 'INNOPY' Guo  INNOPY  41-55  -14  58.8  79.2%       0.90],
  