In [50]:
import cspython.scraper
import pandas as pd
import numpy as np
import uuid
import sys
sys.setrecursionlimit(15000)

In [2]:
series = cspython.scraper.scrape_series_data('BIG', '2017-01-01', '2017-02-01', verbose=False)

match 0 done
match 1 done
match 2 done
match 3 done
match 4 done
match 5 done
match 6 done
match 7 done


In [8]:
series[2]['team_scoreboards'][0]

[                                  FlipSid3    K-D  +/-    ADR   KAST  \
 0  Denis 'electronic' Sharipov  electronic  28-16   12  105.1  77.8%   
 1      Jan 'wayLander' Rahkonen  wayLander  26-19    7   94.7  74.1%   
 2    Yegor 'markeloff' Markelov  markeloff  18-17    1   77.4  74.1%   
 3     Georgi 'WorldEdit' Yaskin  WorldEdit  17-17    0   68.1  74.1%   
 4        Andrey 'B1ad3' Gorodenskiy  B1ad3  15-19   -4   70.9  77.8%   
 
    Rating2.0  
 0       1.60  
 1       1.43  
 2       1.10  
 3       1.08  
 4       1.00  ,
                                   FlipSid3    K-D  +/-   ADR   KAST  Rating2.0
 0  Denis 'electronic' Sharipov  electronic  26-15   11  99.3  85.2%       1.54
 1    Yegor 'markeloff' Markelov  markeloff  20-13    7  79.9  74.1%       1.25
 2        Andrey 'B1ad3' Gorodenskiy  B1ad3  21-19    2  70.2  77.8%       1.21
 3     Georgi 'WorldEdit' Yaskin  WorldEdit  22-18    4  78.1  74.1%       1.07
 4      Jan 'wayLander' Rahkonen  wayLander  14-18   -4  65.9  

Each element of list returned by scrape_series_data is a dictionary containing data 1 series played by the team. They are in reverse chornological order. The elements of each series dictionary are:

    url:        the url on hltv of the Overview of the entire series
    demo_url:   the url of the demo, hosted on hltv later
    stats_url:  a url containing more detailed data about the data 
                (the scrapped data from this page is contained in stats_data)
    teams:      a dataframe containing the overall stats for each team in the series
    vetos:      the vetos in chronological order
    match_info: score and map name for each match
    team_a_b:   this provides order to the teams and match_info scores 
    

In [42]:
def match_score_dataframe(series, series_id=None):
    score = pd.DataFrame(columns = ['series_id', 'match_id', 'map', 'winner', series['team_a_b'][0], 
                                    series['team_a_b'][1]], 
                         index=range(len(series['match_info'])))
    for idx, match in enumerate(series['match_info']):
        
        map_name = match['map_name']
        score_a = int(match['scores'][0])
        score_b = int(match['scores'][1])
        if score_a > score_b:
            winner = series['team_a_b'][0]
        elif score_a < score_b:
            winner = series['team_a_b'][1]
        else:
            winner = 'draw'
        match_id = uuid.uuid4()
        score.loc[idx, :]  = series_id, match_id, map_name, winner, score_a, score_b
        
        
    score.loc[:,'match_num'] = range(1,len(score) + 1)
    
    return score

In [43]:
def round_by_round_dataframe(match_stats_data, map_name, match_id, series_id):
    winners = create_winner_column(match_stats_data['team_scores'])
    team_ending_df = create_team_ending_df(match_stats_data['team_endings'], winners)
    raw = pd.concat([team_ending_df, winners], axis=1)
    final = pd.DataFrame(columns = ['map', 'round_num', 'half'], index=raw.index)
    final.loc[:,'map'] = map_name
    final.loc[:,'round_num']=raw.index
    final.ix[:16, 'half'] = 1
    final.ix[16:, 'half'] = 2
    final.loc[:, 'match_id'] = match_id
    final.loc[:, 'series_id'] = series_id
    return pd.concat([final, raw], axis=1)

In [44]:
def create_winner_column(team_scores):
    team_a = pd.Series(team_scores['team_a']).apply(lambda x: x[0])
    team_b = pd.Series(team_scores['team_b']).apply(lambda x: x[0])
    raw_scores = pd.concat([team_a, team_b], axis=1)
    raw_scores = raw_scores.loc[(raw_scores.loc[:,0]!='')|(raw_scores.loc[:,1]!=''), :]
    
    team_a = raw_scores.iloc[0,0]
    team_b = raw_scores.iloc[0,1]
    
    raw_scores = raw_scores.iloc[1:,:]
    
    winner_col = pd.DataFrame(columns=['winner', team_a + '_wins', team_b + '_wins'], index=raw_scores.index)
    w_a=0
    w_b=0
    for idx, row in raw_scores.iterrows():
        if row.iloc[0]=='':
            winner = team_b
            w_b+=1
        else:
            winner = team_a
            w_a+=1
        winner_col.loc[idx,['winner', team_a + '_wins', team_b + '_wins']] = winner, w_a, w_b
    return winner_col

def create_team_ending_df(team_endings, winners):
    team_a_name, team_b_name = winners.columns[1][:-5], winners.columns[2][:-5]
    
    team_a = pd.Series(team_endings['team_a']).apply(lambda x: x[0])
    team_b = pd.Series(team_endings['team_b']).apply(lambda x: x[0])
    raw_endings = pd.concat([team_a, team_b], axis=1)
    raw_endings = raw_endings.loc[(raw_endings.loc[:,0]!='emptyHistory')|(raw_endings.loc[:,1]!='emptyHistory'), :]
    raw_endings = raw_endings.iloc[1:,:]
    endings = pd.DataFrame(columns=['ending', 'CT', 'T', 'side_winner'], index=raw_endings.index)
    
    if (('t_win' in team_a.iloc[:16]) 
        or ('ct_win' in team_a.iloc[:16])
        or ('ct_win' in team_a.iloc[16:]) 
        or ('t_win' in team_a.iloc[16:])):
        endings.ix[:15, 'T'] = team_b_name
        endings.ix[:15, 'CT'] = team_a_name
        endings.ix[15:, 'T'] = team_a_name
        endings.ix[15:, 'CT'] = team_b_name
    else:
        endings.ix[:16, 'T'] = team_a_name
        endings.ix[:16, 'CT'] = team_b_name
        endings.ix[16:, 'T'] = team_b_name
        endings.ix[16:, 'CT'] = team_a_name
        
    endings.loc[:,'ending'] = raw_endings.apply(lambda x: x.iloc[0] if x.iloc[0] != 'emptyHistory' else x.iloc[1], axis=1)
    endings.loc[:, 'side_winner'] = endings.apply(lambda x: 'T' if x.loc['T'] == winners.loc[x.name, 'winner'] else 'CT', axis=1)
    return endings

In [45]:
def series_overview_dataframe(all_series):
    all_series_overview = pd.DataFrame(columns=['id', 'date', 'team_a', 
                                                'team_b', 'url', 'stats_url',
                                               'demo_url'])
    for s in all_series:
        demo_url = s['demo_url']
        stats_url = s['stats_url']
        url = s['url']
        date = s['stats_data'][0]['match_time']
        team_a = s['team_a_b'][0]
        team_b = s['team_a_b'][1]
        serires_uuid = uuid.uuid4()
        all_series_overview.loc[len(all_series_overview), :]= (
        serires_uuid, date, team_a, team_b, url, stats_url, demo_url )
        
    return all_series_overview
        

In [46]:
def process_scrapped(all_series):
    overview = series_overview_dataframe(all_series)
    series_data = {}
     
    for s_id, s in zip(overview.id, all_series):
        match_overview = match_score_dataframe(s, s_id)
        map_pool = s['map_pool']
        vetos = s['vetos']
        matches=[]
        for m_id, m, map_name in zip(match_overview.match_id, s['stats_data'], match_overview.map):
            df = round_by_round_dataframe(m, map_name, m_id, s_id)
            matches.append(df)
            
        series_data[s_id] = {
                      'match_overview': match_overview,
                      'map_pool': map_pool,
                      'vetos': vetos,
                      'scoreboards': s['team_scoreboards'],
                      'matches':matches}
    return overview, series_data

In [32]:
overview, series_data = process_scrapped(series)

In [34]:
overview.to_pickle('BIG_overview.pkl')

In [36]:
import cPickle as pkl
with open('BIG_Series_data.pkl', 'wb') as f:
    pkl.dump(series_data, f)

In [69]:
round_by_round_dataframe(series[0]['stats_data'][0], 'Dust')

Unnamed: 0,map,round_num,half,ending,CT,T,side_winner,winner,BIG_wins,iNation_wins
1,Dust,1,1,t_win,iNation,BIG,T,BIG,1,0
2,Dust,2,1,bomb_exploded,iNation,BIG,T,BIG,2,0
3,Dust,3,1,bomb_exploded,iNation,BIG,T,BIG,3,0
4,Dust,4,1,t_win,iNation,BIG,T,BIG,4,0
5,Dust,5,1,ct_win,iNation,BIG,CT,iNation,4,1
6,Dust,6,1,ct_win,iNation,BIG,CT,iNation,4,2
7,Dust,7,1,bomb_defused,iNation,BIG,CT,iNation,4,3
8,Dust,8,1,ct_win,iNation,BIG,CT,iNation,4,4
9,Dust,9,1,ct_win,iNation,BIG,CT,iNation,4,5
10,Dust,10,1,ct_win,iNation,BIG,CT,iNation,4,6


In [37]:
pd.__version__

u'0.18.1'

In [38]:
with open('BIG_Series_data.pkl', 'rb') as f:
    d = pkl.load(f)

In [17]:
test = pd.Series([['hi'] for i in range(10)])
test

0    [hi]
1    [hi]
2    [hi]
3    [hi]
4    [hi]
5    [hi]
6    [hi]
7    [hi]
8    [hi]
9    [hi]
dtype: object

In [20]:
print test[0]
print test.apply(lambda x: x[0])

['hi']
0    hi
1    hi
2    hi
3    hi
4    hi
5    hi
6    hi
7    hi
8    hi
9    hi
dtype: object


In [19]:
test.apply(lambda x: x[0])

0    hi
1    hi
2    hi
3    hi
4    hi
5    hi
6    hi
7    hi
8    hi
9    hi
dtype: object

In [40]:
overview

Unnamed: 0,id,date,team_a,team_b,url,stats_url,demo_url
0,9f427404-0a0f-42f3-a030-776cbb1ec929,2017-01-30 18:00:00,BIG,iNation,https://www.hltv.org/matches/2307616/big-vs-in...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25537
1,3081c02d-2747-4979-bec6-b835d1e69dd6,2017-01-30 17:00:00,BIG,iGame.com,https://www.hltv.org/matches/2307615/big-vs-ig...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25536
2,3fb3fbd0-0812-4513-a7cd-450c72facd47,2017-01-15 14:45:00,FlipSid3,BIG,https://www.hltv.org/matches/2307440/flipsid3-...,https://www.hltv.org/stats/matches/41029/flips...,https://www.hltv.org/download/demo/25576
3,6896338b-bacb-4405-aaf2-3b79f8b323a3,2017-01-15 11:20:00,Heroic,BIG,https://www.hltv.org/matches/2307438/heroic-vs...,https://www.hltv.org/stats/matches/41028/heroi...,https://www.hltv.org/download/demo/25575
4,f130d28e-adc9-42aa-bbed-d0b7688f474f,2017-01-14 19:30:00,Vega Squadron,BIG,https://www.hltv.org/matches/2307437/vega-squa...,https://www.hltv.org/stats/matches/41022/vega-...,https://www.hltv.org/download/demo/25573
5,1ecd175d-500c-458e-8de3-33275bf21c00,2017-01-14 13:20:00,BIG,Rogue,https://www.hltv.org/matches/2307435/big-vs-ro...,https://www.hltv.org/stats/matches/41020/big-v...,https://www.hltv.org/download/demo/25571
6,53e95bb9-91bd-42d8-b7cf-59c4bdc2293b,2017-01-13 14:30:00,FlipSid3,BIG,https://www.hltv.org/matches/2307429/flipsid3-...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25565
7,b200996b-da5f-47e0-95e9-07fc7359811a,2017-01-09 17:00:00,mousesports,BIG,https://www.hltv.org/matches/2307302/mousespor...,https://www.hltv.org/stats/matches/40917/mouse...,https://www.hltv.org/download/demo/25558


In [37]:
team_name = 'BIG'
startDate = '2017-07-19'
endDate = '2017-07-22'


bad_series = cspython.scraper.scrape_series_data(team_name, startDate, endDate, verbose=True, pkl_save=False)

https://www.hltv.org/results?offset=0&content=demo&team=7532&startDate=2017-07-19&endDate=2017-07-22


KeyboardInterrupt: 

In [38]:
bad_series = [bad_series]

In [47]:
ov, d = process_scrapped(bad_series)

In [48]:
d

{UUID('2a6ea5c6-5214-4b07-9840-963c85298ee9'): {'map_pool': [u'Cache',
   u'Mirage',
   u'Inferno',
   u'Nuke',
   u'Train',
   u'Cobblestone',
   u'Overpass'],
  'match_overview':                               series_id                              match_id  \
  0  2a6ea5c6-5214-4b07-9840-963c85298ee9  3a17c507-cd55-40ef-ba66-0959e247f718   
  1  2a6ea5c6-5214-4b07-9840-963c85298ee9  06432f41-32d3-4141-aaec-a99d41a4d303   
  2  2a6ea5c6-5214-4b07-9840-963c85298ee9  33800a4b-8bb5-4066-995e-07c74784d43d   
  
             map     winner BIG Immortals  match_num  
  0  Cobblestone        BIG  19        17          1  
  1      Inferno  Immortals   7        16          2  
  2        Train  Immortals  14        16          3  ,
  'matches': [            map  round_num half                              match_id  \
   1   Cobblestone          1    1  3a17c507-cd55-40ef-ba66-0959e247f718   
   2   Cobblestone          2    1  3a17c507-cd55-40ef-ba66-0959e247f718   
   3   Cobblestone        

In [51]:
import cPickle as pkl
with open('test_pkl.pkl', 'wb') as f:
    pkl.dump(d, f)