In [1]:
import cspython.scraper
import pandas as pd
import numpy as np
import uuid
import sys
sys.setrecursionlimit(5000)

In [2]:
series = cspython.scraper.scrape_series_data('BIG', '2017-01-01', '2017-02-01', verbose=False)

match 0 done
match 1 done
match 2 done
match 3 done
match 4 done
match 5 done
match 6 done
match 7 done


In [3]:
import cPickle as pkl
with open('test_pkl.pkl', 'wb') as f:
    pkl.dump(series, f)

Each element of list returned by scrape_series_data is a dictionary containing data 1 series played by the team. They are in reverse chornological order. The elements of each series dictionary are:

    url:        the url on hltv of the Overview of the entire series
    demo_url:   the url of the demo, hosted on hltv later
    stats_url:  a url containing more detailed data about the data 
                (the scrapped data from this page is contained in stats_data)
    teams:      a dataframe containing the overall stats for each team in the series
    vetos:      the vetos in chronological order
    match_info: score and map name for each match
    team_a_b:   this provides order to the teams and match_info scores 
    

In [4]:
def match_score_dataframe(series, series_id=None):
    score = pd.DataFrame(columns = ['series_id', 'match_id', 'map', 'winner', series['team_a_b'][0], 
                                    series['team_a_b'][1]], 
                         index=range(len(series['match_info'])))
    for idx, match in enumerate(series['match_info']):
        
        map_name = match['map_name']
        score_a = int(match['scores'][0])
        score_b = int(match['scores'][1])
        if score_a > score_b:
            winner = series['team_a_b'][0]
        elif score_a < score_b:
            winner = series['team_a_b'][1]
        else:
            winner = 'draw'
        match_id = uuid.uuid4()
        score.loc[idx, :]  = series_id, match_id, map_name, winner, score_a, score_b
        
        
    score.loc[:,'match_num'] = range(1,len(score) + 1)
    
    return score

In [31]:
def round_by_round_dataframe(match_stats_data, map_name, match_id, series_id):
    winners = create_winner_column(match_stats_data['team_scores'])
    team_ending_df = create_team_ending_df(match_stats_data['team_endings'], winners)
    raw = pd.concat([team_ending_df, winners], axis=1)
    final = pd.DataFrame(columns = ['map', 'round_num', 'half'], index=raw.index)
    final.loc[:,'map'] = map_name
    final.loc[:,'round_num']=raw.index
    final.ix[:15, 'half'] = 1
    final.ix[15:, 'half'] = 2
    final.loc[:, 'match_id'] = match_id
    final.loc[:, 'series_id'] = series_id
    return pd.concat([final, raw], axis=1)

In [6]:
def create_winner_column(team_scores):
    team_a = pd.Series(team_scores['team_a']).apply(lambda x: x[0])
    team_b = pd.Series(team_scores['team_b']).apply(lambda x: x[0])
    raw_scores = pd.concat([team_a, team_b], axis=1)
    raw_scores = raw_scores.loc[(raw_scores.loc[:,0]!='')|(raw_scores.loc[:,1]!=''), :]
    
    team_a = raw_scores.iloc[0,0]
    team_b = raw_scores.iloc[0,1]
    
    raw_scores = raw_scores.iloc[1:,:]
    
    winner_col = pd.DataFrame(columns=['winner', team_a + '_wins', team_b + '_wins'], index=raw_scores.index)
    w_a=0
    w_b=0
    for idx, row in raw_scores.iterrows():
        if row.iloc[0]=='':
            winner = team_b
            w_b+=1
        else:
            winner = team_a
            w_a+=1
        winner_col.loc[idx,['winner', team_a + '_wins', team_b + '_wins']] = winner, w_a, w_b
    return winner_col

def create_team_ending_df(team_endings, winners):
    team_a_name, team_b_name = winners.columns[1][:-5], winners.columns[2][:-5]
    
    team_a = pd.Series(team_endings['team_a']).apply(lambda x: x[0])
    team_b = pd.Series(team_endings['team_b']).apply(lambda x: x[0])
    raw_endings = pd.concat([team_a, team_b], axis=1)
    raw_endings = raw_endings.loc[(raw_endings.loc[:,0]!='emptyHistory')|(raw_endings.loc[:,1]!='emptyHistory'), :]
    raw_endings = raw_endings.iloc[1:,:]
    endings = pd.DataFrame(columns=['ending', 'CT', 'T', 'side_winner'], index=raw_endings.index)
    
    if (('t_win' in team_a.iloc[:15]) 
        or ('ct_win' in team_a.iloc[:15])
        or ('ct_win' in team_a.iloc[15:]) 
        or ('t_win' in team_a.iloc[15:])):
        endings.ix[:15, 'T'] = team_b_name
        endings.ix[:15, 'CT'] = team_a_name
        endings.ix[15:, 'T'] = team_a_name
        endings.ix[15:, 'CT'] = team_b_name
    else:
        endings.ix[:15, 'T'] = team_a_name
        endings.ix[:15, 'CT'] = team_b_name
        endings.ix[15:, 'T'] = team_b_name
        endings.ix[15:, 'CT'] = team_a_name
        
    endings.loc[:,'ending'] = raw_endings.apply(lambda x: x.iloc[0] if x.iloc[0] != 'emptyHistory' else x.iloc[1], axis=1)
    endings.loc[:, 'side_winner'] = endings.apply(lambda x: 'T' if x.loc['T'] == winners.loc[x.name, 'winner'] else 'CT', axis=1)
    return endings

In [7]:
def series_overview_dataframe(all_series):
    all_series_overview = pd.DataFrame(columns=['id', 'date', 'team_a', 
                                                'team_b', 'url', 'stats_url',
                                               'demo_url'])
    for s in all_series:
        demo_url = s['demo_url']
        stats_url = s['stats_url']
        url = s['url']
        date = s['stats_data'][0]['match_time']
        team_a = s['team_a_b'][0]
        team_b = s['team_a_b'][1]
        serires_uuid = uuid.uuid4()
        all_series_overview.loc[len(all_series_overview), :]= (
        serires_uuid, date, team_a, team_b, url, stats_url, demo_url )
        
    return all_series_overview
        

In [29]:
def process_scrapped(all_series):
    overview = series_overview_dataframe(all_series)
    series_data = {}
     
    for s_id, s in zip(overview.id, all_series):
        match_overview = match_score_dataframe(s, s_id)
        map_pool = s['map_pool']
        vetos = s['vetos']
        matches=[]
        for m_id, m, map_name in zip(match_overview.match_id, s['stats_data'], match_overview.map):
            df = round_by_round_dataframe(m, map_name, m_id, s_id)
            matches.append(df)
            
        series_data[s_id] = {
                      'match_overview': match_overview,
                      'map_pool': map_pool,
                      'vetos': vetos,
                      'teams': s['teams'],
                      'matches':matches}
    return overview, series_data

In [32]:
overview, series_data = process_scrapped(series)

In [34]:
overview.to_pickle('BIG_overview.pkl')

In [36]:
import cPickle as pkl
with open('BIG_Series_data.pkl', 'wb') as f:
    pkl.dump(series_data, f)

In [69]:
round_by_round_dataframe(series[0]['stats_data'][0], 'Dust')

Unnamed: 0,map,round_num,half,ending,CT,T,side_winner,winner,BIG_wins,iNation_wins
1,Dust,1,1,t_win,iNation,BIG,T,BIG,1,0
2,Dust,2,1,bomb_exploded,iNation,BIG,T,BIG,2,0
3,Dust,3,1,bomb_exploded,iNation,BIG,T,BIG,3,0
4,Dust,4,1,t_win,iNation,BIG,T,BIG,4,0
5,Dust,5,1,ct_win,iNation,BIG,CT,iNation,4,1
6,Dust,6,1,ct_win,iNation,BIG,CT,iNation,4,2
7,Dust,7,1,bomb_defused,iNation,BIG,CT,iNation,4,3
8,Dust,8,1,ct_win,iNation,BIG,CT,iNation,4,4
9,Dust,9,1,ct_win,iNation,BIG,CT,iNation,4,5
10,Dust,10,1,ct_win,iNation,BIG,CT,iNation,4,6


In [37]:
pd.__version__

u'0.18.1'

In [38]:
with open('BIG_Series_data.pkl', 'rb') as f:
    d = pkl.load(f)

In [39]:
d

{UUID('1ecd175d-500c-458e-8de3-33275bf21c00'): {'map_pool': [u'Cache',
   u'Dust2',
   u'Mirage',
   u'Nuke',
   u'Train',
   u'Cobblestone',
   u'Overpass'],
  'match_overview':                               series_id                              match_id  \
  0  1ecd175d-500c-458e-8de3-33275bf21c00  ed4f85e0-aa19-4345-906e-3166014c9927   
  1  1ecd175d-500c-458e-8de3-33275bf21c00  085ff490-959c-4bbd-ad18-7e9fdb29e6e8   
  
        map winner BIG Rogue  match_num  
  0  Mirage    BIG  16    12          1  
  1   Cache    BIG  16     6          2  ,
  'matches': [       map  round_num half                              match_id  \
   1   Mirage          1    1  ed4f85e0-aa19-4345-906e-3166014c9927   
   2   Mirage          2    1  ed4f85e0-aa19-4345-906e-3166014c9927   
   3   Mirage          3    1  ed4f85e0-aa19-4345-906e-3166014c9927   
   4   Mirage          4    1  ed4f85e0-aa19-4345-906e-3166014c9927   
   5   Mirage          5    1  ed4f85e0-aa19-4345-906e-3166014c9927   
   6   

In [40]:
overview

Unnamed: 0,id,date,team_a,team_b,url,stats_url,demo_url
0,9f427404-0a0f-42f3-a030-776cbb1ec929,2017-01-30 18:00:00,BIG,iNation,https://www.hltv.org/matches/2307616/big-vs-in...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25537
1,3081c02d-2747-4979-bec6-b835d1e69dd6,2017-01-30 17:00:00,BIG,iGame.com,https://www.hltv.org/matches/2307615/big-vs-ig...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25536
2,3fb3fbd0-0812-4513-a7cd-450c72facd47,2017-01-15 14:45:00,FlipSid3,BIG,https://www.hltv.org/matches/2307440/flipsid3-...,https://www.hltv.org/stats/matches/41029/flips...,https://www.hltv.org/download/demo/25576
3,6896338b-bacb-4405-aaf2-3b79f8b323a3,2017-01-15 11:20:00,Heroic,BIG,https://www.hltv.org/matches/2307438/heroic-vs...,https://www.hltv.org/stats/matches/41028/heroi...,https://www.hltv.org/download/demo/25575
4,f130d28e-adc9-42aa-bbed-d0b7688f474f,2017-01-14 19:30:00,Vega Squadron,BIG,https://www.hltv.org/matches/2307437/vega-squa...,https://www.hltv.org/stats/matches/41022/vega-...,https://www.hltv.org/download/demo/25573
5,1ecd175d-500c-458e-8de3-33275bf21c00,2017-01-14 13:20:00,BIG,Rogue,https://www.hltv.org/matches/2307435/big-vs-ro...,https://www.hltv.org/stats/matches/41020/big-v...,https://www.hltv.org/download/demo/25571
6,53e95bb9-91bd-42d8-b7cf-59c4bdc2293b,2017-01-13 14:30:00,FlipSid3,BIG,https://www.hltv.org/matches/2307429/flipsid3-...,https://www.hltv.org/stats/matches/mapstatsid/...,https://www.hltv.org/download/demo/25565
7,b200996b-da5f-47e0-95e9-07fc7359811a,2017-01-09 17:00:00,mousesports,BIG,https://www.hltv.org/matches/2307302/mousespor...,https://www.hltv.org/stats/matches/40917/mouse...,https://www.hltv.org/download/demo/25558
