In [2]:
import sys, pdb, warnings, scipy, matplotlib, sklearn

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cPickle as pkl
import seaborn as sns

from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import cross_validation #might be model_selection <--- this is the new one
from sklearn.preprocessing import Normalizer
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFECV

warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
sys.setrecursionlimit(15000)
%matplotlib inline

print('scipy: {}'.format(scipy.__version__))
print('numpy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('matplotlib: {}'.format(matplotlib.__version__)) 
print('sklearn: {}'.format(sklearn.__version__))

#our modules see: CS_Project/cspython directory
from cspython.scraper import modifiedSoup
from cspython.data_processing import process_scrapped
import cspython.analysis as a

#import xlrd
#import xgboost as xgb



scipy: 0.19.1
numpy: 1.13.3
pandas: 0.22.0
matplotlib: 2.1.0
sklearn: 0.19.1


# Purpose of this notebook: Create a dataset of aggregate team statistics with a resolution of 1 row per team per match

   ## There are 4 sections:
       1 Data Processing functions, which complete the processing step of the raw scraped data
       2 Data Agggregation functions, which aggregate the data to the team level for each match
       3 Data set creation: calls the functions from the previous sections to create the data
           note: the data are temporaly related, the statistics in every row are based on the 
           current match, and all previous matches (for which we have data)
       4 Create X y functions, which will be used to create test train sets for cross validation
    

## 1. Data Processing functions

    creates a giant dataframe containing all of the information scraped by the scraper

In [88]:
def combine_dfs(big_data, overview):
    first=True
    cols = ['map','round_num','half','match_id','series_id','ending','CT','T','side_winner','winner','team_A','team_B','team_A_score','team_B_score','match_num','team_players','K-D','+/-','ADR','KAST','Rating2.0','nicknames']
    dfs = []
    for idx, series_data in big_data.iteritems():
        series_data_m = merge_matches(series_data)
        series_data_m["date"] = overview.loc[overview.id == idx, 'date']
        series_data_mo = merge_overview(series_data_m, series_data)
        series_data_mos = merge_scoreboards(series_data_mo, series_data)
        series_data_mosm = match_data_board_changer(series_data_mos, series_data)
        dfs.append(series_data_mosm)    
        new_cols = list(set(series_data_mosm.columns) -set(cols))
        cols += new_cols
    data = pd.concat(dfs)
    data = data.loc[:,cols]
    data = data.reset_index()
    return data

def merge_matches(series_data):
    for d in range(0,len(series_data['matches'])):
        if d == 0:
            series_data['matches'][d] = series_data['matches'][d].rename(index = str, columns={ series_data['matches'][d].columns[10] : "team_A", series_data['matches'][d].columns[11] : "team_B" })
            series_data_m = series_data['matches'][d]
        else:
            series_data['matches'][d] = series_data['matches'][d].rename(index = str, columns={ series_data['matches'][d].columns[10] : "team_A", series_data['matches'][d].columns[11] : "team_B" })
            series_data_m = pd.concat([series_data_m, series_data['matches'][d]])
    
    return series_data_m
    #should work for concact the matches together
    
def merge_overview(series_data_m, series_data):
    series_data['match_overview']['team_A_name'] = series_data['match_overview'].columns[4]
    series_data['match_overview']['team_B_name'] = series_data['match_overview'].columns[5]
    
    series_data['match_overview'].loc[(series_data['match_overview']['winner'] == series_data['match_overview'].columns[4]),'loser_of_match'] = series_data['match_overview'].team_B_name
    series_data['match_overview'].loc[(series_data['match_overview']['winner'] != series_data['match_overview'].columns[4]),'loser_of_match'] = series_data['match_overview'].team_A_name
    series_data['match_overview'] = series_data['match_overview'].rename(index = str, columns ={series_data['match_overview'].columns[3]: "winner_of_match",series_data['match_overview'].columns[4]: "team_A_score",series_data['match_overview'].columns[5]: "team_B_score"})
    
    
    series_data_mo = pd.merge(series_data_m, series_data['match_overview'], on=['match_id', 'map', 'series_id'])
    
    return series_data_mo
#works at merging matches with match_overviewb

def merge_scoreboards(series_data_mo, series_data):    
   
    for i in range(len(series_data['scoreboards'][0])):
        series_data['scoreboards'][0][i]['match_num'] = i+1
        #pdb.set_trace()
        series_data['scoreboards'][0][i]['player_team_name'] = series_data_mo.loc[(series_data_mo['match_num']== i+1),'team_A_name'].unique()[0]
        series_data['scoreboards'][0][i] = series_data['scoreboards'][0][i].rename(index = str, columns={ series_data['scoreboards'][0][i].columns[0] : "team_players" })
        
        series_data['scoreboards'][1][i]['match_num'] = i+1
        series_data['scoreboards'][1][i]['player_team_name'] = series_data_mo.loc[(series_data_mo['match_num']== i+1),'team_B_name'].unique()[0]
        series_data['scoreboards'][1][i] = series_data['scoreboards'][1][i].rename(index = str, columns={ series_data['scoreboards'][1][i].columns[0] : "team_players"})
        
        new_df = pd.concat([series_data['scoreboards'][0][i], series_data['scoreboards'][1][i]])
        
        if i == 0:
            con_df = new_df
        else:
            con_df = pd.concat([con_df, new_df])
   
    series_data_mos = pd.merge(series_data_mo, con_df, how='outer', on='match_num')
    series_data_mos['nicknames'] = series_data_mos['team_players'].str.split(expand = True)[3]
    return series_data_mos

       
def match_data_board_changer(series_data_mos,series_data):
    board_name = ['first_kills','who_kill_who', 'awp_kills']
    for idx, a in enumerate(series_data['match_data']):
        new_df = pd.DataFrame()
        for idx1, c in enumerate(board_name):    
            new_board_c = pd.DataFrame()
            new_board_r= pd.DataFrame()
            names_c = a[c].set_index('Unnamed: 0').columns
            for b in names_c:
                new_board_c[b+'_'+c] = a[c].set_index('Unnamed: 0')[b].str.split(pat = ':', expand = True)[0]
               
            names_r = a[c].set_index('Unnamed: 0').T.columns
            for b in names_r:
                new_board_r[b+'_'+c] = a[c].set_index('Unnamed: 0').T[b].str.split(pat = ':', expand = True)[1]
            new_board_c['nicknames'] = new_board_c.index
            new_board_r['nicknames'] = new_board_r.index
            
            board_df = new_board_c.append(new_board_r)
            
            if idx1 == 0:
                new_df = board_df
            else:
                new_df = pd.merge(new_df, board_df, on = 'nicknames')
        if idx == 0:
            new_df['match_num'] = 1+idx
            con_df = new_df
        else:
            new_df['match_num'] = 1+idx
            try:
                con_df = con_df.append(new_df, ignore_index=True)
            except:
                print con_df.columns
                print new_df.columns
    con_df = con_df.loc[:, ~con_df.columns.duplicated()]
    series_data_mosm = pd.merge(series_data_mos, con_df, on=['nicknames','match_num'])
    return series_data_mosm


#works at adding match_num to scoreboards

## 2. Aggregation functions
    takes the data created by above functions, where each row is 1 player in 1 match, and aggregates 
    the player data so each row is 1 team in match

In [33]:
def create_fwa_dr_columns(data, col_list):  # first k , awp, who, divided by rounds 
    columns = pd.Series(data.columns)
    for a in col_list:
        col = columns[columns.str.contains(a)]
        data[a+'_sum_dr'] = data[col].convert_objects(convert_numeric = True).sum(axis = 1) / (data['team_A_score'] + data['team_B_score'])
    r = data.loc[:, data.columns != 'date'].apply(pd.to_numeric, errors='ignore')
    r['date'] = data.date
    return r
        
def create_fwadr_his(data, col_list):   # column with historic awp, first ,who_killwho of player vs player. 
    match_id = data.match_id.unique()
    players = data.nicknames.unique()
    for a in col_list:
        grouping = data.groupby(['nicknames', 'match_id'])[a+'_sum_dr'].mean() 
        grouping = pd.DataFrame(grouping)
        grouping = grouping.reset_index()
        grouping = grouping.groupby(['nicknames'])[a+'_sum_dr'].sum()/ grouping.groupby(['nicknames'])[a+'_sum_dr'].count()
        grouping = pd.DataFrame(grouping)
        grouping = grouping.reset_index()
        grouping = grouping.rename(index=str, columns={a +'_sum_dr': a + '_sum_dr_hist'})
        data = pd.merge(data, grouping, on = 'nicknames')
        
    return data

def create_matches_count(data): # how many matches a person has played
    grouping = data.groupby(['nicknames', 'match_id'])['ADR'].count()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.reset_index()
    grouping = grouping.groupby(['nicknames'])['ADR'].count()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.reset_index()
    grouping = grouping.rename(index=str, columns={'ADR': 'matches_played_player'})
    data = pd.merge(data, grouping, on = 'nicknames')
    return data
    
def create_avdamage_his(data):  # column with historic average damage of individual 
    grouping = data.groupby(['nicknames', 'match_id'])['ADR'].mean() 
    grouping = pd.DataFrame(grouping)
    grouping = grouping.reset_index()
    grouping = grouping.groupby(['nicknames'])['ADR'].sum()/ grouping.groupby(['nicknames'])['ADR'].count()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.reset_index()
    grouping = grouping.rename(index=str, columns={'ADR': 'ADR_hist'})
    data = pd.merge(data, grouping, on = 'nicknames')
    
    return data

def create_map_win_loss_his(data):  # team total win and loses on map with total times played on map
    maps = data.map.unique()
    teams = data.player_team_name.unique()
    match_id = data.match_id.unique()
    for a in maps:
        data[a + "_win_his"] = 0
        data[a + "_loss_his"] = 0
        data[a + "_total_played"] = 0
    for a in match_id:
        map_for_match = data.loc[(data['match_id'] == a) ,'map'].unique()
        winner_of_map = data.loc[(data['match_id'] == a), 'winner_of_match'].unique()
        loser_of_map = data.loc[(data['match_id'] == a), 'loser_of_match'].unique()
        data.loc[(data['player_team_name'] == winner_of_map[0]), [map_for_match[0] + "_win_his", map_for_match[0] +'_total_played']] += 1
        data.loc[(data['player_team_name'] == loser_of_map[0]), [map_for_match[0] + "_loss_his", map_for_match[0] +'_total_played']] += 1 
       
    return data


def create_map_win_his_per(data): #percentage team total win and loses on map
    teams = data.player_team_name.unique()
    maps = data.map.unique()
    for a in maps:
        data[a + '_win_perc_map'] = 0
        for b in teams:
            pg = (data.player_team_name == b) 
            data.loc[pg,a + '_win_perc_map'] = data.loc[pg, a + "_win_his"].unique()[0] / float((data.loc[pg, a + "_win_his"].unique()[0] + data.loc[pg, a + "_loss_his"].unique()[0])) 
    data = data.fillna(0)        
    return data    

def create_rounds_won_vs_team_his(data): # team rounds won vs another team
    grouping = data.groupby(['match_id','map','team_A_name','team_B_name', 'team_A_score', 'team_B_score'])['round_num'].count()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.add_suffix('_Count').reset_index()
    grouping = grouping.groupby(['team_A_name', 'team_B_name']).sum()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.add_suffix('_Count').reset_index()
    forward = grouping.team_A_name+grouping.team_B_name
    reverse = grouping.team_B_name+grouping.team_A_name
    for idx, val in enumerate(forward):
        for idx2, val2 in enumerate(reverse):
            if val == val2 and idx < idx2:
                grouping.loc[idx,'team_A_score_Count'] += grouping.loc[idx2,'team_B_score_Count']
                grouping.loc[idx,'team_B_score_Count'] += grouping.loc[idx2,'team_A_score_Count']
                grouping.loc[idx2,'team_B_score_Count'] = grouping.loc[idx,'team_A_score_Count']
                grouping.loc[idx2,'team_A_score_Count'] = grouping.loc[idx,'team_B_score_Count']
            elif val == val2 and idx > idx2:
                grouping.loc[idx2,'team_B_score_Count'] = grouping.loc[idx,'team_A_score_Count']
                grouping.loc[idx2,'team_A_score_Count'] = grouping.loc[idx,'team_B_score_Count']
    
    grouping = grouping.drop('round_num_Count_Count', axis = 1)
    col1 = list(grouping.team_A_name.unique())
    col2  = list(grouping.team_B_name.unique())
    col = col1 + col2
    col = list(set(col))
    data = pd.merge(data,grouping, on=['team_A_name', 'team_B_name']) 
    for a in col:
        data['rd_total_his_'+ a] = 0
        data.loc[(data.player_team_name != a) & (data.team_A_name == a) , 'rd_total_his_'+ a]=data.team_B_score_Count
        data.loc[(data.player_team_name != a) & (data.team_B_name == a) , 'rd_total_his_'+ a]=data.team_A_score_Count
        bgrouping = data.groupby(['player_team_name'])['rd_total_his_'+ a].max()
        bgrouping = pd.DataFrame(bgrouping)
        bgrouping = bgrouping.reset_index()
        data = data.drop('rd_total_his_'+ a, axis = 1)
        data = pd.merge(data, bgrouping, on = 'player_team_name')
       
    return data  

def create_total_team_rd_map_his(data):
    grouping = data.groupby(['map','team_A_name', 'team_A_score'])['round_num'].count()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.add_suffix('_Count').reset_index()
    grouping = grouping.groupby(['map','team_A_name'])[ 'team_A_score'].sum()
    grouping = pd.DataFrame(grouping)
    grouping = grouping.add_suffix('_Count').reset_index()
    fgrouping = data.groupby(['player_team_name','map','team_B_name', 'team_B_score'])['round_num'].count()
    fgrouping = pd.DataFrame(fgrouping)
    fgrouping = fgrouping.add_suffix('_Count').reset_index()
    fgrouping = fgrouping.groupby(['map','team_B_name'])[ 'team_B_score'].sum()
    fgrouping = pd.DataFrame(fgrouping)
    fgrouping = fgrouping.add_suffix('_Count').reset_index()
    fgrouping = fgrouping.rename(index=str, columns={"team_B_name": "team_A_name", 'team_B_score_Count': 'team_A_score_Count'})
    merged = pd.concat([grouping, fgrouping], axis = 0)
    merged.groupby(['map', 'team_A_name'])['team_A_score_Count'].sum()
    merged = pd.DataFrame(merged)
    merged = merged.reset_index(drop = True)
    merged = merged.rename(index=str, columns={"team_A_name": "player_team_name", "team_A_score_Count": 'total_team_rd_map'})
    merged = merged.groupby(['player_team_name', 'map']).sum()
    merged = pd.DataFrame(merged)
    merged = merged.reset_index()
    for a in list(merged.map.unique()):
        merged.loc[:,'total_team_rd_'+ a] = 0
        merged.loc[(merged.loc[:, 'map'] == a), 'total_team_rd_'+ a] = merged.loc[:,'total_team_rd_map']
        ok_map = merged.groupby(['player_team_name'])['total_team_rd_'+ a].max()
        ok_map = pd.DataFrame(ok_map)
        ok_map = ok_map.reset_index()
        data = pd.merge(data, ok_map, on = 'player_team_name')
    return data
    
def create_faw_map_his(data, col_list):
    for a in col_list:
        fk_map = data.groupby(['match_id','nicknames', 'map'])[a+'_sum_dr'].mean()
        fk_map = pd.DataFrame(fk_map)
        fk_map = fk_map.reset_index()
        fk_map = fk_map.groupby(['nicknames', 'map'])[a+'_sum_dr'].mean()
        fk_map = pd.DataFrame(fk_map)
        fk_map = fk_map.reset_index()
        for b in list(fk_map.map.unique()):
            fk_map.loc[:,a+'_'+b +'_dr_hist'] = 0 
            fk_map.loc[(fk_map.loc[:, 'map'] == b), a+'_'+b+'_dr_hist'] = fk_map.loc[:,a + '_sum_dr']
            ok_map = fk_map.groupby(['nicknames'])[a+'_'+b+'_dr_hist'].max()
            ok_map = pd.DataFrame(ok_map)
            ok_map = ok_map.reset_index()
            data = pd.merge(data, ok_map, on = 'nicknames')
    return data

def create_avdamage_map_his(data):# historic average damage of individual for each map
    map_adr = data.groupby(['match_id','nicknames', 'map'])['ADR'].mean()
    map_adr = pd.DataFrame(map_adr)
    map_adr = map_adr.reset_index()
    map_adr = map_adr.groupby(['nicknames', 'map'])['ADR'].mean()
    map_adr = pd.DataFrame(map_adr)
    map_adr = map_adr.reset_index()
    for a in list(map_adr.map.unique()):
        map_adr.loc[:,'ADR_his_'+ a] = 0
        map_adr.loc[(map_adr.loc[:, 'map'] == a), 'ADR_his_'+ a] = map_adr.loc[:,'ADR']
        ok_map = map_adr.groupby(['nicknames'])['ADR_his_'+ a].max()
        ok_map = pd.DataFrame(ok_map)
        ok_map = ok_map.reset_index()
        data = pd.merge(data, ok_map, on = 'nicknames')
    return data

def create_opponent_team_col(data):
    data.loc[:,'player_team_opponent'] = np.nan
    data.loc[(data['team_A_name'] != data['player_team_name']),'player_team_opponent'] = data.loc[:,'team_A_name']
    data.loc[(data['team_B_name'] != data['player_team_name']),'player_team_opponent'] = data.loc[:,'team_B_name']
    return data

In [16]:
def process_slice(data, col_list):
    data = create_fwa_dr_columns(data, col_list)
    data = create_fwadr_his(data,col_list)
    data = create_avdamage_his(data)
    data = create_map_win_loss_his(data)
    data = create_map_win_his_per(data)
    data = create_rounds_won_vs_team_his(data)
    data = create_total_team_rd_map_his(data)
    data = create_avdamage_map_his(data)
    data = create_faw_map_his(data, col_list)
    data = create_matches_count(data)
    data = create_opponent_team_col(data)
    
    data_adv = data.loc[:, 'first_kills_sum_dr_hist':'player_team_opponent']
    data_adv['match_id'] = data['match_id']
    data_adv['player_team_name'] = data['player_team_name']
    data_adv['map'] = data['map']
    data_adv = data_adv.drop(['team_A_score_Count','team_B_score_Count'], axis = 1)
    data_adv = data_adv.groupby(['match_id', 'player_team_name', 'player_team_opponent','map']).mean()
    data_adv = pd.DataFrame(data_adv)
    data_adv = data_adv.reset_index()
    data_adv = data_adv.apply(pd.to_numeric, errors='ignore')
    return data_adv


## 3. Creating the Dataset

In [None]:
with open('../scrapped_data/esl_teams.pkl', 'rb') as f: 
     d = pkl.load(f)

big_data = process_scrapped(d)
overview, big_data = big_data

data = combine_dfs(big_data, overview)
data.to_pickle("johns_dataset.pkl")

#add a date
for idx, row in overview.iterrows():
    series_id = row.id
    date = row.date
    data.loc[data.series_id == series_id, 'date'] = date

# sort dates    
data = data.sort_values('date')



In [7]:
data.to_pickle('player_rows.pkl')

In [44]:
def create_fwa_dr_columns(data, col_list):  # first k , awp, who, divided by rounds 
    columns = pd.Series(data.columns)
    for a in col_list:
        col = columns[columns.str.contains(a)]
        data[a+'_sum_dr'] = data[col].convert_objects(convert_numeric = True).sum(axis = 1) / (data['team_A_score'] + data['team_B_score'])
    r = data.loc[:, data.columns != 'date'].apply(pd.to_numeric, errors='ignore')
    r['date'] = data.date
    return data

In [None]:
#profiling the aggregation steps
import cProfile
import pstats

with open('../scrapped_data/esl_teams.pkl', 'rb') as f: 
     d = pkl.load(f)

big_data = process_scrapped(d)
overview, big_data = big_data

cProfile.run("data = combine_dfs(big_data, overview)", 'processing_results')

results = pstats.Stats('processing_results')
results.sort_stats('cumulative').print_stats(10)

"""
test_slice = data.loc[data.date==data.date.min(),:]
cProfile.run('process_slice(test_slice, col_list)')
cProfile.run('create_fwa_dr_columns(test_slice, col_list)', 'profile_results')
"""

In [94]:
with open('../scrapped_data/esl_teams.pkl', 'rb') as f: 
     d = pkl.load(f)

big_data = process_scrapped(d)
overview, big_data = big_data

def combine_dfs(big_data, overview):
    first=True
    cols = ['map','round_num','half','match_id','series_id','ending','CT','T','side_winner','winner','team_A','team_B','team_A_score','team_B_score','match_num','team_players','K-D','+/-','ADR','KAST','Rating2.0','nicknames']
    dfs = []
    for idx, series_data in big_data.iteritems():
        series_data_m = merge_matches(series_data)
        series_data_m["date"] = overview.loc[overview.id == idx, 'date']
        series_data_mo = merge_overview(series_data_m, series_data)
        series_data_mos = merge_scoreboards(series_data_mo, series_data)
        return series_data_mos, series_data
        series_data_mosm = match_data_board_changer(series_data_mos, series_data)
        dfs.append(series_data_mosm)    
        new_cols = list(set(series_data_mosm.columns) -set(cols))
        cols += new_cols
    data = pd.concat(dfs)
    data = data.loc[:,cols]
    data = data.reset_index()
    return data


series_data_mos, series_data = combine_dfs(big_data, overview)

In [96]:
cProfile.run("match_data_board_changer(series_data_mos, series_data)").sort_values('cum_time')

         95706 function calls (94263 primitive calls) in 0.088 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001    0.088    0.088 <ipython-input-88-e1af2b42fd6e>:70(match_data_board_changer)
        1    0.000    0.000    0.088    0.088 <string>:1(<module>)
      349    0.000    0.000    0.000    0.000 __init__.py:189(iteritems)
        3    0.000    0.000    0.002    0.001 _decorators.py:125(wrapper)
        3    0.000    0.000    0.000    0.000 _methods.py:34(_prod)
      284    0.000    0.000    0.001    0.000 _methods.py:37(_any)
      208    0.000    0.000    0.001    0.000 _methods.py:40(_all)
       45    0.000    0.000    0.000    0.000 _validators.py:221(validate_bool_kwarg)
        3    0.000    0.000    0.000    0.000 _validators.py:230(validate_axis_style_args)
      366    0.000    0.000    0.000    0.000 _weakrefset.py:70(__contains__)
      365    0.000    0.000    0.001    0.000 ab

        3    0.000    0.000    0.000    0.000 concat.py:504(_maybe_check_integrity)
        3    0.000    0.000    0.000    0.000 concat.py:512(_concat_indexes)
        3    0.000    0.000    0.000    0.000 concat.py:89(_get_frame_result_type)
       27    0.000    0.000    0.000    0.000 concat.py:95(<genexpr>)
        9    0.000    0.000    0.000    0.000 copy.py:113(_copy_with_constructor)
     24/6    0.000    0.000    0.000    0.000 copy.py:145(deepcopy)
        6    0.000    0.000    0.000    0.000 copy.py:198(_deepcopy_atomic)
        6    0.000    0.000    0.000    0.000 copy.py:226(_deepcopy_list)
        6    0.000    0.000    0.000    0.000 copy.py:234(_deepcopy_tuple)
       24    0.000    0.000    0.000    0.000 copy.py:267(_keep_alive)
        6    0.000    0.000    0.000    0.000 copy.py:306(_reconstruct)
        9    0.000    0.000    0.000    0.000 copy.py:66(copy)
       57    0.000    0.000    0.000    0.000 dtypes.py:269(construct_from_string)
        3    0.000    

       60    0.000    0.000    0.002    0.000 series.py:3136(_sanitize_array)
       60    0.000    0.000    0.002    0.000 series.py:3153(_try_cast)
      108    0.000    0.000    0.000    0.000 series.py:326(_set_subtyp)
      114    0.000    0.000    0.000    0.000 series.py:336(name)
      108    0.000    0.000    0.000    0.000 series.py:340(name)
      120    0.000    0.000    0.000    0.000 series.py:347(dtype)
       30    0.000    0.000    0.000    0.000 series.py:367(values)
       74    0.000    0.000    0.000    0.000 series.py:400(_values)
       30    0.000    0.000    0.000    0.000 series.py:483(__len__)
        9    0.000    0.000    0.000    0.000 shape_base.py:182(vstack)
       99    0.000    0.000    0.000    0.000 shape_base.py:63(atleast_2d)
        3    0.000    0.000    0.000    0.000 sorting.py:119(is_int64_overflow_possible)
       30    0.000    0.000    0.002    0.000 strings.py:1002(str_split)
      150    0.000    0.000    0.000    0.000 strings.py:1031(<

AttributeError: 'NoneType' object has no attribute 'sort_values'

In [None]:
#process and appended new matches to full_dataset df once a day
col_list = ['first_kills', 'who_kill_who','awp_kills']
full_dataset = pd.DataFrame()
first = True
for date in data.date.unique():
    print date
    cur_slice = data.loc[data.date <= date,:]
    new_df = process_slice(cur_slice, col_list)
    if not first:
        new_df = new_df.loc[~new_df.match_id.isin(full_dataset.match_id),]
    else:
        first = False
    full_dataset = pd.concat([full_dataset, new_df])

## Creating X and y functions

In [57]:
datay = data.loc[:,['winner_of_match', 'player_team_name', 'match_id']]

In [58]:
data_adv = pd.get_dummies(data_adv,columns = ['player_team_name', 'player_team_opponent','map'])
data_adv = data_adv.drop(['match_id'], axis = 1)

In [59]:
datay.loc[datay.winner_of_match != datay.player_team_name, 'winner_of_match'] = 0
datay.loc[datay.winner_of_match == datay.player_team_name, 'winner_of_match'] = 1
datay.winner_of_match = datay.winner_of_match.apply(pd.to_numeric, errors='ignore')

datay = datay.groupby(['match_id', 'player_team_name'])['winner_of_match'].mean()
datay = pd.DataFrame(datay)
datay = datay.reset_index()
datay = datay.drop(['player_team_name','match_id'], axis = 1)

In [1037]:
#data_adv = data_adv.drop(['Mirage_win_his','Mirage_loss_his','Train_win_his','Train_loss_his','Cobblestone_win_his'
#                      ,'Cobblestone_loss_his','Cache_win_his','Cache_loss_his','Inferno_win_his','Inferno_loss_his'
#                      ,'Overpass_win_his','Overpass_loss_his','Nuke_win_his','Nuke_loss_his'], axis = 1)


# ,'who_kill_who_Cache_dr_hist'
#                       ,'who_kill_who_Cobblestone_dr_hist','who_kill_who_Inferno_dr_hist','who_kill_who_Mirage_dr_hist'
#                       ,'who_kill_who_Nuke_dr_hist','who_kill_who_Overpass_dr_hist','who_kill_who_Train_dr_hist'
#                       , 'who_kill_who_sum_dr_hist'], axis = 1)

In [1038]:
data_adv = data_adv.round(4)

In [1039]:
y = datay.values.astype(int)
X = data_adv.values

In [1040]:
def local_cv(model, params):                             #KFOLD WITH GRID SEARCH
    param_grid = params
    kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
    grid_result = grid.fit(X, y)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    for params, mean_score, scores in grid_result.grid_scores_:
        print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

In [1041]:
num_folds = 10
num_instances = len(X) 
seed = 7
scoring = 'roc_auc'


In [1042]:
#X = data_adv.iloc[:,top_56_important_features]

In [1057]:
models = []
models.append(('LR', LogisticRegression(random_state = seed)))
models.append(('LASSO', Lasso())) 
models.append(('Ridge', Ridge())) 
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('NB', GaussianNB()))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))#ewights = 'distance' 
models.append(('XGBClassifier', xgb.XGBClassifier()))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state = seed)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state = seed)))
models.append(('RandomForestClassifier', RandomForestClassifier(random_state = seed)))
models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state = seed)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state = seed)))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(shuffle = True, n_splits=num_folds, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring = scoring)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.692009 (0.041771)
LASSO: 0.605929 (0.047575)
Ridge: 0.693587 (0.040757)
LDA: 0.683520 (0.041036)
NB: 0.673947 (0.044565)
KNeighborsClassifier: 0.584792 (0.056266)
XGBClassifier: 0.691801 (0.041618)
GradientBoostingClassifier: 0.676411 (0.037104)
AdaBoostClassifier: 0.692027 (0.045368)
RandomForestClassifier: 0.605450 (0.044938)
ExtraTreesClassifier: 0.572198 (0.055233)
DecisionTreeClassifier: 0.576211 (0.064759)


In [484]:
estimator = LinearDiscriminantAnalysis()
rfe = RFECV(estimator,cv = kfold)
fit = rfe.fit(X,y)
print("Num of feature: %d") % fit.n_features_
#print("Selected features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

Num of feature: 85
Feature Ranking: [ 1  1  1  1  4  1 10  1  1  1  1  1  1  1  1  1  1  1  1  9  1  1  1  1  1
 11  5  1  1  1  3  1  1  8  1  6  7  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]


In [485]:
top_85_important_features = [] 
for b in range(0,len(fit.ranking_)):
    if fit.ranking_[b] == 1:
        top_85_important_features.append(b)
        print b,data_adv.columns[b]

0 first_kills_sum_dr_hist
1 awp_kills_sum_dr_hist
2 ADR_hist
3 Train_total_played
5 Cache_total_played
7 Overpass_total_played
8 Mirage_total_played
9 Nuke_total_played
10 Train_win_perc_map
11 Cobblestone_win_perc_map
12 Cache_win_perc_map
13 Inferno_win_perc_map
14 Overpass_win_perc_map
15 Mirage_win_perc_map
16 Nuke_win_perc_map
17 rd_total_his_Ghost
18 rd_total_his_OpTic
20 rd_total_his_Luminosity
21 rd_total_his_CLG
22 rd_total_his_NRG
23 rd_total_his_Cloud9
24 rd_total_his_Immortals
27 rd_total_his_Rogue
28 rd_total_his_Misfits
29 rd_total_his_Splyce
31 total_team_rd_Cache
32 total_team_rd_Cobblestone
34 total_team_rd_Mirage
37 total_team_rd_Nuke
38 ADR_his_Cache
39 ADR_his_Cobblestone
40 ADR_his_Inferno
41 ADR_his_Mirage
42 ADR_his_Nuke
43 ADR_his_Overpass
44 ADR_his_Train
45 first_kills_Cache_dr_hist
46 first_kills_Cobblestone_dr_hist
47 first_kills_Inferno_dr_hist
48 first_kills_Mirage_dr_hist
49 first_kills_Nuke_dr_hist
50 first_kills_Overpass_dr_hist
51 first_kills_Train_dr_

In [1054]:
for a in range(0, len(models)):
    model1 = models[a]
    for b in range(a+1, len(models)):
        model2 = models[b]
        for c in range(b+1, len(models)):
            model3 = models[c]
            estimators = []
            estimators.append(model1)
            estimators.append(model2)
            estimators.append(model3)
            ensemble = VotingClassifier(estimators, voting='soft')
            results = model_selection.cross_val_score(ensemble, X, y, cv=kfold, scoring= scoring)
            if results.mean() > .69:
                print(model1[0], model2[0], model3[0],results.mean(), results.std())

KeyboardInterrupt: 