In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,LinearSVR,SVC,SVR
from scipy.stats import norm,skew

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Year marker.
# NOTE(review): a later cell defines `def yr(s)`, which rebinds this name,
# so this value is not available once that cell has run.
yr = 2014

In [2]:
# --- Preliminary 2018 data -------------------------------------------------
# City lookup tables.
cities = pd.read_csv("WPrelimData2018/WCities_PrelimData2018.csv")
game_cities = pd.read_csv("WPrelimData2018/WGameCities_PrelimData2018.csv")

# Tournament seeds (read from the WDataFiles directory, not the prelim one).
tourney_seeds = pd.read_csv("WDataFiles/WNCAATourneySeeds.csv")

# Regular-season games: compact and detailed variants.
regular_season_results = pd.read_csv(
    "WPrelimData2018/WRegularSeasonCompactResults_PrelimData2018.csv"
)
detailed_regular_season = pd.read_csv(
    "WPrelimData2018/WRegularSeasonDetailedResults_PrelimData2018.csv"
)

# Tournament games: compact and detailed variants.
tourney_results = pd.read_csv(
    "WPrelimData2018/WNCAATourneyCompactResults_PrelimData2018.csv"
)
detailed_tourney = pd.read_csv(
    "WPrelimData2018/WNCAATourneyDetailedResults_PrelimData2018.csv"
)

# Stage-1 sample submission.
sample_submission = pd.read_csv("WDataFiles/WSampleSubmissionStage1.csv")

# Quick sanity check on the sizes of the two compact result tables.
print("Regular season results", regular_season_results.shape)
print("Tourney results", tourney_results.shape)

Regular season results (101450, 8)
Tourney results (1260, 8)


In [3]:
# Load the four prediction files that are compared and blended below.
# Each one is read in full; the cells below use an `ID` column from the first
# file and a `Pred` column from all four.
stats, stats_ensemble, reg, reg_ensemble = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dont_trust_reg.csv",
        "dont_trust_reg_lr_xgb.csv",
        "Trying_something_new.csv",
        "Trying_something_new_ensemble.csv",
    )
)

In [4]:
# Start the comparison table from the matchup IDs of the first prediction file;
# the frame takes its row index from `stats`.
combo = pd.DataFrame({'ID': stats['ID']})


def yr(s):
    """Return the first of the three underscore-separated integers in ``s``.

    ``s`` must consist of exactly three integer fields joined by ``_``
    (e.g. ``"2014_3103_3345"``); anything else raises ``ValueError``.
    """
    season, _first_team, _second_team = (int(field) for field in s.split('_'))
    return season

def t1(s):
    """Return the second of the three underscore-separated integers in ``s``.

    ``s`` must consist of exactly three integer fields joined by ``_``
    (e.g. ``"2014_3103_3345"``); anything else raises ``ValueError``.
    """
    _season, first_team, _second_team = (int(field) for field in s.split('_'))
    return first_team

def t2(s):
    """Return the third of the three underscore-separated integers in ``s``.

    ``s`` must consist of exactly three integer fields joined by ``_``
    (e.g. ``"2014_3103_3345"``); anything else raises ``ValueError``.
    """
    _season, _first_team, second_team = (int(field) for field in s.split('_'))
    return second_team

# Split each matchup ID into its three integer parts, one column per part.
for part_column, id_parser in (('Season', yr), ('Team1', t1), ('Team2', t2)):
    combo[part_column] = combo['ID'].apply(id_parser)

# Attach each file's predictions as its own column.
# NOTE(review): columns are aligned by row index, not by ID — this assumes all
# four files list the same matchups in the same order; TODO confirm.
for pred_column, submission in (
    ('Pred_stats', stats),
    ('Pred_stats_ensemble', stats_ensemble),
    ('Pred_reg', reg),
    ('Pred_reg_ensemble', reg_ensemble),
):
    combo[pred_column] = submission['Pred']
# combo.head()


# Ground-truth table for tournament games, shaped so it can be merged with
# `combo` on (Season, Team1, Team2): Team1 is the smaller of the two team IDs
# in a game and Team2 the larger.
FIRST_EVAL_SEASON = 2014  # earliest season kept from tourney_results

# Filter once and derive every column from the same rows, instead of
# re-applying the season mask per column and relying on index alignment to
# trim a column computed over all seasons.
recent_games = tourney_results.loc[tourney_results['Season'] >= FIRST_EVAL_SEASON]
game_team_ids = recent_games[['WTeamID', 'LTeamID']]

result = pd.DataFrame({
    'Season': recent_games['Season'],
    # Column-wise min/max replaces the row-wise sorted([...]) apply.
    'Team1': game_team_ids.min(axis=1),
    'Team2': game_team_ids.max(axis=1),
    # Constant marker: after a left merge it is NaN for matchups with no game.
    'match_taken_place': 1,
    # 1.0 when the winner is the smaller team ID (i.e. Team1 won), else 0.0.
    'true_Pred': (recent_games['WTeamID'] <= recent_games['LTeamID']).astype(float),
})



# Left-join the actual outcomes onto every predicted matchup. Matchups without
# a game in `result` get NaN in `match_taken_place` and `true_Pred`.
final = pd.merge(left=combo, right=result, how='left', on=['Season', 'Team1', 'Team2'])

# Keep only matchups that were actually played. NaN == 1 is False, so the
# unmatched rows drop out without filling them first. This replaces a chained
# `final.match_taken_place.fillna(0, inplace=True)`, which is deprecated and
# does not modify `final` when pandas Copy-on-Write is active.
final = final.loc[final['match_taken_place'] == 1]
final.head()

Unnamed: 0,ID,Season,Team1,Team2,Pred_stats,Pred_stats_ensemble,Pred_reg,Pred_reg_ensemble,match_taken_place,true_Pred
44,2014_3103_3345,2014,3103,3345,0.122727,0.119263,0.064634,0.077057,1.0,0.0
121,2014_3107_3452,2014,3107,3452,0.048167,0.02657,0.004531,0.012056,1.0,0.0
160,2014_3113_3323,2014,3113,3323,0.047461,0.049084,0.176736,0.152665,1.0,0.0
180,2014_3113_3435,2014,3113,3435,0.365452,0.318079,0.455259,0.439536,1.0,1.0
212,2014_3119_3268,2014,3119,3268,0.056776,0.030875,0.064634,0.077057,1.0,0.0


In [5]:
# Log loss of each individual prediction column on the played games.
for single_column in ('Pred_reg', 'Pred_stats', 'Pred_stats_ensemble', 'Pred_reg_ensemble'):
    print(log_loss(final['true_Pred'], final[single_column]))

# Weight pairs tried for every two-column blend: (first column, second column).
pair_weights = ((0.5, 0.5), (0.25, 0.75), (0.75, 0.25), (0.80, 0.20))

# One entry per report section:
# (heading, rule printed after the heading, first column, second column,
#  rule printed after the section).
blend_sections = (
    ("Ensemble with stats and 0.4498", "-------------",
     'Pred_reg', 'Pred_stats', '------------------------'),
    ("Ensemble with stats(ensemble) and 0.4498", '-----------------',
     'Pred_reg', 'Pred_stats_ensemble', "-----------------"),
    ("Ensemble with stats and 0.4498(ensemble)", "-------------------------------",
     'Pred_reg_ensemble', 'Pred_stats', "-----------------------------"),
    ("Ensembling of the both", "---------------------------",
     'Pred_reg_ensemble', 'Pred_stats_ensemble', "------------------------"),
)

# In the recorded run the lowest loss in this report came from
# 0.75 * Pred_reg + 0.25 * Pred_stats (first section, third line).
for heading, opening_rule, first_column, second_column, closing_rule in blend_sections:
    print(heading)
    print(opening_rule)
    for first_weight, second_weight in pair_weights:
        blended = final[first_column] * first_weight + final[second_column] * second_weight
        print(log_loss(final['true_Pred'], blended))
    print(closing_rule)

0.449069819997
0.46307665481
0.471415749266
0.451655513538
Ensemble with stats and 0.4498
-------------
0.444559351992
0.449941514656
0.444408393269
0.444949233116
------------------------
Ensemble with stats(ensemble) and 0.4498
-----------------
0.445377753838
0.452273453048
0.444667464182
0.445140155711
-----------------
Ensemble with stats and 0.4498(ensemble)
-------------------------------
0.446446434101
0.451172907727
0.446703842129
0.447308443376
-----------------------------
Ensembling of the both
---------------------------
0.446999963372
0.453343208753
0.446741502081
0.447305784772
------------------------


In [17]:
# Coarse grid search over four-model blend weights in steps of 0.1.
# Every model gets a weight of at least 0.1 and the four weights sum to 1:
#   i -> Pred_reg, j -> Pred_reg_ensemble, k -> Pred_stats,
#   remainder -> Pred_stats_ensemble.
#
# The grid is enumerated in integer tenths. Testing `i+j+k < 1` on floats is
# unreliable: 0.6+0.3+0.1 evaluates to 0.9999999999999999, which would let
# through a combination whose fourth weight is ~1e-16 instead of a grid step.
TENTHS = 10

for reg_tenths in range(1, TENTHS):
    for reg_ens_tenths in range(1, TENTHS - reg_tenths):
        for stats_tenths in range(1, TENTHS - reg_tenths - reg_ens_tenths):
            # Whatever is left over goes to the fourth model (always >= 1 tenth).
            stats_ens_tenths = TENTHS - reg_tenths - reg_ens_tenths - stats_tenths

            i = reg_tenths / TENTHS
            j = reg_ens_tenths / TENTHS
            k = stats_tenths / TENTHS
            blended = (i * final['Pred_reg'] + j * final['Pred_reg_ensemble']
                       + k * final['Pred_stats']
                       + (stats_ens_tenths / TENTHS) * final['Pred_stats_ensemble'])
            print(i, j, k, log_loss(final['true_Pred'], blended))

0.1 0.1 0.1 0.454369964448
0.1 0.1 0.2 0.453701448111
0.1 0.1 0.3 0.453157622285
0.1 0.1 0.4 0.452737602956
0.1 0.1 0.5 0.452441359
0.1 0.1 0.6 0.452269790745
0.1 0.1 0.7 0.452224861877
0.1 0.2 0.1 0.450443873913
0.1 0.2 0.2 0.4499601239
0.1 0.2 0.3 0.449591967626
0.1 0.2 0.4 0.449339692754
0.1 0.2 0.5 0.449204369241
0.1 0.2 0.6 0.449187968949
0.1 0.3 0.1 0.447848244811
0.1 0.3 0.2 0.447503141565
0.1 0.3 0.3 0.447269094621
0.1 0.3 0.4 0.447147123921
0.1 0.3 0.5 0.447139045707
0.1 0.4 0.1 0.446315591439
0.1 0.4 0.2 0.446083112845
0.1 0.4 0.3 0.445959802673
0.1 0.4 0.4 0.445947250402
0.1 0.5 0.1 0.445697929296
0.1 0.5 0.2 0.445563193137
0.1 0.5 0.3 0.445537597978
0.1 0.6 0.1 0.445915976615
0.1 0.6 0.2 0.445871458615
0.1 0.7 0.1 0.44693809927
0.2 0.1 0.1 0.450038003959
0.2 0.1 0.2 0.449546946861
0.2 0.1 0.3 0.449170719958
0.2 0.1 0.4 0.448909509494
0.2 0.1 0.5 0.448764254529
0.2 0.1 0.6 0.448736755855
0.2 0.2 0.1 0.447479203342
0.2 0.2 0.2 0.44712536415
0.2 0.2 0.3 0.446881831822
0.2 0.2 

In [21]:
# Score one hand-picked four-model blend; the weights sum to 1.
chosen_weights = {
    'Pred_reg': 0.15,
    'Pred_reg_ensemble': 0.35,
    'Pred_stats': 0.25,
    'Pred_stats_ensemble': 0.25,
}
# Terms are accumulated in the order listed above.
chosen_blend = sum(weight * final[column] for column, weight in chosen_weights.items())
log_loss(final['true_Pred'], chosen_blend)

0.44582744718947848