In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,LinearSVR,SVC,SVR
from scipy.stats import norm,skew

# Setting the number of columns to display
pd.set_option('display.max_columns', None)
# year
yr=2014

In [2]:
cities=pd.read_csv("WPrelimData2018/WCities_PrelimData2018.csv")
game_cities=pd.read_csv("WPrelimData2018/WGameCities_PrelimData2018.csv")

tourney_seeds=pd.read_csv("WDataFiles/WNCAATourneySeeds.csv")

regular_season_results=pd.read_csv("WPrelimData2018/WRegularSeasonCompactResults_PrelimData2018.csv")
detailed_regular_season=pd.read_csv("WPrelimData2018/WRegularSeasonDetailedResults_PrelimData2018.csv")

tourney_results=pd.read_csv("WPrelimData2018/WNCAATourneyCompactResults_PrelimData2018.csv")
detailed_tourney=pd.read_csv("WPrelimData2018/WNCAATourneyDetailedResults_PrelimData2018.csv")


sample_submission=pd.read_csv("WDataFiles/WSampleSubmissionStage1.csv")

print("Regular season results",regular_season_results.shape)
print("Tourney results",tourney_results.shape)

Regular season results (101450, 8)
Tourney results (1260, 8)


In [3]:
stats=pd.read_csv("dont_trust_reg.csv")
stats_ensemble=pd.read_csv("dont_trust_reg_lr_xgb.csv")
reg=pd.read_csv("Trying_something_new.csv")
reg_ensemble=pd.read_csv("Trying_something_new_ensemble.csv")

In [4]:
combo=pd.DataFrame()
combo['ID']=stats['ID']


def yr(s):
    yr,team1,team2=map(int,s.split('_'))
    return yr;

def t1(s):
    yr,team1,team2=map(int,s.split('_'))
    return team1;

def t2(s):
    yr,team1,team2=map(int,s.split('_'))
    return team2;

combo['Season']=combo['ID'].apply(yr)
combo['Team1']=combo['ID'].apply(t1)
combo['Team2']=combo['ID'].apply(t2)


combo['Pred_stats']=stats['Pred']
combo['Pred_stats_ensemble']=stats_ensemble['Pred']
combo['Pred_reg']=reg['Pred']
combo['Pred_reg_ensemble']=reg_ensemble['Pred']
# combo.head()


result=pd.DataFrame()
result['Season']=tourney_results.loc[tourney_results['Season']>=2014,'Season']
result['Team1']=tourney_results.loc[tourney_results['Season']>=2014].apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[0], axis=1)
result['Team2']=tourney_results.loc[tourney_results['Season']>=2014].apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[1], axis=1)

result['match_taken_place']=1
result['true_Pred'] = tourney_results.apply(lambda r: 1. if sorted([r['WTeamID'],r['LTeamID']])[0]==r['WTeamID'] else 0., axis=1)



final=pd.merge(left=combo,right=result,how='left',on=['Season','Team1','Team2'])
final.match_taken_place.fillna(0,inplace=True)
final=final.loc[final['match_taken_place']==1.0]
final.head()

Unnamed: 0,ID,Season,Team1,Team2,Pred_stats,Pred_stats_ensemble,Pred_reg,Pred_reg_ensemble,match_taken_place,true_Pred
44,2014_3103_3345,2014,3103,3345,0.122727,0.119263,0.064634,0.077057,1.0,0.0
121,2014_3107_3452,2014,3107,3452,0.048167,0.02657,0.004531,0.012056,1.0,0.0
160,2014_3113_3323,2014,3113,3323,0.047461,0.049084,0.176736,0.152665,1.0,0.0
180,2014_3113_3435,2014,3113,3435,0.365452,0.318079,0.455259,0.439536,1.0,1.0
212,2014_3119_3268,2014,3119,3268,0.056776,0.030875,0.064634,0.077057,1.0,0.0


In [5]:
print(log_loss(final['true_Pred'],final['Pred_reg']))
print(log_loss(final['true_Pred'],final['Pred_stats']))
print(log_loss(final['true_Pred'],final['Pred_stats_ensemble']))
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']))

print("Ensemble with stats and 0.4498")
print("-------------")
print(log_loss(final['true_Pred'],final['Pred_reg']*0.5+final['Pred_stats']*0.5))
print(log_loss(final['true_Pred'],final['Pred_reg']*0.25+final['Pred_stats']*0.75))     
print(log_loss(final['true_Pred'],final['Pred_reg']*0.75+final['Pred_stats']*0.25))             # best
print(log_loss(final['true_Pred'],final['Pred_reg']*0.80+final['Pred_stats']*0.20))     
print('------------------------')

print("Ensemble with stats(ensemble) and 0.4498")
print('-----------------')
print(log_loss(final['true_Pred'],final['Pred_reg']*0.5+final['Pred_stats_ensemble']*0.5))
print(log_loss(final['true_Pred'],final['Pred_reg']*0.25+final['Pred_stats_ensemble']*0.75))            
print(log_loss(final['true_Pred'],final['Pred_reg']*0.75+final['Pred_stats_ensemble']*0.25))
print(log_loss(final['true_Pred'],final['Pred_reg']*0.80+final['Pred_stats_ensemble']*0.20))  
print("-----------------")

print("Ensemble with stats and 0.4498(ensemble)")
print("-------------------------------")
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.5+final['Pred_stats']*0.5))
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.25+final['Pred_stats']*0.75))     
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.75+final['Pred_stats']*0.25))
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.80+final['Pred_stats']*0.20))  
print("-----------------------------")

print("Ensembling of the both")
print("---------------------------")
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.5+final['Pred_stats_ensemble']*0.5))
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.25+final['Pred_stats_ensemble']*0.75))            
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.75+final['Pred_stats_ensemble']*0.25))
print(log_loss(final['true_Pred'],final['Pred_reg_ensemble']*0.80+final['Pred_stats_ensemble']*0.20))
print("------------------------")

0.449069819997
0.46307665481
0.471415749266
0.451655513538
Ensemble with stats and 0.4498
-------------
0.444559351992
0.449941514656
0.444408393269
0.444949233116
------------------------
Ensemble with stats(ensemble) and 0.4498
-----------------
0.445377753838
0.452273453048
0.444667464182
0.445140155711
-----------------
Ensemble with stats and 0.4498(ensemble)
-------------------------------
0.446446434101
0.451172907727
0.446703842129
0.447308443376
-----------------------------
Ensembling of the both
---------------------------
0.446999963372
0.453343208753
0.446741502081
0.447305784772
------------------------


In [6]:
a=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
b=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
c=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]


for i in a:
    for j in b:
        for k in c:
            print(i,j,k,log_loss(final['true_Pred'],i*final['Pred_reg']+j*final['Pred_reg_ensemble']
                                    +k*final['Pred_stats']+(1-i-j-k)*final['Pred_stats_ensemble']))

0.1 0.1 0.1 0.454369964448
0.1 0.1 0.2 0.453701448111
0.1 0.1 0.3 0.453157622285
0.1 0.1 0.4 0.452737602956
0.1 0.1 0.5 0.452441359
0.1 0.1 0.6 0.452269790745
0.1 0.1 0.7 0.452224861877
0.1 0.1 0.8 0.452309806142
0.1 0.1 0.9 0.452529445413
0.1 0.2 0.1 0.450443873913
0.1 0.2 0.2 0.4499601239
0.1 0.2 0.3 0.449591967626
0.1 0.2 0.4 0.449339692754
0.1 0.2 0.5 0.449204369241
0.1 0.2 0.6 0.449187968949
0.1 0.2 0.7 0.449293545741
0.1 0.2 0.8 0.449525505349
0.1 0.2 0.9 0.449890014981
0.1 0.3 0.1 0.447848244811
0.1 0.3 0.2 0.447503141565
0.1 0.3 0.3 0.447269094621
0.1 0.3 0.4 0.447147123921
0.1 0.3 0.5 0.447139045707
0.1 0.3 0.6 0.447247626776
0.1 0.3 0.7 0.447476812437
0.1 0.3 0.8 0.447832067571
0.1 0.3 0.9 0.448320899324
0.1 0.4 0.1 0.446315591439
0.1 0.4 0.2 0.446083112845
0.1 0.4 0.3 0.445959802673
0.1 0.4 0.4 0.445947250402
0.1 0.4 0.5 0.446047901865
0.1 0.4 0.6 0.44626525175
0.1 0.4 0.7 0.446604128822
0.1 0.4 0.8 0.447071127351
0.1 0.4 0.9 0.447675280279
0.1 0.5 0.1 0.445697929296
0.1 0.5

0.5 0.1 0.5 0.444388764401
0.5 0.1 0.6 0.444647795321
0.5 0.1 0.7 0.445024200441
0.5 0.1 0.8 0.445523928334
0.5 0.1 0.9 0.44615502129
0.5 0.2 0.1 0.444722309258
0.5 0.2 0.2 0.444631428392
0.5 0.2 0.3 0.444647705457
0.5 0.2 0.4 0.444773108402
0.5 0.2 0.5 0.445010481121
0.5 0.2 0.6 0.445363747948
0.5 0.2 0.7 0.445838215929
0.5 0.2 0.8 0.446441031978
0.5 0.2 0.9 0.447181897393
0.5 0.3 0.1 0.445775186217
0.5 0.3 0.2 0.445768461738
0.5 0.3 0.3 0.445871585067
0.5 0.3 0.4 0.446087091998
0.5 0.3 0.5 0.446418534859
0.5 0.3 0.6 0.446870741767
0.5 0.3 0.7 0.447450204035
0.5 0.3 0.8 0.448165671357
0.5 0.3 0.9 0.449029101985
0.5 0.4 0.1 0.447623039766
0.5 0.4 0.2 0.447704166612
0.5 0.4 0.3 0.447899728967
0.5 0.4 0.4 0.448213058324
0.5 0.4 0.5 0.448648719881
0.5 0.4 0.6 0.449212855262
0.5 0.4 0.7 0.449913700188
0.5 0.4 0.8 0.450762392276
0.5 0.4 0.9 0.451774288074
0.5 0.5 0.1 0.450318589398
0.5 0.5 0.2 0.450499238248
0.5 0.5 0.3 0.450801894776
0.5 0.5 0.4 0.451231224516
0.5 0.5 0.5 0.451793527217
0.

0.8 0.9 0.3 0.863843346582
0.8 0.9 0.4 0.864371262064
0.8 0.9 0.5 0.865076352277
0.8 0.9 0.6 0.865978341187
0.8 0.9 0.7 0.867107129173
0.8 0.9 0.8 0.868547270401
0.8 0.9 0.9 0.870455527673
0.9 0.1 0.1 0.449297981483
0.9 0.1 0.2 0.449419454751
0.9 0.1 0.3 0.449657800936
0.9 0.1 0.4 0.450016520809
0.9 0.1 0.5 0.450500388247
0.9 0.1 0.6 0.451115786467
0.9 0.1 0.7 0.451871207013
0.9 0.1 0.8 0.45277801063
0.9 0.1 0.9 0.453861848121
0.9 0.2 0.1 0.452942093229
0.9 0.2 0.2 0.453174919853
0.9 0.2 0.3 0.453536154051
0.9 0.2 0.4 0.454031152576
0.9 0.2 0.5 0.454667470221
0.9 0.2 0.6 0.455455346563
0.9 0.2 0.7 0.456408639563
0.9 0.2 0.8 0.457561059966
0.9 0.2 0.9 0.458973110879
0.9 0.3 0.1 0.457786970982
0.9 0.3 0.2 0.458182298045
0.9 0.3 0.3 0.458728859866
0.9 0.3 0.4 0.45943859001
0.9 0.3 0.5 0.46032834247
0.9 0.3 0.6 0.461422490878
0.9 0.3 0.7 0.462776825072
0.9 0.3 0.8 0.464475274993
0.9 0.3 0.9 0.466637005476
0.9 0.4 0.1 0.464247688573
0.9 0.4 0.2 0.464919645684
0.9 0.4 0.3 0.465817319537
0.9 

In [7]:
print(log_loss(final['true_Pred'],final['Pred_reg']*0.65+final['Pred_stats']*0.35))

0.443893969133


In [8]:
a=np.arange(0,1+0.05,0.05)
for i in a:
    print(i,log_loss(final['true_Pred'],final['Pred_reg']*i+final['Pred_stats']*(1-i)))

0.0 0.46307665481
0.05 0.459513987824
0.1 0.456516058826
0.15 0.453966669584
0.2 0.451791988662
0.25 0.449941514656
0.3 0.448378999501
0.35 0.447077576763
0.4 0.446016919891
0.45 0.445181481269
0.5 0.444559351992
0.55 0.444141501001
0.6 0.443921259036
0.65 0.443893969133
0.7 0.444056756959
0.75 0.444408393269
0.8 0.444949233116
0.85 0.445681225511
0.9 0.446607994974
0.95 0.447735004544
1.0 0.449069819997


**So we can get 0.443893 with 0.65 pred_reg and 0.35 pred_stats**

In [9]:
a=np.arange(0.65,0.75,0.0005)
for i in a:
    print(i,log_loss(final['true_Pred'],final['Pred_reg']*i+final['Pred_stats']*(1-i)))

0.65 0.443893969133
0.6505 0.443894659506
0.651 0.443895368872
0.6515 0.443896097228
0.652 0.443896844572
0.6525 0.443897610902
0.653 0.443898396217
0.6535 0.443899200514
0.654 0.443900023791
0.6545 0.443900866046
0.655 0.443901727278
0.6555 0.443902607485
0.656 0.443903506665
0.6565 0.443904424815
0.657 0.443905361935
0.6575 0.443906318022
0.658 0.443907293075
0.6585 0.443908287092
0.659 0.44390930007
0.6595 0.443910332009
0.66 0.443911382906
0.6605 0.44391245276
0.661 0.44391354157
0.6615 0.443914649333
0.662 0.443915776047
0.6625 0.443916921712
0.663 0.443918086326
0.6635 0.443919269886
0.664 0.443920472391
0.6645 0.443921693841
0.665 0.443922934232
0.6655 0.443924193565
0.666 0.443925471836
0.6665 0.443926769045
0.667 0.44392808519
0.6675 0.443929420269
0.668 0.443930774282
0.6685 0.443932147226
0.669 0.4439335391
0.6695 0.443934949904
0.67 0.443936379634
0.6705 0.443937828291
0.671 0.443939295873
0.6715 0.443940782377
0.672 0.443942287804
0.6725 0.443943812151
0.673 0.443945355418

In [10]:
a=np.arange(0.6,0.70,0.0005)
for i in a:
    print(i,log_loss(final['true_Pred'],final['Pred_reg']*i+final['Pred_stats']*(1-i)))

0.6 0.443921259036
0.6005 0.443920037233
0.601 0.443918834706
0.6015 0.443917651453
0.602 0.443916487468
0.6025 0.44391534275
0.603 0.443914217293
0.6035 0.443913111095
0.604 0.443912024152
0.6045 0.44391095646
0.605 0.443909908016
0.6055 0.443908878816
0.606 0.443907868856
0.6065 0.443906878134
0.607 0.443905906645
0.6075 0.443904954386
0.608 0.443904021354
0.6085 0.443903107545
0.609 0.443902212956
0.6095 0.443901337583
0.61 0.443900481423
0.6105 0.443899644472
0.611 0.443898826728
0.6115 0.443898028186
0.612 0.443897248844
0.6125 0.443896488699
0.613 0.443895747746
0.6135 0.443895025983
0.614 0.443894323406
0.6145 0.443893640012
0.615 0.443892975798
0.6155 0.443892330761
0.616 0.443891704897
0.6165 0.443891098204
0.617 0.443890510678
0.6175 0.443889942316
0.618 0.443889393115
0.6185 0.443888863072
0.619 0.443888352183
0.6195 0.443887860446
0.62 0.443887387858
0.6205 0.443886934415
0.621 0.443886500115
0.6215 0.443886084955
0.622 0.443885688931
0.6225 0.44388531204
0.623 0.4438849542

In [11]:
print(log_loss(final['true_Pred'],final['Pred_reg']*0.6325+final['Pred_stats']*(1-0.6325)))

0.443881787869


**so we can get 0.443881 with 0.6325 and with 0.65 we can get 0.443893.**

In [24]:
a=64
b=
print(log_loss(final['true_Pred'][a:b],final['Pred_reg'][a:b]*0.6325+final['Pred_stats'][a:b]*(1-0.6325)))

0.429239661281


## CLIPPING 

In [12]:
threshold=0.07

k=final['Pred_reg']*0.65+final['Pred_stats']*0.35
clip=pd.DataFrame()
clip['Pred']=k
clip.loc[clip['Pred']<threshold,'Pred']=0
clip.loc[clip['Pred']>1-threshold,'Pred']=1

print(log_loss(final['true_Pred'],clip['Pred']))

0.438422514446


**if we clip with 0.00 we get a log loss of 0.443893**

**if we clip with 0.05 we get a log loss of 0.440**

**if we clip with 0.07 we get a log loss of 0.4384**

**if we clip with 0.1 we get a log loss of 0.4350**

In [13]:
threshold=0.07

k=final['Pred_reg']*0.6325+final['Pred_stats']*0.3675
clip=pd.DataFrame()
clip['Pred']=k
clip.loc[clip['Pred']<threshold,'Pred']=0
clip.loc[clip['Pred']>1-threshold,'Pred']=1

print(log_loss(final['true_Pred'],clip['Pred']))

0.438321300544


**if we clip with 0.00 we get a log loss of 0.443881**

**if we clip with 0.05 we get a log loss of 0.4403**

**if we clip with 0.07 we get a log loss of 0.4383**

**if we clip with 0.1 we get a log loss of 0.4349**