In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,LinearSVR,SVC,SVR
from scipy.stats import norm,skew

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)
# Season cutoff: only tournament games from seasons strictly before `yr`
# are used to build the training data.
yr=2014

In [2]:
# All competition CSVs live in one folder; change DATA_DIR to point elsewhere.
DATA_DIR = "WStage2DataFiles"


def _read_table(filename):
    """Read one competition CSV from DATA_DIR into a DataFrame."""
    return pd.read_csv(f"{DATA_DIR}/{filename}")


# City lookup tables.
cities = _read_table("WCities.csv")
game_cities = _read_table("WGameCities.csv")

# Tournament seeds per (Season, TeamID).
tourney_seeds = _read_table("WNCAATourneySeeds.csv")

# Regular-season games: compact and detailed box-score variants.
regular_season_results = _read_table("WRegularSeasonCompactResults.csv")
detailed_regular_season = _read_table("WRegularSeasonDetailedResults.csv")

# Tournament games: compact and detailed box-score variants.
tourney_results = _read_table("WNCAATourneyCompactResults.csv")
detailed_tourney = _read_table("WNCAATourneyDetailedResults.csv")

# Submission template; its 'ID' column defines the matchups to predict.
sample_submission = _read_table("WSampleSubmissionStage2.csv")

print("Regular season results", regular_season_results.shape, detailed_regular_season.shape)
print("Tourney results", tourney_results.shape, detailed_tourney.shape)

Regular season results (101893, 8) (46342, 34)
Tourney results (1260, 8) (504, 34)


In [3]:
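# Disabled experiment: append the regular-season games to the tournament games
# so that both are used as training data.
# tourney_results=pd.concat([tourney_results,regular_season_results],axis=0)
# tourney_results.shape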

In [4]:
# for extraction of seed and region
def extract_seedint(seed):
    """Return the numeric part of a seed string as an int.

    The first character of ``seed`` is the region letter and the rest is the
    seed number (e.g. ``"W01"`` -> ``1``). Trailing letters after the number
    (e.g. a play-in marker as in ``"W16a"``) are stripped before parsing, so
    such seeds no longer raise ``ValueError``.

    Parameters
    ----------
    seed : str
        Seed string: one region character followed by the seed number.

    Returns
    -------
    int
        The seed number.

    Raises
    ------
    ValueError
        If nothing numeric remains after the region character.
    """
    import string  # local import keeps this cell self-contained

    return int(seed[1:].rstrip(string.ascii_letters))
def region(seed):
    """Return the region code, i.e. the first character of the seed string."""
    region_letter = seed[0]
    return region_letter
# Split the raw 'Seed' string into a numeric seed and a region letter,
# then remove the raw column.
# NOTE: this cell is not re-runnable — 'Seed' no longer exists after the first run.
tourney_seeds['seed_int'] = tourney_seeds['Seed'].map(extract_seedint)
tourney_seeds['region'] = tourney_seeds['Seed'].map(region)
del tourney_seeds['Seed']
tourney_seeds.head(10)

Unnamed: 0,Season,TeamID,seed_int,region
0,1998,3330,1,W
1,1998,3163,2,W
2,1998,3112,3,W
3,1998,3301,4,W
4,1998,3272,5,W
5,1998,3438,6,W
6,1998,3208,7,W
7,1998,3307,8,W
8,1998,3304,9,W
9,1998,3203,10,W


In [5]:
# merging the tournament results
# Attach the seed and region of the winning and of the losing team to every
# tournament game. The seed table is renamed twice so that it can be joined
# once on (Season, WTeamID) and once on (Season, LTeamID).
winseeds = tourney_seeds.rename(columns={'TeamID': 'WTeamID', 'seed_int': 'WSeed', 'region': 'WRegion'})
lossseeds = tourney_seeds.rename(columns={'TeamID': 'LTeamID', 'seed_int': 'LSeed', 'region': 'LRegion'})
df_dummy = pd.merge(left=tourney_results, right=winseeds, how='left', on=['Season', 'WTeamID'])
tourney_results = pd.merge(left=df_dummy, right=lossseeds, how='left', on=['Season', 'LTeamID'])
# Games whose team has no seed entry get seed 0. The filled columns are
# assigned back explicitly: `df.col.fillna(..., inplace=True)` operates on an
# intermediate Series and does not update the frame under pandas Copy-on-Write.
tourney_results = tourney_results.fillna({'WSeed': 0, 'LSeed': 0})
tourney_results.head(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,WRegion,LSeed,LRegion
0,1998,137,3104,94,3422,46,H,0,2,X,15,X
1,1998,137,3112,75,3365,63,H,0,3,W,14,W
2,1998,137,3163,93,3193,52,H,0,2,W,15,W
3,1998,137,3198,59,3266,45,H,0,7,Y,10,Y
4,1998,137,3203,74,3208,72,A,0,10,W,7,W
5,1998,137,3234,77,3269,59,H,0,4,Z,13,Z
6,1998,137,3242,72,3408,68,H,0,5,Z,12,Z
7,1998,137,3301,89,3263,64,H,0,4,W,13,W
8,1998,137,3304,76,3307,59,N,0,9,W,8,W
9,1998,137,3314,91,3224,71,H,0,2,Y,15,Y


In [6]:
# Winner's view of every tournament game played before the cutoff season `yr`.
# `.copy()` makes `wins` an independent frame, so the column assignments below
# never write into a slice of `tourney_results`.
wins = tourney_results.loc[
    tourney_results['Season'] < yr,
    ['Season', 'WSeed', 'LSeed', 'WScore', 'LScore'],
].copy()
wins['SeedDiff'] = wins['WSeed'] - wins['LSeed']
wins['ScoreDiff'] = wins['WScore'] - wins['LScore']
wins = wins.drop(columns=['WSeed', 'LSeed', 'WScore', 'LScore'])
wins['Result'] = 1

# Loser's view of the same games: both differences change sign and the label
# becomes 0. Season is left untouched (no negate-then-restore round trip).
losses = wins.assign(
    SeedDiff=-wins['SeedDiff'],
    ScoreDiff=-wins['ScoreDiff'],
    Result=0,
)

# Column order (Season, SeedDiff, ScoreDiff, Result) matters: later cells
# select columns by position. `ignore_index=True` gives the stacked frame a
# unique 0..n-1 index instead of repeating every label twice.
data = pd.concat([wins, losses], axis=0, ignore_index=True)
data.head()

Unnamed: 0,Season,SeedDiff,ScoreDiff,Result
0,1998,-13,48,1
1,1998,-11,12,1
2,1998,-13,41,1
3,1998,-3,14,1
4,1998,3,2,1


In [7]:
# Regression task: SeedDiff is the single feature (kept 2-D), ScoreDiff the target.
X_train = data[['SeedDiff']].to_numpy()
y_train = data['ScoreDiff'].to_numpy()

print(X_train.shape, y_train.shape)

(2016, 1) (2016,)


In [8]:
# Hold out 10% of the rows for validation.
# NOTE: X_train / y_train are overwritten, so re-running this cell on its own
# splits the already-reduced training set again.
regression_split = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, shuffle=True
)
X_train, X_val, y_train, y_val = regression_split
print("For regression ")
print("The training shape is", X_train.shape, " and the label shape is", y_train.shape)
print("The validation shape is", X_val.shape, " and the label shape is", y_val.shape)

For regression 
The training shape is (1814, 1)  and the label shape is (1814,)
The validation shape is (202, 1)  and the label shape is (202,)


My plan is to train a regressor that predicts the score difference from the seed difference; the same regressor is then used to predict the score difference for the test data.

In [9]:
# Ordinary least squares baseline: ScoreDiff ~ SeedDiff.
lr = LinearRegression()
lr.fit(X_train, y_train)
# `score` returns the coefficient of determination R^2 (higher is better),
# so it is reported as a score rather than an error, like the other regressors.
print("The training Score is", lr.score(X_train, y_train))
print("The validation Score is", lr.score(X_val, y_val))

The training Error is 0.562418145765
The validation error is 0.580603633846


In [10]:
# Random forest on the same single-feature regression task.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_train_score = rf.score(X_train, y_train)
rf_val_score = rf.score(X_val, y_val)
print("The training Score is", rf_train_score)
print("The validation Score is", rf_val_score)

The training Score is 0.583358825931
The validation Score is 0.583543348298


In [11]:
# Gradient-boosted trees; this fitted regressor is reused later to produce
# the DiffbyRegressor feature.
xgr = XGBRegressor(random_state=42).fit(X_train, y_train)
xgr_train_score = xgr.score(X_train, y_train)
xgr_val_score = xgr.score(X_val, y_val)
print("The training Score is", xgr_train_score)
print("The validation Score is", xgr_val_score)

The training Score is 0.583993674602
The validation Score is 0.59055595799


In [12]:
# Support-vector regression with default hyper-parameters.
svr = SVR().fit(X_train, y_train)
svr_train_score = svr.score(X_train, y_train)
svr_val_score = svr.score(X_val, y_val)
print("training Score", svr_train_score)
print("validation score", svr_val_score)

training Score 0.554556875594
validation score 0.56286949699


In [13]:
# L2-regularised linear regression with default hyper-parameters.
ridge = Ridge().fit(X_train, y_train)
ridge_train_score = ridge.score(X_train, y_train)
ridge_val_score = ridge.score(X_val, y_val)
print("The training Score is", ridge_train_score)
print("The validation Score is", ridge_val_score)

The training Score is 0.562418145708
The validation Score is 0.580603290576


In [14]:
# L1-regularised linear regression with default hyper-parameters.
lasso = Lasso().fit(X_train, y_train)
lasso_train_score = lasso.score(X_train, y_train)
lasso_val_score = lasso.score(X_val, y_val)
print("The training Score is", lasso_train_score)
print("The validation Score is", lasso_val_score)

The training Score is 0.562372464431
The validation Score is 0.580253512283


I will use the XGBoost regressor (`xgr`), which has the highest validation score above (just a try!).

In [15]:
# Feed SeedDiff (column 1, kept 2-D) through the fitted XGBoost regressor and
# store the predicted score difference as a new feature column.
pred_data = data.iloc[:, [1]].to_numpy()
predicted_score_diff = xgr.predict(pred_data)
data['DiffbyRegressor'] = predicted_score_diff
data.head()

Unnamed: 0,Season,SeedDiff,ScoreDiff,Result,DiffbyRegressor
0,1998,-13,48,1,26.988544
1,1998,-11,12,1,19.175648
2,1998,-13,41,1,26.988544
3,1998,-3,14,1,5.767014
4,1998,3,2,1,-6.155714


I will build a classifier on the data using SeedDiff and the score difference predicted by the regressor (DiffbyRegressor).

In [16]:
# Classification task: features are SeedDiff and DiffbyRegressor, label is Result.
X_train = data[['SeedDiff', 'DiffbyRegressor']].to_numpy()
y_train = data['Result'].to_numpy()
print(X_train.shape, y_train.shape)

(2016, 2) (2016,)


In [17]:
# Hold out 10% of the rows for validation.
# NOTE: X_train / y_train are overwritten, so re-running this cell on its own
# splits the already-reduced training set again.
classification_split = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, shuffle=True
)
X_train, X_val, y_train, y_val = classification_split

print("The training shape is", X_train.shape, " and the label shape is", y_train.shape)
print("The validation shape is", X_val.shape, " and the label shape is", y_val.shape)

The training shape is (1814, 2)  and the label shape is (1814,)
The validation shape is (202, 2)  and the label shape is (202,)


# Classifiers

In [18]:
# Logistic regression on (SeedDiff, DiffbyRegressor).
# NOTE: `lr` previously held the LinearRegression model; it is rebound here.
lr = LogisticRegression().fit(X_train, y_train)
lr_train_proba = lr.predict_proba(X_train)[:, 1]  # P(Result == 1)
lr_val_proba = lr.predict_proba(X_val)[:, 1]
print("The training log loss is", log_loss(y_train, lr_train_proba))
print("The validation log los is", log_loss(y_val, lr_val_proba))

The training log loss is 0.453238515125
The validation log los is 0.453498839933


In [19]:
# Gradient-boosted classifier on the same two features.
xgc = XGBClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
xgc_train_proba = xgc.predict_proba(X_train)[:, 1]  # P(Result == 1)
xgc_val_proba = xgc.predict_proba(X_val)[:, 1]
print("The training log loss is", log_loss(y_train, xgc_train_proba))
print("The validation log los is", log_loss(y_val, xgc_val_proba))

The training log loss is 0.446465727793
The validation log los is 0.465703744524


In [20]:
# Equal-weight blend of the XGBoost and logistic-regression win probabilities.
blend_train_proba = 0.5 * xgc.predict_proba(X_train)[:, 1] + 0.5 * lr.predict_proba(X_train)[:, 1]
blend_val_proba = 0.5 * xgc.predict_proba(X_val)[:, 1] + 0.5 * lr.predict_proba(X_val)[:, 1]
print("The training loss of the ensemble is", log_loss(y_train, blend_train_proba))
print("The validation loss of the ensemble is", log_loss(y_val, blend_val_proba))

The training loss of the ensemble is 0.449293038887
The validation loss of the ensemble is 0.455958337856


## TEST PREPROCESSING

In [21]:
# Each submission ID has the form "<Season>_<TeamID>_<TeamID>". Split all IDs
# in one vectorized call instead of filling an object-dtype frame row by row.
# The W/L column names only mirror the training layout so the seed tables can
# be merged on them; they do not say which team won.
test = sample_submission['ID'].str.split('_', expand=True).astype(int)
test.columns = ['Season', 'WTeamID', 'LTeamID']
test.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,2018,3110,3113
1,2018,3110,3114
2,2018,3110,3124
3,2018,3110,3125
4,2018,3110,3129


In [22]:
# Attach the seeds of both teams of every test matchup, drop the region
# letters, and compute the same SeedDiff feature as in training.
# NOTE: `test` is overwritten, so this cell is not safe to re-run on its own.
df_dummy = test.merge(winseeds, how='left', on=['Season', 'WTeamID'])
test = (
    df_dummy
    .merge(lossseeds, how='left', on=['Season', 'LTeamID'])
    .drop(columns=['WRegion', 'LRegion'])
)
test['SeedDiff'] = test['WSeed'] - test['LSeed']
test.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff
0,2018,3110,3113,14,7,7
1,2018,3110,3114,14,14,0
2,2018,3110,3124,14,2,12
3,2018,3110,3125,14,12,2
4,2018,3110,3129,14,16,-2


# predictions of regressor

In [23]:
# SeedDiff (column 5, kept 2-D) goes through the XGBoost regressor to build
# the DiffbyRegressor feature for the test matchups.
X_test = test[['SeedDiff']].to_numpy()
test['DiffbyRegressor'] = xgr.predict(X_test)
test.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff,DiffbyRegressor
0,2018,3110,3113,14,7,7,-12.642154
1,2018,3110,3114,14,14,0,-0.688757
2,2018,3110,3124,14,2,12,-29.073074
3,2018,3110,3125,14,12,2,-8.113288
4,2018,3110,3129,14,16,-2,9.717129


In [24]:
# Re-display the training frame (presumably to compare its columns with `test`).
data.head()

Unnamed: 0,Season,SeedDiff,ScoreDiff,Result,DiffbyRegressor
0,1998,-13,48,1,26.988544
1,1998,-11,12,1,19.175648
2,1998,-13,41,1,26.988544
3,1998,-3,14,1,5.767014
4,1998,3,2,1,-6.155714


## predictions of classifier

In [25]:
# Same two features as in training: SeedDiff and DiffbyRegressor.
X_test = test[['SeedDiff', 'DiffbyRegressor']].to_numpy()
xgc_test_proba = xgc.predict_proba(X_test)
sample_submission['Pred'] = xgc_test_proba[:, 1]  # P(Result == 1)
sample_submission.head()

Unnamed: 0,ID,Pred
0,2018_3110_3113,0.135979
1,2018_3110_3114,0.482808
2,2018_3110_3124,0.004531
3,2018_3110_3125,0.250959
4,2018_3110_3129,0.781655


In [26]:
# Write the XGBoost-only submission; the index is omitted so the file
# contains only the frame's own columns.
xgb_submission_path = "Trying_something_new_stage2_2014.csv"
sample_submission.to_csv(xgb_submission_path, index=False)
print("Done")

Done


In [27]:
# Equal-weight blend of both classifiers on the test features.
X_test = test[['SeedDiff', 'DiffbyRegressor']].to_numpy()
xgc_test_win = xgc.predict_proba(X_test)[:, 1]
lr_test_win = lr.predict_proba(X_test)[:, 1]
sample_submission['Pred'] = 0.5 * xgc_test_win + 0.5 * lr_test_win
sample_submission.head()

Unnamed: 0,ID,Pred
0,2018_3110_3113,0.138128
1,2018_3110_3114,0.485856
2,2018_3110_3124,0.011549
3,2018_3110_3125,0.259705
4,2018_3110_3129,0.779512


In [28]:
# Write the ensemble submission. `index=False` matches the XGBoost-only export:
# without it the file gets an extra unnamed index column next to ID and Pred.
sample_submission.to_csv("Trying_something_new_ensemble_stage2_2014.csv", index=False)
print("Done")

Done


The results of each submission are given in combo.ipynb.