In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,LinearSVR,SVC,SVR
from scipy.stats import norm,skew

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)
# Season cutoff: only tournament games with Season < yr are selected as
# training rows (see the `wins` selection further down)
yr=2014

In [None]:
# City tables
cities = pd.read_csv("WPrelimData2018/WCities_PrelimData2018.csv")
game_cities = pd.read_csv("WPrelimData2018/WGameCities_PrelimData2018.csv")

# Tournament seeds (one seed string per Season/TeamID)
tourney_seeds = pd.read_csv("WDataFiles/WNCAATourneySeeds.csv")

# Regular-season results, compact and detailed
regular_season_results = pd.read_csv("WPrelimData2018/WRegularSeasonCompactResults_PrelimData2018.csv")
detailed_regular_season = pd.read_csv("WPrelimData2018/WRegularSeasonDetailedResults_PrelimData2018.csv")

# Tournament results, compact and detailed
tourney_results = pd.read_csv("WPrelimData2018/WNCAATourneyCompactResults_PrelimData2018.csv")
detailed_tourney = pd.read_csv("WPrelimData2018/WNCAATourneyDetailedResults_PrelimData2018.csv")

# Submission template; its ID column is parsed later to build the test frame
sample_submission = pd.read_csv("WDataFiles/WSampleSubmissionStage1.csv")

# Quick size check of the two compact result tables
for label, frame in (
    ("Regular season results", regular_season_results),
    ("Tourney results", tourney_results),
):
    print(label, frame.shape)

In [None]:
# tourney_results=pd.concat([tourney_results,regular_season_results],axis=0)
# tourney_results.shape

In [None]:
# for extraction of seed and region
def extract_seedint(seed):
    """Return the numeric part of a tournament seed string as an int.

    The first character of ``seed`` is skipped (it is returned separately by
    ``region``) and the next two characters are parsed as the seed number,
    e.g. ``"W01"`` -> 1.  Only those two characters are read, so a seed with
    a trailing letter such as ``"X16a"`` yields 16 instead of raising
    ``ValueError``.
    """
    return int(seed[1:3])
def region(seed):
    """Return the first character of ``seed``, used as the region code."""
    region_code = seed[0]
    return region_code
# Derive the numeric seed and the region code from the raw seed string,
# then discard the raw string column.
seed_strings = tourney_seeds['Seed']
tourney_seeds['seed_int'] = seed_strings.apply(extract_seedint)
tourney_seeds['region'] = seed_strings.apply(region)
tourney_seeds.drop(columns=['Seed'], inplace=True)
tourney_seeds.head(10)

In [None]:
# merging the tournament results
# Attach seed and region to both sides of every tournament game.
# `tourney_seeds` is renamed twice so the same table can be joined once on the
# winner's TeamID and once on the loser's TeamID.
winseeds = tourney_seeds.rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed','region':'WRegion'})
lossseeds = tourney_seeds.rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed','region':'LRegion'})
df_dummy = pd.merge(left=tourney_results, right=winseeds, how='left', on=['Season', 'WTeamID'])
tourney_results= pd.merge(left=df_dummy, right=lossseeds,how='left',on=['Season', 'LTeamID'])
# The left joins leave NaN where no seed row matched; store 0 there.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on `tourney_results.WSeed` acts on an intermediate
# Series, which pandas 2.x warns about and which no longer updates the frame
# under Copy-on-Write.
seed_cols = ['WSeed', 'LSeed']
tourney_results[seed_cols] = tourney_results[seed_cols].fillna(0)
tourney_results.head(10)

In [None]:
# Tournament games played before the cutoff season `yr`
past_games = tourney_results.loc[tourney_results['Season'] < yr]

# One row per game seen from the winner's side: differences are winner - loser
wins = pd.DataFrame({
    'Season': past_games['Season'],
    'SeedDiff': past_games['WSeed'] - past_games['LSeed'],
    'ScoreDiff': past_games['WScore'] - past_games['LScore'],
})
wins['Result'] = 1
wins['Result'] = wins['Result'].astype(int)

# The same games seen from the loser's side: both differences flip sign,
# Season is unchanged and the label becomes 0
losses = wins.copy()
losses[['SeedDiff', 'ScoreDiff']] = -losses[['SeedDiff', 'ScoreDiff']]
losses['Result'] = 0

# Stack both views (original row labels are kept, so each label appears twice)
data = pd.concat([wins, losses], axis=0)
data.head()

In [None]:
# Regression task: the single feature is SeedDiff, the target is ScoreDiff
# (the target is used as-is, without any transform)
X_train = data[['SeedDiff']].values
y_train = data['ScoreDiff'].values

print(X_train.shape, y_train.shape)

In [None]:
# Hold out 10% of the rows for validation (fixed seed).
# Note: the split overwrites X_train / y_train with the training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)
print("For regression ")
print("The training shape is", X_train.shape, " and the label shape is", y_train.shape)
print("The validation shape is", X_val.shape, " and the label shape is", y_val.shape)

My plan is to train a regressor that predicts the score difference from the seed difference; the same regressor can then be used to estimate score differences for the test data.

In [None]:
# Baseline: ordinary least squares on the seed difference.
lr=LinearRegression()
lr.fit(X_train,y_train)
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R^2, higher is better), not an error, so the labels say so.
print("The training R^2 score is",lr.score(X_train,y_train))
print("The validation R^2 score is",lr.score(X_val,y_val))

In [None]:
# Random forest regressor with a fixed seed, scored on both splits
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_train_score = rf.score(X_train, y_train)
rf_val_score = rf.score(X_val, y_val)
print("The training Score is", rf_train_score)
print("The validation Score is", rf_val_score)

In [None]:
# Gradient-boosted regressor with a fixed seed; `xgr` is reused later to
# produce the DiffbyRegressor column for both the training and test frames
xgr = XGBRegressor(random_state=42)
xgr.fit(X_train, y_train)
xgr_train_score = xgr.score(X_train, y_train)
xgr_val_score = xgr.score(X_val, y_val)
print("The training Score is", xgr_train_score)
print("The validation Score is", xgr_val_score)

In [None]:
# Support vector regressor with default settings, scored on both splits
svr = SVR()
svr.fit(X_train, y_train)
svr_train_score = svr.score(X_train, y_train)
svr_val_score = svr.score(X_val, y_val)
print("training Score", svr_train_score)
print("validation score", svr_val_score)

In [None]:
# Ridge regression with default settings, scored on both splits
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_train_score = ridge.score(X_train, y_train)
ridge_val_score = ridge.score(X_val, y_val)
print("The training Score is", ridge_train_score)
print("The validation Score is", ridge_val_score)

In [None]:
# Lasso regression with default settings, scored on both splits
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_train_score = lasso.score(X_train, y_train)
lasso_val_score = lasso.score(X_val, y_val)
print("The training Score is", lasso_train_score)
print("The validation Score is", lasso_val_score)

I will use the XGBoost regressor (`xgr`) for the next step (just a try!).

In [None]:
# Add the regressor's predicted score difference for every row of `data`;
# the regressor's only input is the SeedDiff column
pred_data = data[['SeedDiff']].values
data['DiffbyRegressor'] = xgr.predict(pred_data)
data.head()

I will build a classifier on the data using SeedDiff and the score difference predicted by the regressor (DiffbyRegressor).

In [None]:
# Classification task: features are SeedDiff and the regressor's predicted
# score difference; the label is Result (1 = winner's view, 0 = loser's view)
X_train = data[['SeedDiff', 'DiffbyRegressor']].values
y_train = data['Result'].values
print(X_train.shape, y_train.shape)

In [None]:
# Hold out 10% of the rows for validation (fixed seed).
# Note: the split overwrites X_train / y_train with the training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

print("The training shape is", X_train.shape, " and the label shape is", y_train.shape)
print("The validation shape is", X_val.shape, " and the label shape is", y_val.shape)

# Classifiers

In [None]:
# Logistic regression with default settings; `lr` is rebound here from the
# earlier linear regressor. Log loss is computed on the probability of class 1.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train_proba = lr.predict_proba(X_train)[:, 1]
lr_val_proba = lr.predict_proba(X_val)[:, 1]
print("The training log loss is", log_loss(y_train, lr_train_proba))
print("The validation log los is", log_loss(y_val, lr_val_proba))

In [None]:
# Gradient-boosted classifier (100 trees, fixed seed); log loss is computed
# on the probability of class 1
xgc = XGBClassifier(n_estimators=100, random_state=42)
xgc.fit(X_train, y_train)
xgc_train_proba = xgc.predict_proba(X_train)[:, 1]
xgc_val_proba = xgc.predict_proba(X_val)[:, 1]
print("The training log loss is", log_loss(y_train, xgc_train_proba))
print("The validation log los is", log_loss(y_val, xgc_val_proba))

In [None]:
# Equal-weight average of the two classifiers' class-1 probabilities
ensemble_train_proba = 0.5*xgc.predict_proba(X_train)[:,1] + 0.5*lr.predict_proba(X_train)[:,1]
ensemble_val_proba = 0.5*xgc.predict_proba(X_val)[:,1] + 0.5*lr.predict_proba(X_val)[:,1]
print("The training loss of the ensemble is", log_loss(y_train, ensemble_train_proba))
print("The validation loss of the ensemble is", log_loss(y_val, ensemble_val_proba))

## TEST PREPROCESSING

In [None]:
# Build the test frame from the submission IDs.
# Each ID is three integers joined by "_" (season, first team, second team).
# They are split in one vectorised pass rather than by growing a DataFrame
# one row at a time, which is slow and produces object-typed columns.
# The two teams are stored under WTeamID / LTeamID only so the seed tables
# (`winseeds` / `lossseeds`) can be merged on those names afterwards.
test = sample_submission['ID'].str.split("_", expand=True).astype(int)
# Raises ValueError if an ID does not split into exactly three parts
test.columns = ['Season', 'WTeamID', 'LTeamID']
test.head()

In [None]:
# Look up the seed of each team in the pairing (left joins keep every pairing)
df_dummy = test.merge(winseeds, how='left', on=['Season', 'WTeamID'])
test = df_dummy.merge(lossseeds, how='left', on=['Season', 'LTeamID'])

# Regions are not used as features; keep only the seeds and their difference
test = test.drop(columns=['WRegion', 'LRegion'])
test['SeedDiff'] = test['WSeed'] - test['LSeed']
test.head()

# predictions of regressor

In [None]:
# Predicted score difference for every test pairing, from SeedDiff alone
X_test = test[['SeedDiff']].values
test['DiffbyRegressor'] = xgr.predict(X_test)
test.head()

In [None]:
# Display the training frame again — presumably to compare its columns with
# those of `test`; TODO confirm this cell is still needed
data.head()

## predictions of classifier

In [None]:
# Classifier inputs use the same two features as in training:
# SeedDiff and the regressor's predicted score difference
X_test = test[['SeedDiff', 'DiffbyRegressor']].values
xgc_test_proba = xgc.predict_proba(X_test)[:, 1]
sample_submission['Pred'] = xgc_test_proba
sample_submission.head()

In [None]:
# Write the XGBoost-only submission without the DataFrame index
submission_path = "Trying_something_new.csv"
sample_submission.to_csv(submission_path, index=False)
print("Done")

In [None]:
# Equal-weight average of the XGBoost and logistic-regression class-1
# probabilities; this overwrites the Pred column written by the previous cell
X_test = test[['SeedDiff', 'DiffbyRegressor']].values
xgc_test_proba = xgc.predict_proba(X_test)[:, 1]
lr_test_proba = lr.predict_proba(X_test)[:, 1]
sample_submission['Pred'] = 0.5*xgc_test_proba + 0.5*lr_test_proba
sample_submission.head()

In [None]:
# Write the ensemble submission. index=False matches the first submission
# file; without it to_csv adds the DataFrame index as an extra unnamed
# leading column.
sample_submission.to_csv("Trying_something_new_ensemble.csv",index=False)
print("Done")

The result of each submission is given in combo.ipynb