In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,LinearSVR,SVC,SVR
from scipy.stats import norm,skew

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None
# Cut-off season used below when filtering the detailed results
yr = 2014

### Detailed results cover 2010 to 2018, but I am only using 2010 to 2013

These are the full forms of the column abbreviations:

WFGM - field goals made (by the winning team)

WFGA - field goals attempted (by the winning team)

WFGM3 - three pointers made (by the winning team)

WFGA3 - three pointers attempted (by the winning team)

WFTM - free throws made (by the winning team)

WFTA - free throws attempted (by the winning team)

WOR - offensive rebounds (pulled by the winning team)

WDR - defensive rebounds (pulled by the winning team)

WAst - assists (by the winning team)

WTO - turnovers committed (by the winning team)

WStl - steals (accomplished by the winning team)

WBlk - blocks (accomplished by the winning team)

WPF - personal fouls committed (by the winning team)

In [None]:
# --- Load the raw competition files ---------------------------------------
cities = pd.read_csv("WPrelimData2018/WCities_PrelimData2018.csv")
game_cities = pd.read_csv("WPrelimData2018/WGameCities_PrelimData2018.csv")

tourney_seeds = pd.read_csv("WDataFiles/WNCAATourneySeeds.csv")

regular_season_results = pd.read_csv("WPrelimData2018/WRegularSeasonCompactResults_PrelimData2018.csv")
detailed_regular_season = pd.read_csv("WPrelimData2018/WRegularSeasonDetailedResults_PrelimData2018.csv")

tourney_results = pd.read_csv("WPrelimData2018/WNCAATourneyCompactResults_PrelimData2018.csv")
detailed_tourney = pd.read_csv("WPrelimData2018/WNCAATourneyDetailedResults_PrelimData2018.csv")

# NOTE: season filtering of the two *compact* result tables is currently
# disabled; only the detailed tables are restricted by `yr`.

# Keep regular-season games up to and including season `yr`, and tournament
# games from seasons strictly before `yr`.
# .copy() makes each filtered frame independent of the frame it was sliced
# from, so adding columns to it later does not raise SettingWithCopyWarning.
detailed_regular_season = detailed_regular_season.loc[detailed_regular_season['Season'] <= yr].copy()
detailed_tourney = detailed_tourney.loc[detailed_tourney['Season'] < yr].copy()

sample_submission = pd.read_csv("WDataFiles/WSampleSubmissionStage1.csv")

print("Regular season results", regular_season_results.shape, detailed_regular_season.shape)
print("Tourney results", tourney_results.shape, detailed_tourney.shape)

In [None]:
# Tag every game with its origin before stacking the two tables:
#   Type == 1 -> tournament game, Type == 0 -> regular-season game
detailed_tourney['Type'] = 1
detailed_regular_season['Type'] = 0

# From here on `detailed_tourney` holds BOTH kinds of games
# (tournament rows first, then regular-season rows, with a fresh 0..n-1 index).
detailed_tourney = pd.concat(
    [detailed_tourney, detailed_regular_season],
    ignore_index=True,
)
print(detailed_tourney.shape)
detailed_tourney.head()

In [None]:
# adding seeds to tournament results
def extract_seedint(seed):
    """Return the numeric part of a tournament seed code as an int.

    The first character of `seed` is skipped (it is what `region` returns)
    and the next two characters are parsed, e.g. 'W01' -> 1, 'X16' -> 16.
    Parsing only two characters means a trailing play-in marker such as the
    'a' in 'W16a' no longer raises ValueError.

    NOTE(review): assumes the seed number occupies at most two characters,
    as in the Kaggle seed files -- confirm if other formats are expected.

    Raises:
        ValueError: if the characters after the first are not an integer.
    """
    return int(seed[1:3])
def region(seed):
    """Return the first character of a seed code (e.g. 'W' for 'W01')."""
    region_code = seed[0]
    return region_code
# Split the raw seed code into its numeric seed and its region letter,
# then discard the raw code.
tourney_seeds['seed_int'] = tourney_seeds['Seed'].map(extract_seedint)
tourney_seeds['region'] = tourney_seeds['Seed'].map(region)
tourney_seeds.drop(columns=['Seed'], inplace=True)

# Two views of the seed table, keyed for the winning and the losing team.
winseeds = tourney_seeds.rename(
    columns={'TeamID': 'WTeamID', 'seed_int': 'WSeed', 'region': 'WRegion'}
)
lossseeds = tourney_seeds.rename(
    columns={'TeamID': 'LTeamID', 'seed_int': 'LSeed', 'region': 'LRegion'}
)

# Left-join so every tournament result is kept even when a seed is missing.
df_dummy = tourney_results.merge(winseeds, how='left', on=['Season', 'WTeamID'])
tourney_results = df_dummy.merge(lossseeds, how='left', on=['Season', 'LTeamID'])

In [None]:
# Per-game possessions for each side, estimated from the box score as
#   FGA - OR + TO + 0.475 * FTA
# NOTE: the author found the possession formula given in the paper to be
# wrong; this is the version used instead.
detailed_tourney['WPossessions'] = (
    detailed_tourney['WFGA'] - detailed_tourney['WOR']
    + detailed_tourney['WTO'] + 0.475 * detailed_tourney['WFTA']
)
detailed_tourney['LPossessions'] = (
    detailed_tourney['LFGA'] - detailed_tourney['LOR']
    + detailed_tourney['LTO'] + 0.475 * detailed_tourney['LFTA']
)

# In a game the possessions of the two teams are nearly the same (+2 or -2),
# so the average of both is used as the game's possession count.
# Details: https://kenpom.com/blog/stats-explained/
detailed_tourney['Possessions'] = (
    detailed_tourney['WPossessions'] * 0.5 + detailed_tourney['LPossessions'] * 0.5
)

# Offensive / defensive efficiency = points scored / allowed per 100 possessions.
# One team's defensive efficiency is the opponent's offensive efficiency.
detailed_tourney['WOE'] = (detailed_tourney['WScore'] * 100) / detailed_tourney['Possessions']
detailed_tourney['WDE'] = (detailed_tourney['LScore'] * 100) / detailed_tourney['Possessions']

detailed_tourney['LOE'] = (detailed_tourney['LScore'] * 100) / detailed_tourney['Possessions']
detailed_tourney['LDE'] = (detailed_tourney['WScore'] * 100) / detailed_tourney['Possessions']

# Attach the seed of the winning and of the losing team (left join keeps
# games whose teams have no seed for that season).
detailed_tourney = (
    detailed_tourney
    .merge(winseeds, how='left', on=['Season', 'WTeamID'])
    .merge(lossseeds, how='left', on=['Season', 'LTeamID'])
)

# Teams without a seed get seed 0. The result is assigned back instead of
# calling fillna(inplace=True) on a column selection, which warns on recent
# pandas and has no effect when Copy-on-Write is enabled.
detailed_tourney[['WSeed', 'LSeed']] = detailed_tourney[['WSeed', 'LSeed']].fillna(0)

# Seed difference from the winner's point of view
detailed_tourney['SeedDiff'] = detailed_tourney['WSeed'] - detailed_tourney['LSeed']

# The region letters are not used as features
detailed_tourney = detailed_tourney.drop(columns=['WRegion', 'LRegion'])

print(detailed_tourney.shape)
detailed_tourney.head()

I will calculate the national offensive and defensive efficiency.

Maybe I will use this to find the adjusted offensive and defensive efficiency, but currently I don't know any formula for calculating the adjusted efficiency.

In [None]:
# Average offensive (OE) and defensive (DE) efficiency of every team over all
# of its games in `detailed_tourney`, whether the team appears as the winner
# (W* columns) or as the loser (L* columns). These raw averages are the
# starting point for the adjusted efficiencies, which are not computed here.

# Per-team sums of the per-game efficiencies, split by games won / games lost.
woe = detailed_tourney.groupby('WTeamID')['WOE'].sum().rename_axis('TeamID').reset_index(name='wOE')
wde = detailed_tourney.groupby('WTeamID')['WDE'].sum().rename_axis('TeamID').reset_index(name='wDE')

loe = detailed_tourney.groupby('LTeamID')['LOE'].sum().rename_axis('TeamID').reset_index(name='lOE')
lde = detailed_tourney.groupby('LTeamID')['LDE'].sum().rename_axis('TeamID').reset_index(name='lDE')

# Outer join keeps teams that only ever won or only ever lost; the side they
# are missing from contributes 0 to the total.
oe = pd.merge(left=woe, right=loe, how='outer', on=['TeamID']).fillna(0)
de = pd.merge(left=wde, right=lde, how='outer', on=['TeamID']).fillna(0)

oe['OE'] = oe['wOE'] + oe['lOE']
oe = oe.drop(columns=['wOE', 'lOE'])

de['DE'] = de['wDE'] + de['lDE']
de = de.drop(columns=['wDE', 'lDE'])

# Number of games per team = games won + games lost
wgames = detailed_tourney['WTeamID'].value_counts().rename_axis('TeamID').reset_index(name='wcount')
lgames = detailed_tourney['LTeamID'].value_counts().rename_axis('TeamID').reset_index(name='lcount')

games = pd.merge(left=wgames, right=lgames, how='outer', on=['TeamID'])
# Assign the filled columns back: fillna(inplace=True) on a column selection
# warns on recent pandas and is a no-op under Copy-on-Write, which would leave
# NaN counts (and therefore NaN averages) for teams missing on one side.
games[['wcount', 'lcount']] = games[['wcount', 'lcount']].fillna(0)
games['number'] = games['wcount'] + games['lcount']
games = games.drop(columns=['wcount', 'lcount'])

# Turn the totals into per-game averages
oe = pd.merge(left=oe, right=games, how='outer', on=['TeamID'])
de = pd.merge(left=de, right=games, how='outer', on=['TeamID'])

oe['OE'] = oe['OE'] / oe['number']
de['DE'] = de['DE'] / de['number']

oe = oe.drop(columns=['number'])
de = de.drop(columns=['number'])

print(oe.shape, de.shape)

# National averages: unweighted mean of the team averages
national_oe_average = oe['OE'].sum() / oe['OE'].shape[0]
national_de_average = de['DE'].sum() / de['DE'].shape[0]
print("The national offensive average is", national_oe_average)
print("The national defensive average is", national_de_average)

## TRAIN DATA PREPROCESSING

In [None]:
def change(s):
    """Encode a game-location flag as a number.

    'N' becomes 0, 'H' becomes 1, and every other value becomes -1.
    """
    if s == 'N':
        return 0
    if s == 'H':
        return 1
    return -1

    

# Attach every team's average offensive / defensive efficiency to each game:
# once for the winning team (W*_avg) and once for the losing team (L*_avg).
detailed_tourney = (
    detailed_tourney
    .merge(oe.rename(columns={'TeamID': 'WTeamID', 'OE': 'WOE_avg'}), how='left', on=['WTeamID'])
    .merge(de.rename(columns={'TeamID': 'WTeamID', 'DE': 'WDE_avg'}), how='left', on=['WTeamID'])
    .merge(oe.rename(columns={'TeamID': 'LTeamID', 'OE': 'LOE_avg'}), how='left', on=['LTeamID'])
    .merge(de.rename(columns={'TeamID': 'LTeamID', 'DE': 'LDE_avg'}), how='left', on=['LTeamID'])
)

# Efficiency margin = average offensive minus average defensive efficiency
detailed_tourney['WEM'] = detailed_tourney['WOE_avg'] - detailed_tourney['WDE_avg']
detailed_tourney['LEM'] = detailed_tourney['LOE_avg'] - detailed_tourney['LDE_avg']

# TODO: adjusted efficiency (https://kenpom.com/blog/ratings-glossary/) is not
# implemented yet. A dangling `detailed_tourney['']` lookup used to sit here;
# it raised KeyError and stopped the cell, so it has been removed.

# Keep only the model inputs. .copy() detaches the selection so the column
# assignments below do not raise SettingWithCopyWarning.
detailed_tourney = detailed_tourney.loc[
    :, ['WLoc', 'WOE_avg', 'WDE_avg', 'LOE_avg', 'LDE_avg', 'Type', 'SeedDiff', 'WEM', 'LEM']
].copy()
detailed_tourney['WLoc'] = detailed_tourney['WLoc'].apply(change)
# Every original row is written from the winner's side -> label 1
detailed_tourney['Result'] = 1

# Mirror every game from the loser's side to create the label-0 examples:
# swap the W*/L* features and flip the sign of the location and seed difference.
temp = pd.DataFrame({
    'WLoc': -detailed_tourney['WLoc'],
    'WOE_avg': detailed_tourney['LOE_avg'],
    'WDE_avg': detailed_tourney['LDE_avg'],
    'LOE_avg': detailed_tourney['WOE_avg'],
    'LDE_avg': detailed_tourney['WDE_avg'],
    'Type': detailed_tourney['Type'],
    'SeedDiff': -detailed_tourney['SeedDiff'],
    'WEM': detailed_tourney['LEM'],
    'LEM': detailed_tourney['WEM'],
    'Result': 0,
})

# Full training table: original rows followed by their mirrored copies.
# (To train on tournament games only, filter on games['Type'] == 1 here.)
games = pd.concat([detailed_tourney, temp], axis=0, ignore_index=True)
print(games.shape)
games.head()

## TEST PREPROCESSING

In [None]:
def season_of(s):
    """Return the first field of a 'Season_Team1_Team2' ID string as an int.

    Deliberately NOT named `yr`: a function of that name would overwrite the
    `yr` cut-off season defined at the top of the notebook, and re-running the
    data-loading cell afterwards would compare `Season` against a function.

    Raises:
        ValueError: if `s` does not consist of exactly three '_'-separated integers.
    """
    season, _team1, _team2 = map(int, s.split('_'))
    return season


def t1(s):
    """Return the second field (first team) of a 'Season_Team1_Team2' ID as an int."""
    _season, team1, _team2 = map(int, s.split('_'))
    return team1


def t2(s):
    """Return the third field (second team) of a 'Season_Team1_Team2' ID as an int."""
    _season, _team1, team2 = map(int, s.split('_'))
    return team2


# Split the submission ID into its parts. The first / second team of the ID
# are stored under the W* / L* column names so that the feature columns carry
# the same names as in the training table; they do not denote a known winner.
sample_submission['Season'] = sample_submission['ID'].apply(season_of)
sample_submission['WTeamID'] = sample_submission['ID'].apply(t1)
sample_submission['LTeamID'] = sample_submission['ID'].apply(t2)

# Attach the average offensive / defensive efficiency of both teams
sample_submission = (
    sample_submission
    .merge(oe.rename(columns={'TeamID': 'WTeamID', 'OE': 'WOE_avg'}), how='left', on=['WTeamID'])
    .merge(de.rename(columns={'TeamID': 'WTeamID', 'DE': 'WDE_avg'}), how='left', on=['WTeamID'])
    .merge(oe.rename(columns={'TeamID': 'LTeamID', 'OE': 'LOE_avg'}), how='left', on=['LTeamID'])
    .merge(de.rename(columns={'TeamID': 'LTeamID', 'DE': 'LDE_avg'}), how='left', on=['LTeamID'])
)

# Efficiency margin = average offensive minus average defensive efficiency
sample_submission['WEM'] = sample_submission['WOE_avg'] - sample_submission['WDE_avg']
sample_submission['LEM'] = sample_submission['LOE_avg'] - sample_submission['LDE_avg']

# Attach the seeds of both teams for the season of the matchup
sample_submission = (
    sample_submission
    .merge(winseeds, how='left', on=['Season', 'WTeamID'])
    .merge(lossseeds, how='left', on=['Season', 'LTeamID'])
)

# Teams without a seed get seed 0. Assigned back rather than using
# fillna(inplace=True) on a column selection, which warns on recent pandas
# and has no effect when Copy-on-Write is enabled.
sample_submission[['WSeed', 'LSeed']] = sample_submission[['WSeed', 'LSeed']].fillna(0)

sample_submission['SeedDiff'] = sample_submission['WSeed'] - sample_submission['LSeed']
sample_submission = sample_submission.drop(columns=['WRegion', 'LRegion'])

# Constant features for every test row: location code 0 (what `change` returns
# for 'N') and game type 1 (the tournament label used in the training data).
sample_submission['WLoc'] = 0
sample_submission['Type'] = 1

sample_submission.head()

In [None]:
# Select the same eight feature columns, in the same order, as the training
# matrix X. 'WEM' and 'LEM' were previously missing here, so the fitted models
# (trained on 8 columns) were handed only 6 and prediction failed.
X_test = sample_submission.loc[
    :, ['WLoc', 'WOE_avg', 'WDE_avg', 'LOE_avg', 'LDE_avg', 'SeedDiff', 'WEM', 'LEM']
].values

## TRAINING

In [None]:
# Feature matrix and label vector taken from the combined games table
X = games[['WLoc', 'WOE_avg', 'WDE_avg', 'LOE_avg', 'LDE_avg', 'SeedDiff', 'WEM', 'LEM']].to_numpy()
y = games['Result'].to_numpy()

# Hold out 10% of the rows for validation (shuffled, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# Report the size of every array involved
print("Training Shape", X_train.shape)
print("Training Label Shape", y_train.shape)
print("Validation Shape", X_val.shape)
print("Validation Label Shape", y_val.shape)
print("Test Shape", X_test.shape)

## LOGISTIC REGRESSION

In [None]:
# Candidate values for the logistic-regression parameter C
param = {
    'C': [0.001, 0.003, 0.006, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 1, 3, 6, 10],
}
lr = LogisticRegression()

# 5-fold cross-validated search, scored by (negative) log loss
gridsearch = GridSearchCV(estimator=lr, param_grid=param, scoring='neg_log_loss', cv=5)
gridsearch.fit(X_train, y_train)
print("The Best C is", gridsearch.best_params_['C'])

In [None]:
# Refit a logistic regression with the C selected by the grid search
lr = LogisticRegression(C=gridsearch.best_params_['C'])
lr.fit(X_train, y_train)

# Log loss on the probability predicted for class 1
print(
    "The Training Log loss is",
    log_loss(y_train, lr.predict_proba(X_train)[:, 1]),
)
print(
    "The Validation Log loss is",
    log_loss(y_val, lr.predict_proba(X_val)[:, 1]),
)

In [None]:
# Gradient-boosted classifier with the library's default settings
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Log loss on the probability predicted for class 1
print(
    "The Training Log loss is",
    log_loss(y_train, xgb.predict_proba(X_train)[:, 1]),
)
print(
    "The Validation Log loss is",
    log_loss(y_val, xgb.predict_proba(X_val)[:, 1]),
)

In [None]:
# Predicted probability of class 1 for every row of the submission file
sample_submission['Pred'] = xgb.predict_proba(X_test)[:, 1]

# The submission itself only needs the ID and the prediction
ans = sample_submission[['ID', 'Pred']]
ans.head()

In [None]:
# Last ten rows of the 2014 tournament results (seed columns included)
tourney_results[tourney_results['Season'].eq(2014)].tail(10)

In [None]:
# How often each predicted probability occurs
sample_submission['Pred'].value_counts()

In [None]:
# Write the ID / Pred pairs to disk without the DataFrame index
ans.to_csv(path_or_buf="dont_trust.csv", index=False)
print("Done")