In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import random
from sklearn.externals.six import StringIO
import pydotplus
from sklearn.tree import export_graphviz
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from graphviz import Source
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RandomizedLogisticRegression

In [2]:
# Load the `abt` DataFrame; every later cell filters it by its 'Season' column.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot be re-run elsewhere without editing it. Consider a path
# relative to a configurable data directory.
abt = pd.read_csv('C:/Users/Mark/kenpom/data/abt.csv')

In [3]:
# Labels come from the 'Tourney' column; features are every column from
# position 6 onward.
# NOTE(review): assumes the first six columns are identifiers/non-features -- TODO confirm.
y = abt['Tourney']
x = abt.iloc[:, 6:]

# Build each season mask once on the full frame and apply it to both x and y.
# (The earlier two-step filter re-applied a full-length mask to an already
# filtered object; a single combined mask selects the same rows directly.)
in_model_window = (abt['Season'] >= 2009) & (abt['Season'] < 2016)
is_final_season = abt['Season'] == 2016

# Seasons 2009-2015: the rows used for model comparison.
ynew = y.loc[in_model_window]
xnew = x.loc[in_model_window]

# Season 2016: the rows reserved for the final evaluation.
yfinal = y.loc[is_final_season]
xfinal = x.loc[is_final_season]

# Column-vector (n, 1) label arrays. `Series.reshape` was deprecated and later
# removed from pandas, so reshape the underlying NumPy array instead.
yclf = y.values.reshape(-1, 1)
ynewclf = ynew.values.reshape(-1, 1)
yfinalclf = yfinal.values.reshape(-1, 1)

# Plain NumPy copies of the feature frames.
xclf = np.array(x)
xnewclf = np.array(xnew)
xfinalclf = np.array(xfinal)

# Seasons eligible to be drawn as hold-out seasons during model comparison.
indexseason = [2009, 2010, 2011, 2012, 2013, 2014, 2015]

In [4]:
def ensemble1(xtr,ytr,xte,yte):
    """Fit four classifiers and two weighted voting ensembles, then print scores.

    All models are trained on the untransformed features ``xtr``/``ytr`` and
    evaluated with ``score`` on ``xte``/``yte``.

    Parameters
    ----------
    xtr, ytr : training features and labels.
    xte, yte : test features and labels.

    Returns
    -------
    None. One line per model is printed under the heading 'Without RLR:'.
    """
    # Base learners, listed in the same order as the vote weights below.
    # NOTE(review): the decision tree is created without a random_state, so its
    # score (and possibly the ensembles') may differ between otherwise
    # identical runs.
    members = [
        ('lr', LogisticRegression(random_state=1)),
        ('rf', RandomForestClassifier(random_state=1)),
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier()),
    ]
    vote_weights = [4, 4, 1, 2]

    # Fit order is kept as hard-vote ensemble, soft-vote ensemble, then the
    # individual models, because an unseeded estimator is involved.
    hard_vote = VotingClassifier(
        estimators=list(members), voting='hard', weights=list(vote_weights))
    hard_vote = hard_vote.fit(xtr, ytr)

    soft_vote = VotingClassifier(
        estimators=list(members), voting='soft', weights=list(vote_weights))
    soft_vote = soft_vote.fit(xtr, ytr)

    for _, model in members:
        model.fit(xtr, ytr)

    print('Without RLR:')
    labels = ('CLF 1: ', 'CLF 2: ', 'CLF 3: ', 'CLF 4: ')
    for label, (_, model) in zip(labels, members):
        print(label, model.score(xte, yte))

    print('ECLF 1: ', hard_vote.score(xte, yte))
    print('ECLF 2: ', soft_vote.score(xte, yte))

In [5]:
def ensemble2(xtr,ytr,xte,yte):
    """Same comparison as ``ensemble1``, but on RLR-transformed features.

    A ``RandomizedLogisticRegression`` transformer is fitted on the training
    split only; both splits are passed through it before any model is trained
    or scored.

    Parameters
    ----------
    xtr, ytr : training features and labels.
    xte, yte : test features and labels.

    Returns
    -------
    None. One line per model is printed under the heading 'With RLR:'.
    """
    # NOTE(review): RandomizedLogisticRegression is created without a
    # random_state, and it is deprecated/removed in newer scikit-learn
    # releases -- confirm against the installed version.
    selector = RandomizedLogisticRegression()
    train_selected = selector.fit_transform(xtr, ytr)
    test_selected = selector.transform(xte)

    # Base learners, listed in the same order as the vote weights below.
    members = [
        ('lr', LogisticRegression(random_state=1)),
        ('rf', RandomForestClassifier(random_state=1)),
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier()),
    ]
    vote_weights = [4, 4, 1, 2]

    # Fit order is kept as hard-vote ensemble, soft-vote ensemble, then the
    # individual models, because unseeded estimators are involved.
    hard_vote = VotingClassifier(
        estimators=list(members), voting='hard', weights=list(vote_weights))
    hard_vote = hard_vote.fit(train_selected, ytr)

    soft_vote = VotingClassifier(
        estimators=list(members), voting='soft', weights=list(vote_weights))
    soft_vote = soft_vote.fit(train_selected, ytr)

    for _, model in members:
        model.fit(train_selected, ytr)

    print('With RLR:')
    labels = ('CLF 1: ', 'CLF 2: ', 'CLF 3: ', 'CLF 4: ')
    for label, (_, model) in zip(labels, members):
        print(label, model.score(test_selected, yte))

    print('ECLF 1: ', hard_vote.score(test_selected, yte))
    print('ECLF 2: ', soft_vote.score(test_selected, yte))

In [6]:
def comparemodels(x,y,index):
    """Hold out two randomly chosen seasons and compare both ensemble variants.

    Two seasons are drawn from ``index`` with ``random.sample``; the remaining
    seasons in ``index`` form the training set. Rows are selected through the
    module-level ``abt['Season']`` column, the drawn seasons are printed, and
    ``ensemble1`` and ``ensemble2`` are then run on the same split.

    Parameters
    ----------
    x : DataFrame of features.
        NOTE(review): assumed to share ``abt``'s row index so the boolean
        masks line up -- confirm against the caller.
    y : Series of labels, indexed like ``x``.
    index : sequence of candidate seasons (at least two entries).

    Returns
    -------
    None. Results are printed by the ensemble helpers.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``index`` has fewer than two entries.
    """
    testindex = random.sample(index, 2)

    # Build the training seasons from the `index` argument, not from the global
    # `indexseason`, so the function honours whatever seasons the caller passes.
    # Filtering (rather than list.remove) also drops every occurrence of a test
    # season if `index` happens to contain duplicates.
    trainindex = [season for season in index if season not in testindex]

    # Compute each row mask once and reuse it for features and labels.
    is_test = abt['Season'].isin(testindex)
    is_train = abt['Season'].isin(trainindex)

    xtest = np.array(x.loc[is_test])
    xtrain = np.array(x.loc[is_train])
    ytest = np.array(y.loc[is_test])
    ytrain = np.array(y.loc[is_train])

    print(testindex)
    ensemble1(xtrain,ytrain,xtest,ytest)
    ensemble2(xtrain,ytrain,xtest,ytest)

In [7]:
# Random season-level hold-out: two seasons from indexseason are drawn as the
# test set and printed first. No seed is set in the visible cells, so each run
# draws a fresh pair.
comparemodels(x,y,indexseason)

[2014, 2012]
Without RLR:
CLF 1:  0.899425287356
CLF 2:  0.890804597701
CLF 3:  0.830459770115
CLF 4:  0.844827586207
ECLF 1:  0.902298850575
ECLF 2:  0.89224137931
With RLR:
CLF 1:  0.916666666667
CLF 2:  0.906609195402
CLF 3:  0.853448275862
CLF 4:  0.859195402299
ECLF 1:  0.908045977011
ECLF 2:  0.905172413793


In [8]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2013, 2010]
Without RLR:
CLF 1:  0.914985590778
CLF 2:  0.890489913545
CLF 3:  0.829971181556
CLF 4:  0.871757925072
ECLF 1:  0.907780979827
ECLF 2:  0.910662824207
With RLR:
CLF 1:  0.912103746398
CLF 2:  0.917867435159
CLF 3:  0.835734870317
CLF 4:  0.874639769452
ECLF 1:  0.914985590778
ECLF 2:  0.907780979827


In [9]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2009, 2015]
Without RLR:
CLF 1:  0.890647482014
CLF 2:  0.873381294964
CLF 3:  0.814388489209
CLF 4:  0.856115107914
ECLF 1:  0.896402877698
ECLF 2:  0.887769784173
With RLR:
CLF 1:  0.889208633094
CLF 2:  0.89928057554
CLF 3:  0.854676258993
CLF 4:  0.869064748201
ECLF 1:  0.902158273381
ECLF 2:  0.90071942446


In [10]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2015, 2014]
Without RLR:
CLF 1:  0.881766381766
CLF 2:  0.877492877493
CLF 3:  0.820512820513
CLF 4:  0.823361823362
ECLF 1:  0.887464387464
ECLF 2:  0.89886039886
With RLR:
CLF 1:  0.904558404558
CLF 2:  0.894586894587
CLF 3:  0.863247863248
CLF 4:  0.84188034188
ECLF 1:  0.913105413105
ECLF 2:  0.903133903134


In [11]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2015, 2013]
Without RLR:
CLF 1:  0.893982808023
CLF 2:  0.873925501433
CLF 3:  0.822349570201
CLF 4:  0.873925501433
ECLF 1:  0.893982808023
ECLF 2:  0.904011461318
With RLR:
CLF 1:  0.898280802292
CLF 2:  0.9111747851
CLF 3:  0.842406876791
CLF 4:  0.869627507163
ECLF 1:  0.905444126074
ECLF 2:  0.908309455587


In [12]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2014, 2013]
Without RLR:
CLF 1:  0.909742120344
CLF 2:  0.89111747851
CLF 3:  0.833810888252
CLF 4:  0.869627507163
ECLF 1:  0.908309455587
ECLF 2:  0.905444126074
With RLR:
CLF 1:  0.901146131805
CLF 2:  0.892550143266
CLF 3:  0.855300859599
CLF 4:  0.835243553009
ECLF 1:  0.901146131805
ECLF 2:  0.895415472779


In [13]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2011, 2013]
Without RLR:
CLF 1:  0.893063583815
CLF 2:  0.906069364162
CLF 3:  0.830924855491
CLF 4:  0.859826589595
ECLF 1:  0.910404624277
ECLF 2:  0.908959537572
With RLR:
CLF 1:  0.901734104046
CLF 2:  0.897398843931
CLF 3:  0.846820809249
CLF 4:  0.865606936416
ECLF 1:  0.904624277457
ECLF 2:  0.900289017341


In [14]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2009, 2014]
Without RLR:
CLF 1:  0.871942446043
CLF 2:  0.879136690647
CLF 3:  0.821582733813
CLF 4:  0.844604316547
ECLF 1:  0.889208633094
ECLF 2:  0.884892086331
With RLR:
CLF 1:  0.893525179856
CLF 2:  0.884892086331
CLF 3:  0.85035971223
CLF 4:  0.857553956835
ECLF 1:  0.892086330935
ECLF 2:  0.886330935252


In [15]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2013, 2009]
Without RLR:
CLF 1:  0.884225759768
CLF 2:  0.895803183792
CLF 3:  0.819102749638
CLF 4:  0.862518089725
ECLF 1:  0.897250361795
ECLF 2:  0.897250361795
With RLR:
CLF 1:  0.888567293777
CLF 2:  0.894356005789
CLF 3:  0.839363241679
CLF 4:  0.848046309696
ECLF 1:  0.894356005789
ECLF 2:  0.89001447178


In [16]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2014, 2012]
Without RLR:
CLF 1:  0.899425287356
CLF 2:  0.890804597701
CLF 3:  0.830459770115
CLF 4:  0.847701149425
ECLF 1:  0.908045977011
ECLF 2:  0.89224137931
With RLR:
CLF 1:  0.918103448276
CLF 2:  0.902298850575
CLF 3:  0.853448275862
CLF 4:  0.826149425287
ECLF 1:  0.913793103448
ECLF 2:  0.916666666667


In [17]:
# Repeat the comparison with a fresh random pair of hold-out seasons (printed first).
comparemodels(x,y,indexseason)

[2012, 2010]
Without RLR:
CLF 1:  0.900289017341
CLF 2:  0.893063583815
CLF 3:  0.825144508671
CLF 4:  0.859826589595
ECLF 1:  0.911849710983
ECLF 2:  0.914739884393
With RLR:
CLF 1:  0.923410404624
CLF 2:  0.914739884393
CLF 3:  0.833815028902
CLF 4:  0.868497109827
ECLF 1:  0.921965317919
ECLF 2:  0.910404624277


In [18]:
# Rows whose season is listed in indexseason form the training data for the
# final evaluation; the mask is computed once and applied to both x and y.
in_indexed_seasons = abt['Season'].isin(indexseason)
xvalidation = np.array(x.loc[in_indexed_seasons])
yvalidation = np.array(y.loc[in_indexed_seasons])

In [20]:
# Final check with the RLR variant: train on every season in indexseason and
# score on the 2016 rows (xfinalclf / yfinalclf).
ensemble2(xvalidation,yvalidation,xfinalclf,yfinalclf)

With RLR:
CLF 1:  0.88603988604
CLF 2:  0.88603988604
CLF 3:  0.840455840456
CLF 4:  0.834757834758
ECLF 1:  0.894586894587
ECLF 2:  0.888888888889
