In [1]:
import pandas as pd
from sklearn import ensemble
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

# Load the tournament data. The last four columns are split off as outcome
# metadata (this notebook reads 'result' and 'Season' from them); every column
# before them is used as a predictor.
df = pd.read_csv('NCAA_Tourney_2002-2016_1.csv')
features = df.iloc[:, :-4]
results = df.iloc[:, -4:]

X = features
y = results.loc[:, 'result']

# Empty placeholders: this cell never fills a validation split.
X_valid = pd.DataFrame()
y_valid = pd.DataFrame()

# Season-based hold-out: 2002-2013 for training, 2014-2016 for testing.
train_seasons = range(2002, 2014)
test_seasons = range(2014, 2017)

# Build each split with a single concat over the per-season slices instead of
# growing a frame inside a loop. Rows stay grouped season by season, exactly as
# the incremental version ordered them, but nothing is concatenated onto an
# empty DataFrame (quadratic, and deprecated behaviour in recent pandas).
# The targets are now plain Series; `.values.ravel()` works on them unchanged.
X_train = pd.concat([X.loc[results.Season == year] for year in train_seasons])
y_train = pd.concat([y.loc[results.Season == year] for year in train_seasons])

X_test = pd.concat([X.loc[results.Season == year] for year in test_seasons])
y_test = pd.concat([y.loc[results.Season == year] for year in test_seasons])
    
# Smaller alternative configuration; it is defined here but not passed to the
# classifier fitted below.
paramsGB = dict(n_estimators=10, max_depth=5, min_samples_split=2, loss='deviance')

# Configuration actually used: 1000 boosting stages of small trees (at most
# four leaves each, no explicit depth limit) with a fixed random seed.
original_params = dict(
    n_estimators=1000,
    max_leaf_nodes=4,
    max_depth=None,
    random_state=2,
    min_samples_split=5,
)

# Fit on the training split; `fit` returns the estimator itself, so the fitted
# model is bound to `clf` in one step.
clf = ensemble.GradientBoostingClassifier(**original_params).fit(
    X_train, y_train.values.ravel()
)

# Hard class predictions for the test split.
y_pred = clf.predict(X_test)

sns.set(font_scale=1, rc={"lines.linewidth": 1.2})

fig = plt.figure(figsize=(20, 10), dpi=80)
fig.suptitle('Predictor Importance', fontsize=20)

plt.subplot(1,1,1)

f = clf.feature_importances_
%matplotlib
f = 100.0 * (f / f.max())
indices = np.argsort(f)[-10:]
plt.barh(np.arange(10), f[indices],color='dodgerblue',alpha=.4)
plt.yticks(np.arange(10 + 0.25), np.array(X.columns)[indices])
plt.xlabel('Relative importance'), \
plt.title('Top Ten Important Variables')

Using matplotlib backend: MacOSX


(<matplotlib.text.Text at 0x1183ca160>, <matplotlib.text.Text at 0x1183ee978>)

In [27]:
from sklearn import linear_model, preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier

# Sweep over the number of top-ranked predictors (8..39). For each count, fit a
# PCA -> Gaussian-process classifier with sigmoid calibration and report the
# log loss on the 2014-2016 seasons.
#
# Everything that does not depend on the feature count is computed once here
# instead of on every iteration.
y_new = results.loc[:, 'result']
importance_order = np.argsort(f)  # ascending; `f` is the importance vector from the earlier cell

# Positional row numbers of each split, grouped season by season (the same row
# order the per-year concatenation produced).
season_values = results.Season.values
train_rows = np.concatenate(
    [np.flatnonzero(season_values == year) for year in range(2002, 2014)]
)
test_rows = np.concatenate(
    [np.flatnonzero(season_values == year) for year in range(2014, 2017)]
)

for i in range(8, 40):
    # The i most important predictors.
    columns = list(X.columns[importance_order[-i:]])
    newFeatures = features.loc[:, columns]

    # Normalizer rescales every row independently, so applying it to all rows
    # at once does not carry information from the test seasons into training.
    normalizer = preprocessing.Normalizer()
    X_new = pd.DataFrame(normalizer.fit_transform(newFeatures))

    X_train, y_train = X_new.iloc[train_rows], y_new.iloc[train_rows]
    X_test, y_test = X_new.iloc[test_rows], y_new.iloc[test_rows]

    # NOTE(review): the "validation" split was built from the same seasons
    # (2002-2013) as the training split, i.e. it was a copy of it. That is kept
    # here so the reported numbers do not change; confirm whether a genuinely
    # held-out calibration set was intended.
    X_valid, y_valid = X_train, y_train

    pca = PCA(n_components=i * 2 // 3)
    clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
    pipeline = make_pipeline(pca, clf)

    # CalibratedClassifierCV (without cv="prefit") clones `pipeline` and fits
    # the clones itself during its internal cross-validation. A separate
    # `pipeline.fit(...)` beforehand is therefore discarded, and for a Gaussian
    # process it is expensive, so it is not performed. `pipeline` itself stays
    # unfitted after this loop.
    sig_clf = CalibratedClassifierCV(pipeline, method="sigmoid")
    sig_clf.fit(X_valid, y_valid.values.ravel())

    y_pred = sig_clf.predict_proba(X_test)
    logLoss = log_loss(y_true=y_test, y_pred=y_pred)

    print(logLoss, '*******', i)

0.635488171257 ******* 8


0.635577449265 ******* 9


0.636908910479 ******* 10


0.637357224927 ******* 11


0.596148783876 ******* 12


0.595863454536 ******* 13


0.557163286288 ******* 14
