In [None]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
import random

import itertools

from matplotlib import pyplot as plt
import seaborn as sns

import io
from scipy import misc

import time

%matplotlib inline

In [None]:
def importData(fileName):
    """Load a CSV file, label-encode its object-dtype columns and shuffle the rows.

    Parameters
    ----------
    fileName : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table in random row order. Every column whose dtype is
        ``object`` has been replaced by the integer codes produced by a fresh
        ``sklearn.preprocessing.LabelEncoder``; other columns are untouched.
    """
    finalData = pd.read_csv(fileName)

    # Compare against ``object`` itself. The earlier ``type(object)`` is the
    # metaclass ``type``; it appears to have matched only because numpy coerces
    # unknown Python classes to the object dtype when comparing dtypes.
    objectColumns = [column for column in finalData.columns
                     if finalData[column].dtype == object]

    if objectColumns:
        # Only needed when there is something to encode.
        from sklearn import preprocessing

        for column in objectColumns:
            le = preprocessing.LabelEncoder()
            finalData[column] = le.fit_transform(finalData[column])

    # frac = 1 draws every row exactly once, i.e. returns the rows shuffled.
    return finalData.sample(frac = 1)

In [None]:
def setupData(finalData, fraction):
    """Hold out 20% of the rows for testing and subsample the remainder.

    Parameters
    ----------
    finalData : pandas.DataFrame
        Table containing the feature columns listed below and a 'Medal' column.
    fraction : float
        Share of the 80% training portion to keep (passed to ``DataFrame.sample``).

    Returns
    -------
    tuple
        ``(X_data, y_data, X_test, y_test)``: features and 'Medal' labels of the
        subsampled training portion, followed by those of the held-out portion.
    """
    featureColumns = ["Sex", "Age", "Height", "Weight", "Team", "Sport", "Event"]
    labelColumn = 'Medal'

    # Split first, subsample second: the held-out part always stays at 20%.
    trainPool, heldOut = train_test_split(finalData, test_size = 0.2)
    trainSubset = trainPool.sample(frac = fraction)

    return (
        trainSubset[featureColumns],
        trainSubset[labelColumn],
        heldOut[featureColumns],
        heldOut[labelColumn],
    )

In [None]:
def makePlots(data, fileName):
    """Save the per-run measurements to a CSV file and draw three scatter plots.

    Parameters
    ----------
    data : sequence of 5-item sequences
        One entry per run, read positionally as
        (number of examples, test score, train time, test time, train accuracy).
    fileName : str
        Path the measurements table is written to with ``DataFrame.to_csv``.

    Returns
    -------
    None
        The function prints ``data``, writes the CSV and shows the learning
        curve, the train timing curve and the test timing curve. The train
        accuracy is stored in the CSV only; it is not plotted.
    """
    print(data)

    columnNames = ['numbers', 'scores', 'train times', 'testTimes', 'train accuracy']
    # Pick each position explicitly so an empty ``data`` still yields five
    # (empty) columns.
    columns = {name: [row[position] for row in data]
               for position, name in enumerate(columnNames)}

    # ``DataFrame.from_items`` was deprecated in pandas 0.23 and removed in 1.0.
    # Building from a dict with an explicit ``columns`` list keeps the same
    # column order on every pandas version.
    df = pd.DataFrame(columns, columns = columnNames)
    df.to_csv(fileName)

    numbers = columns['numbers']
    # (y values, marker, title, y-axis label) for each figure, in display order.
    plotSpecs = [
        (columns['scores'], 'bo', "Learning Curve", "% Accuracy"),
        (columns['train times'], 'ro', "Train Timing Curve", "Time (Seconds)"),
        (columns['testTimes'], 'ro', "Test Timing Curve", "Time (Seconds)"),
    ]
    for values, marker, title, yLabel in plotSpecs:
        plt.plot(numbers, values, marker)
        plt.title(title)
        plt.xlabel("Number of Examples")
        plt.ylabel(yLabel)
        plt.show()
    

In [None]:
def storeHyperParameters(accuracies, fileName):
    """Write a hyper-parameter -> accuracy mapping to a CSV file.

    Parameters
    ----------
    accuracies : dict
        Maps each hyper parameter (or tuple of hyper parameters) to its accuracy.
    fileName : str
        Destination path for the CSV.

    The table has one row per dictionary entry, in the dictionary's own order,
    under the columns 'Hyper Parameter' and 'Accuracy'.
    """
    rows = [(hyperParameter, accuracy) for hyperParameter, accuracy in accuracies.items()]
    table = pd.DataFrame(rows, columns = ['Hyper Parameter', 'Accuracy'])
    table.to_csv(fileName)
    

# Decision Trees

In [None]:
def trainandTestDecisionTree(data, bestSplit):
    """Tune (when needed), train and evaluate a decision tree classifier.

    Parameters
    ----------
    data : tuple
        ``(X_data, y_data, X_test, y_test)``. The training parts are indexed
        with ``.iloc`` during cross-validation.
    bestSplit : int or None
        ``min_samples_split`` for the final tree. When ``None`` the value is
        picked by 5-fold cross-validation over the even numbers 2..198, and the
        per-candidate accuracies are written to
        'decisionTreeHyperParameters.csv'.

    Returns
    -------
    tuple
        ``(score, trainTime, testTime, bestSplit, trainAcc)``: test accuracy in
        percent, fit and predict durations in seconds, the ``min_samples_split``
        that was used, and the training accuracy in percent.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score

    X_data, y_data, X_test, y_test = data

    if bestSplit is None:
        foldCount = 5
        folds = KFold(n_splits = foldCount)
        accuracies = {}

        for candidate in range(2, 200, 2):
            accuracySum = 0
            for train_index, validate_index in folds.split(X_data):
                learner = DecisionTreeClassifier(min_samples_split = candidate)
                learner.fit(X_data.iloc[train_index], y_data.iloc[train_index])
                foldPrediction = learner.predict(X_data.iloc[validate_index])
                accuracySum += accuracy_score(y_data.iloc[validate_index], foldPrediction) *100
            meanAccuracy = accuracySum/foldCount
            print("for the hyperParameter %d, the accuracy is %f" % (candidate, meanAccuracy))
            accuracies[candidate] = meanAccuracy

        print("finished \n")
        # max() over the dict returns the first key with the highest accuracy.
        bestSplit = max(accuracies, key=accuracies.get)
        storeHyperParameters(accuracies, 'decisionTreeHyperParameters.csv')

        print("best hyper parameter: %d" % (max(accuracies, key=accuracies.get)))

    print("Training Final Decision Tree Learner")
    finalTree = DecisionTreeClassifier(min_samples_split = bestSplit)

    # Time the fit on the full training portion.
    fitStarted = time.time()
    finalTree.fit(X_data, y_data)
    trainTime = time.time() - fitStarted

    # Time prediction on the held-out portion.
    predictStarted = time.time()
    testPrediction = finalTree.predict(X_test)
    testTime = time.time() - predictStarted

    score = accuracy_score(y_test, testPrediction) *100
    print("The Score is %f" % (score))

    trainAcc = accuracy_score(y_data, finalTree.predict(X_data)) *100

    return (score, trainTime, testTime, bestSplit, trainAcc)

In [None]:
# Decision tree learning curve: retrain on progressively smaller samples of
# the training split and record accuracy and timings for each sample size.
finalData = importData('finalDataOlympics.csv')

# Fractions 39/40 down to 1/40. The largest sample runs first; that run is
# handed None and therefore selects the hyper parameter reused afterwards.
fractions = [step*(1.0/40.0) for step in reversed(range(1, 40))]

outputData = []
bestHyperParameter = None

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupData(finalData, i)
    X_data = data[0]
    testValues = trainandTestDecisionTree(data, bestHyperParameter)
    testScore, fitSeconds, predictSeconds, bestHyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'decisionTreeOutput.csv')
    

# K Nearest Neighbors

In [None]:
def trainandTestKNN(data, besthyperParameter):
    """Tune (when needed), train and evaluate a k-nearest-neighbours classifier.

    Parameters
    ----------
    data : tuple
        ``(X_data, y_data, X_test, y_test)``. The training parts are indexed
        with ``.iloc`` during cross-validation.
    besthyperParameter : tuple or None
        ``(n_neighbors, p)`` for the final model. When ``None`` every
        combination of ``n_neighbors`` in 1..24 and ``p`` in 1..4 is scored with
        5-fold cross-validation; the accuracies gathered so far are rewritten to
        'KNNHyperParameters.csv' after each combination.

    Returns
    -------
    tuple
        ``(score, trainTime, testTime, besthyperParameter, trainAcc)``: test
        accuracy in percent, fit and predict durations in seconds, the
        ``(n_neighbors, p)`` pair that was used, and the training accuracy in
        percent.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score

    X_data, y_data, X_test, y_test = data

    if besthyperParameter is None:
        foldCount = 5
        folds = KFold(n_splits = foldCount)
        accuracies = {}

        for neighbourCount, minkowskiPower in itertools.product(range(1, 25), range(1, 5)):
            accuracySum = 0
            for train_index, validate_index in folds.split(X_data):
                learner = KNeighborsClassifier(n_neighbors = neighbourCount, p = minkowskiPower)
                learner.fit(X_data.iloc[train_index], y_data.iloc[train_index])
                foldPrediction = learner.predict(X_data.iloc[validate_index])
                accuracySum += accuracy_score(y_data.iloc[validate_index], foldPrediction) *100
            meanAccuracy = accuracySum/foldCount
            print("for the hyperParameter %d, %d, the accuracy is %f" % (neighbourCount, minkowskiPower, meanAccuracy))
            accuracies[(neighbourCount, minkowskiPower)] = meanAccuracy
            # Rewritten after every combination, so partial results are on disk.
            storeHyperParameters(accuracies, 'KNNHyperParameters.csv')

        print("finished \n")
        besthyperParameter = max(accuracies, key=accuracies.get)
        print("best hyper parameter: %d, %d" % (besthyperParameter[0], besthyperParameter[1]))

    print("Training Final KNN Learner")
    finalNeighbours, finalPower = besthyperParameter[0], besthyperParameter[1]
    finalModel = KNeighborsClassifier(n_neighbors = finalNeighbours, p = finalPower)

    fitStarted = time.time()
    finalModel.fit(X_data, y_data)
    trainTime = time.time() - fitStarted

    predictStarted = time.time()
    testPrediction = finalModel.predict(X_test)
    testTime = time.time() - predictStarted

    score = accuracy_score(y_test, testPrediction) *100
    print("The Score is %f" % (score))

    trainAcc = accuracy_score(y_data, finalModel.predict(X_data)) *100

    return (score, trainTime, testTime, besthyperParameter, trainAcc)

In [None]:
# KNN learning curve: retrain on progressively smaller samples of the
# training split and record accuracy and timings for each sample size.
finalData = importData('finalDataOlympics.csv')

# Fractions 19/20 down to 1/20; the first (largest) run receives None and so
# performs the hyper-parameter search whose result the later runs reuse.
fractions = [step*(1.0/20) for step in reversed(range(1, 20))]

besthyperParameter = None
outputData = []

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupData(finalData, i)
    X_data = data[0]
    testValues = trainandTestKNN(data, besthyperParameter)
    testScore, fitSeconds, predictSeconds, besthyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'KNNOutput.csv')

# Boosting

In [None]:
def trainandTestBoosting(data, besthyperParameter):
    """Tune (when needed), train and evaluate an AdaBoost classifier.

    Parameters
    ----------
    data : tuple
        ``(X_data, y_data, X_test, y_test)``. The training parts are indexed
        with ``.iloc`` during cross-validation.
    besthyperParameter : tuple or None
        ``(n_estimators, learning_rate)`` for the final model. When ``None``, 50
        random combinations of ``n_estimators`` in 2, 4, ..., 98 and
        ``learning_rate`` in 0.05, 0.10, ..., 4.95 are scored with 5-fold
        cross-validation; the accuracies gathered so far are rewritten to
        'BoostingHyperParameters.csv' after each combination.

    Returns
    -------
    tuple
        ``(score, trainTime, testTime, besthyperParameter, trainAcc)``: test
        accuracy in percent, fit and predict durations in seconds, the
        ``(n_estimators, learning_rate)`` pair that was used, and the training
        accuracy in percent.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score

    X_data, y_data, X_test, y_test = data

    if besthyperParameter is None:
        n = 5
        kf = KFold(n_splits = n)

        # The candidate grid is only needed when a search actually runs.
        hyperParameterEstimators = [x*2 for x in range(1, 50)]
        hyperParameterLearningRate = [i * .05 for i in range(1, 100)]
        hyperParameters = list(itertools.product(hyperParameterEstimators, hyperParameterLearningRate))
        hyperParameters = random.sample(hyperParameters, 50)
        accuracies = {}

        for hyperParameter in hyperParameters:
            averageAccuracy = 0
            for train_index, validate_index in kf.split(X_data):
                c = AdaBoostClassifier(n_estimators = hyperParameter[0], learning_rate = hyperParameter[1])
                X_train, X_validate = X_data.iloc[train_index], X_data.iloc[validate_index]
                y_train, y_validate = y_data.iloc[train_index], y_data.iloc[validate_index]
                c.fit(X_train, y_train)
                y_pred = c.predict(X_validate)
                averageAccuracy += accuracy_score(y_validate, y_pred) *100
            averageAccuracy = averageAccuracy/n
            # The learning rate is a float: "%d" would truncate 0.05 to 0.
            print("for the hyperParameter %d, %f, the accuracy is %f" % (hyperParameter[0], hyperParameter[1], averageAccuracy))
            accuracies[hyperParameter] = averageAccuracy
            # Rewritten after every combination, so partial results are on disk.
            storeHyperParameters(accuracies, 'BoostingHyperParameters.csv')

        print("finished \n")
        besthyperParameter = max(accuracies, key=accuracies.get)
        print("best hyper parameter: %d, %f" % (besthyperParameter[0], besthyperParameter[1]))

    print("Training Final Boosting Learner")
    c = AdaBoostClassifier(n_estimators = besthyperParameter[0], learning_rate = besthyperParameter[1])

    start = time.time()
    c.fit(X_data, y_data)
    end = time.time()
    trainTime = end - start

    start = time.time()
    y_pred = c.predict(X_test)
    end = time.time()
    testTime = end - start

    score = accuracy_score(y_test, y_pred) *100
    print("The Score is %f" % (score))

    y_train_pred = c.predict(X_data)
    trainAcc = accuracy_score(y_data, y_train_pred) *100

    return (score, trainTime, testTime, besthyperParameter, trainAcc)

In [None]:
# Boosting learning curve: retrain on progressively smaller samples of the
# training split and record accuracy and timings for each sample size.
finalData = importData('finalDataOlympics.csv')

# Fractions 19/20 down to 1/20; the first (largest) run receives None and so
# performs the hyper-parameter search whose result the later runs reuse.
fractions = [step*(1.0/20) for step in reversed(range(1, 20))]

besthyperParameter = None
outputData = []

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupData(finalData, i)
    X_data = data[0]
    testValues = trainandTestBoosting(data, besthyperParameter)
    testScore, fitSeconds, predictSeconds, besthyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'boostingOutput.csv')

# SVM

In [None]:
def setupDataSVM(finalData, fraction):
    """Produce train / validation / test feature and label sets for the SVM runs.

    Parameters
    ----------
    finalData : pandas.DataFrame
        Table containing the feature columns listed below and a 'Medal' column.
    fraction : float
        Share of the non-test rows to keep before the train/validation split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
    """
    featureColumns = ["Sex", "Age", "Height", "Weight", "Team", "Sport", "Event"]

    def featuresAndLabels(frame):
        # Separate the model inputs from the 'Medal' target.
        return frame[featureColumns], frame['Medal']

    # 20% is held out for testing; the rest is subsampled, then split 80/20
    # again into training and validation rows.
    trainPool, heldOut = train_test_split(finalData, test_size = 0.2)
    trainPool = trainPool.sample(frac = fraction)
    trainRows, validateRows = train_test_split(trainPool, test_size = 0.2)

    X_train, y_train = featuresAndLabels(trainRows)
    X_validate, y_validate = featuresAndLabels(validateRows)
    X_test, y_test = featuresAndLabels(heldOut)

    return (X_train, y_train, X_validate, y_validate, X_test, y_test)

In [None]:
def trainandTestSVM(data, kernel, besthyperParameter, csvName):
    """Tune (when needed), train and evaluate a support vector classifier.

    Parameters
    ----------
    data : tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
    kernel : str
        Kernel to search over when ``besthyperParameter`` is ``None``:
        'linear' varies only ``tol``; 'poly' varies ``degree``, ``gamma`` and
        ``tol``; any other value (e.g. 'rbf') is passed to ``SVC`` as the kernel
        and varies ``gamma`` and ``tol``.
    besthyperParameter : tuple or None
        ``(kernel, degree, gamma, tol)`` for the final model. When ``None``, up
        to 5 random candidates are scored on the validation set and the best
        one is used.
    csvName : str
        File the validation accuracies are written to after a search.

    Returns
    -------
    tuple
        ``(score, trainTime, testTime, besthyperParameter, trainAcc)``: test
        accuracy in percent, fit and predict durations in seconds, the
        hyper-parameter tuple that was used, and the training accuracy in
        percent.
    """
    from sklearn.metrics import accuracy_score

    X_train, y_train, X_validate, y_validate, X_test, y_test = data

    if besthyperParameter is None:
        hyperParameterDegree = range(1, 11)
        hyperParameterGamma = [i * 0.1 for i in range(1, 10)]
        hyperParameterTol = [i * 0.0004 for i in range(1, 10)]

        # Every candidate is a (kernel, degree, gamma, tol) 4-tuple, because that
        # is how it is unpacked into SVC below. Kernels are compared with ``==``:
        # the earlier identity test against 'poly' never matched the 'rbf'
        # request, so that request silently searched the linear grid instead.
        if kernel == 'linear':
            grid = itertools.product(['linear'], [3], ['auto'], hyperParameterTol)
        elif kernel == 'poly':
            grid = itertools.product(['poly'], hyperParameterDegree, hyperParameterGamma, hyperParameterTol)
        else:
            # degree is ignored by non-polynomial kernels; 3 is SVC's default.
            grid = itertools.product([kernel], [3], hyperParameterGamma, hyperParameterTol)

        hyperParameters = list(grid)
        hyperParameters = random.sample(hyperParameters, min(5, len(hyperParameters)))

        accuracies = {}
        for hyperParameter in hyperParameters:
            c = svm.SVC(kernel = hyperParameter[0], degree = hyperParameter[1], gamma = hyperParameter[2], tol = hyperParameter[3])
            print('started Training')
            c.fit(X_train, y_train)
            print('stopped training')
            y_pred = c.predict(X_validate)
            accuracy = accuracy_score(y_validate, y_pred) *100
            # gamma may be the string 'auto', hence %s rather than %f.
            print("for the hyperParameter %s, %d, %s, %f the accuracy is %f" % (hyperParameter[0], hyperParameter[1], hyperParameter[2], hyperParameter[3], accuracy))
            accuracies[hyperParameter] = accuracy

        print("finished \n")
        besthyperParameter = max(accuracies, key=accuracies.get)
        # Report the winner, not whichever candidate happened to be tried last.
        print("best hyper parameter: %s, %d, %s, %f" % (besthyperParameter[0], besthyperParameter[1], besthyperParameter[2], besthyperParameter[3]))
        storeHyperParameters(accuracies, csvName)

    print("Training Final SVM Learner")
    c = svm.SVC(kernel = besthyperParameter[0], degree = besthyperParameter[1], gamma = besthyperParameter[2], tol = besthyperParameter[3])

    start = time.time()
    c.fit(X_train, y_train)
    end = time.time()
    trainTime = end - start

    start = time.time()
    y_pred = c.predict(X_test)
    end = time.time()
    testTime = end - start

    score = accuracy_score(y_test, y_pred) *100
    print("The Score is %f" % (score))

    y_train_pred = c.predict(X_train)
    trainAcc = accuracy_score(y_train, y_train_pred) *100

    return (score, trainTime, testTime, besthyperParameter, trainAcc)

In [None]:
# SVM learning curve, calling trainandTestSVM with the 'linear' kernel
# argument on progressively smaller samples of the training split.
finalData = importData('finalDataOlympics.csv')

# Fractions 8/8 down to 1/8; the first (largest) run receives None and so
# performs the hyper-parameter search whose result the later runs reuse.
fractions = [step*(1.0/8) for step in reversed(range(1, 9))]

besthyperParameter = None
outputData = []

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupDataSVM(finalData, i)
    X_data = data[0]
    testValues = trainandTestSVM(data, 'linear', besthyperParameter, 'SVMLinearHP.csv')
    testScore, fitSeconds, predictSeconds, besthyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'SVMLinearOutput.csv')

In [None]:
# SVM learning curve, calling trainandTestSVM with the 'rbf' kernel argument
# on progressively smaller samples of the training split.
finalData = importData('finalDataOlympics.csv')

# Fractions 9/10 down to 1/10; the first (largest) run receives None and so
# performs the hyper-parameter search whose result the later runs reuse.
fractions = [step*(1.0/10) for step in reversed(range(1, 10))]

besthyperParameter = None
outputData = []

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupDataSVM(finalData, i)
    X_data = data[0]
    testValues = trainandTestSVM(data, 'rbf', besthyperParameter, 'SVMRBFHP.csv')
    testScore, fitSeconds, predictSeconds, besthyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'SVMRBFOutput.csv')

# Neural Network

In [None]:
def setupDataNN(finalData, fraction):
    """Produce train / validation / test feature and label sets for the network runs.

    Parameters
    ----------
    finalData : pandas.DataFrame
        Table containing the feature columns listed below and a 'Medal' column.
    fraction : float
        Share of the non-test rows to keep before the train/validation split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
    """
    # Hold out 20% for testing, subsample what is left, then split that 80/20
    # into training and validation rows.
    remaining, test = train_test_split(finalData, test_size = 0.2)
    remaining = remaining.sample(frac = fraction)
    train, validate = train_test_split(remaining, test_size = 0.2)

    features = ["Sex", "Age", "Height", "Weight", "Team", "Sport", "Event"]

    # Emit (features, labels) for each partition in train, validate, test order.
    parts = []
    for partition in (train, validate, test):
        parts.append(partition[features])
        parts.append(partition['Medal'])
    return tuple(parts)

In [None]:
def setupModel():
    """Build the (uncompiled) feed-forward network used by the experiments.

    The network takes 7 inputs and stacks two 55-unit sigmoid layers, each
    followed by 50% dropout, before a 2-unit sigmoid output layer. The layer
    summary is printed before the model is returned.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import Dropout

    hiddenUnits = 55
    dropoutRate = 0.5

    layers = [
        Dense(hiddenUnits, input_dim = 7, activation = 'sigmoid'),
        Dropout(dropoutRate),
        Dense(hiddenUnits, activation = 'sigmoid'),
        Dropout(dropoutRate),
        Dense(2, activation = 'sigmoid'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.summary()
    return network

In [None]:
def writeDictToCSV(dictionary, fileName):
    """Write a dictionary to a two-column CSV file.

    Parameters
    ----------
    dictionary : dict
        Each key becomes a value in the 'epochs' column and each value goes
        in the 'Accuracy' column, in the dictionary's own order.
    fileName : str
        Destination path for the CSV.
    """
    rows = [(key, value) for key, value in dictionary.items()]
    table = pd.DataFrame(rows, columns = ['epochs', 'Accuracy'])
    table.to_csv(fileName)

In [None]:
def trainandTestNeuralNetwork(data, bestHyperParameter):
    """Tune (when needed), train and evaluate the neural network.

    Parameters
    ----------
    data : tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``. The
        label parts are one-hot encoded into two classes before use.
    bestHyperParameter : tuple or None
        ``(learning rate, momentum)`` for the SGD optimizer of the final model.
        When ``None`` all 12 combinations of the candidate learning rates and
        momenta are trained for 1500 epochs and scored on the validation set;
        the accuracies are written to 'NeuralNetHP.csv'. In that case the final
        model is also trained in 15-epoch steps, and the test/train accuracy
        after each step is written to 'nnlearningIterations.csv'.

    Returns
    -------
    tuple
        ``(score, trainTime, testTime, bestHyperParameter, trainAcc)``: test
        accuracy as reported by ``model.evaluate``, fit and evaluate durations
        in seconds, the ``(learning rate, momentum)`` pair that was used, and
        the training accuracy.
    """
    import keras
    from keras import optimizers

    X_train, y_train, X_validate, y_validate, X_test, y_test = data

    #one-hot encoding
    y_train = keras.utils.to_categorical(y_train, num_classes = 2)
    y_validate = keras.utils.to_categorical(y_validate, num_classes = 2)
    y_test = keras.utils.to_categorical(y_test, num_classes = 2)

    def compiledModel(learningRate, momentum):
        # Fresh network compiled with SGD. evaluate() then returns
        # [loss, mean_squared_error, accuracy] in that order.
        # NOTE(review): newer Keras releases are believed to have renamed `lr`
        # to `learning_rate` and dropped `decay` - confirm against the
        # installed version.
        model = setupModel()
        sgd = optimizers.SGD(lr = learningRate, decay = 1e-6, momentum = momentum, nesterov=True)
        model.compile(loss = 'mean_squared_error', optimizer = sgd, metrics=["mean_squared_error", 'accuracy'])
        return model

    # Remember whether this call performs the search: only then is the
    # per-step learning curve recorded further down.
    firstTime = bestHyperParameter is None

    if firstTime:
        hyperParametersLR = [.0001, .003, .05, .8]
        hyperParametersM = [.2, .5, .8]
        accuracies = {}

        for hyperParameter in itertools.product(hyperParametersLR, hyperParametersM):
            model = compiledModel(hyperParameter[0], hyperParameter[1])
            model.fit(X_train, y_train, epochs = 1500, batch_size = 128)

            loss, mse, acc = model.evaluate(X_validate, y_validate, batch_size = 128)

            # Use % so the values are interpolated instead of being printed as
            # extra arguments after the raw format string.
            print("for the hyperParameter %f, %f, the loss is %f and the accuracy is %f" % (hyperParameter[0], hyperParameter[1], loss, acc))
            accuracies[hyperParameter] = acc

        print("finished \n")
        bestHyperParameter = max(accuracies, key=accuracies.get)
        print("best hyper parameter: %f, %f" % (bestHyperParameter[0], bestHyperParameter[1]))
        storeHyperParameters(accuracies, 'NeuralNetHP.csv')

    model = compiledModel(bestHyperParameter[0], bestHyperParameter[1])

    if firstTime:
        # Train in small steps so accuracy can be recorded along the way.
        epochsPerStep = 15
        epochsTrained = 0
        trainTime = 0.0
        learningCurveIterations = {}
        for _ in range(1, 100):
            # Time only the fit; the evaluations below are bookkeeping and
            # must not inflate the reported training time.
            start = time.time()
            model.fit(X_train, y_train, epochs = epochsPerStep, batch_size = 32)
            trainTime += time.time() - start

            # Advance by the number of epochs actually trained in this step.
            epochsTrained += epochsPerStep
            score1, score2, acc = model.evaluate(X_test, y_test, batch_size = 32)
            scoreTrain1, scoreTrain2, accTrain = model.evaluate(X_train, y_train, batch_size = 32)
            learningCurveIterations[epochsTrained] = (acc, accTrain)
        writeDictToCSV(learningCurveIterations, 'nnlearningIterations.csv')
    else:
        start = time.time()
        model.fit(X_train, y_train, epochs = 1500, batch_size = 32)
        trainTime = time.time() - start

    start = time.time()
    performance1, performance2, score = model.evaluate(X_test, y_test, batch_size = 32)
    end = time.time()
    testTime = end - start

    performance1, performance2, trainAcc = model.evaluate(X_train, y_train, batch_size = 32)

    print("The Score is %f" % (score))

    return (score, trainTime, testTime, bestHyperParameter, trainAcc)

In [None]:
# Neural network learning curve: retrain on progressively smaller samples of
# the training split and record accuracy and timings for each sample size.
finalData = importData('finalDataOlympics.csv')

# Fractions 8/10 down to 1/10; the first (largest) run receives None and so
# performs the hyper-parameter search whose result the later runs reuse.
fractions = [step*(1.0/10) for step in reversed(range(1, 9))]

bestHyperParameter = None
outputData = []

for i in fractions:
    print("current iteration fraction: %f" % (i))
    data = setupDataNN(finalData, i)
    X_data = data[0]
    testValues = trainandTestNeuralNetwork(data, bestHyperParameter)
    testScore, fitSeconds, predictSeconds, bestHyperParameter, trainScore = testValues
    outputData.append((X_data.shape[0], testScore, fitSeconds, predictSeconds, trainScore))

makePlots(outputData, 'NeuralNet.csv')