In [1]:
#inputs needed
file_2019 = "NCAAF Teams 2019 - Nikki edits 20200722.csv"
file_2018 = "NCAAF Team Leaders_2018 - Nikki edits 20200722.csv"

In [2]:
#all packages used for code
import os
import pandas as pd
import sklearn as sklearn
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [3]:
#retrieve file
def get_file(input_file):
    current_directory = os.getcwd()
#    print(current_directory)
#    print(input_file)
    file_path = current_directory + "/" + input_file
#    print(file_path)
    file_df = pd.read_csv(file_path)
#    print(file_df.shape)
#    print(file_df.head)
        
    return(file_df)

In [4]:
#SVM code block
#reference code/example: https://towardsdatascience.com/support-vector-machine-python-example-d67d9b63f1c8
def SVM_model(x_train, x_test, y_train, y_test, plot_y):

    best_c = []
    
    #if we need to plot the results, use only the first 2 fields
    if plot_y: 
        x_train = x_train.iloc[:, 0:2]
        x_test = x_test.iloc[:, 0:2]
#        print(x_train.shape)
#        print(x_test.shape)
    
    cs = [0.1, 1, 10, 100, 1000]

    for c in cs:
        svc = LinearSVC(dual = False, C=c)
        svc.fit(x_train, y_train)
        y_test_pred = svc.predict(x_test)
        best_c.append(1-((y_test != y_test_pred).sum()/y_test.shape[0]))

    print("Accuracy rate for SVM:", max(best_c), "with C of", cs[(best_c.index(max(best_c)))])
    
    if plot_y: 
        #rerun to get appropriate c value
        svc = LinearSVC(dual = False, C=cs[(best_c.index(max(best_c)))])
        svc.fit(x_train, y_train)
        y_test_pred = svc.predict(x_test)
        plt.scatter(x_train.iloc[:, 0], x_train.iloc[:, 1], c=y_train, cmap='winter');
        ax = plt.gca()
        xlim = ax.get_xlim()
        w = svc.coef_[0]
        a = -w[0] / w[1]
        xx = np.linspace(xlim[0], xlim[1])
        yy = a * xx - svc.intercept_[0] / w[1]
        plt.plot(xx, yy)
        yy = a * xx - (svc.intercept_[0] - 1) / w[1]
        plt.plot(xx, yy, 'k--')
        yy = a * xx - (svc.intercept_[0] + 1) / w[1]
        plt.plot(xx, yy, 'k--')  

In [5]:
#neural network code block
#reference code/example: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
def neural_net_model(x_train, x_test, y_train, y_test, plot_y):

    best_LRI = []
        
    #if we need to plot the results, use only the first 2 fields
    if plot_y: 
        x_train = x_train.iloc[:, 0:2]
        x_test = x_test.iloc[:, 0:2]
#        print(x_train.shape)
#        print(x_test.shape)

    #tune the step size
    for LRI in range(1,1000):
#        print(LRI)
        nn_model = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000, random_state=3, learning_rate_init = LRI/1000).fit(x_train, y_train)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
        y_test_pred = nn_model.predict(x_test)
        best_LRI.append(1-((y_test != y_test_pred).sum()/y_test.shape[0]))
    
    print("Accuracy rate for neural network: ", max(best_LRI))
    print("The learning rate associated with this is: ", (best_LRI.index(max(best_LRI))+1)/1000)

    if plot_y:    
        #rerun to get appropriate neural network
        nn_model = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000, random_state=3, learning_rate_init = (best_LRI.index(max(best_LRI))+1)/1000).fit(x_train, y_train)
        y_test_pred = nn_model.predict(x_test)
        
        h = .02  
        cmap_light = ListedColormap(['orange', 'cyan'])
        cmap_bold = ListedColormap(['darkorange', 'c'])

        x_min, x_max = x_train.iloc[:, 0].min() - 1, x_train.iloc[:, 0].max() + 1
        y_min, y_max = x_train.iloc[:, 1].min() - 1, x_train.iloc[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
        
        Z = nn_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])

        Z = Z[:, 1].reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    
        plt.scatter(x_train.iloc[:, 0], x_train.iloc[:, 1], c=y_train, cmap=cmap_bold,
                edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xlabel("Attribute 1")
        plt.ylabel("Attribute 2")
        plt.title("Neural Network Classification")

        plt.show()    

In [6]:
#Naive Bayes modeling - https://scikit-learn.org/stable/modules/naive_bayes.html
def NB_model(x_train, x_test, y_train, y_test, plot_y):
    
    #if we need to plot the results, use only the first 2 fields
    if plot_y: 
        x_train = x_train.iloc[:, 0:2]
        x_test = x_test.iloc[:, 0:2]
#        print(x_train.shape)
#        print(x_test.shape)
    
    naive_bayes = GaussianNB()
    y_test_pred = naive_bayes.fit(x_train, y_train).predict(x_test)
    print("Accuracy rate for naive Bayes modeling: ", 1-((y_test != y_test_pred).sum()/y_test.shape[0]))
    
    if plot_y:    
        h = .02  
        cmap_light = ListedColormap(['orange', 'cyan'])
        cmap_bold = ListedColormap(['darkorange', 'c'])

        x_min, x_max = x_train.iloc[:, 0].min() - 1, x_train.iloc[:, 0].max() + 1
        y_min, y_max = x_train.iloc[:, 1].min() - 1, x_train.iloc[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
        
        Z = naive_bayes.predict_proba(np.c_[xx.ravel(), yy.ravel()])

        Z = Z[:, 1].reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    
        plt.scatter(x_train.iloc[:, 0], x_train.iloc[:, 1], c=y_train, cmap=cmap_bold,
                edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xlabel("Attribute 1")
        plt.ylabel("Attribute 2")
        plt.title("Naive Bayes Classification")

        plt.show()    

In [7]:
#logistic regression modeling - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
def log_reg_model(x_train, x_test, y_train, y_test, plot_y): 

    #if we need to plot the results, use only the first 2 fields
    if plot_y: 
        x_train = x_train.iloc[:, 0:2]
        x_test = x_test.iloc[:, 0:2]
#        print(x_train.shape)
#        print(x_test.shape)

    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)
    y_test_pred = log_reg.predict(x_test)
    print("Accuracy rate for logistic regression modeling: ", 1-((y_test != y_test_pred).sum()/y_test.shape[0]))
    
    if plot_y:    
        h = .02  
        cmap_light = ListedColormap(['orange', 'cyan'])
        cmap_bold = ListedColormap(['darkorange', 'c'])

        x_min, x_max = x_train.iloc[:, 0].min() - 1, x_train.iloc[:, 0].max() + 1
        y_min, y_max = x_train.iloc[:, 1].min() - 1, x_train.iloc[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
        
        Z = log_reg.predict_proba(np.c_[xx.ravel(), yy.ravel()])

        Z = Z[:, 1].reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    
        plt.scatter(x_train.iloc[:, 0], x_train.iloc[:, 1], c=y_train, cmap=cmap_bold,
                edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xlabel("Attribute 1")
        plt.ylabel("Attribute 2")
        plt.title("Logistic Regression Classification")

        plt.show()    

In [8]:
#KNN model - https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# plot example - https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
def KNN_model(x_train, x_test, y_train, y_test, plot_y): 

    best_k = []
    
    #if we need to plot the results, use only the first 2 fields
    if plot_y: 
        x_train = x_train.iloc[:, 0:2]
        x_test = x_test.iloc[:, 0:2]
#        print(x_train.shape)
#        print(x_test.shape)
    
    #find best K on the training dataset; limit to max of k=100 for reasonable run time
    for k in range(1,min(x_train.shape[0], 100)):
#        print(k)
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(x_train,y_train)
        y_test_pred = knn.predict(x_test)
        best_k.append(1-((y_test != y_test_pred).sum()/y_test.shape[0]))
#        print(best_k)

    print("Accuracy rate for the KNN model: ", max(best_k))
    print("The K associated with this is: ", best_k.index(max(best_k))+1)
      
    if plot_y:    
        h = .02  
        cmap_light = ListedColormap(['orange', 'cyan'])
        cmap_bold = ListedColormap(['darkorange', 'c'])

        x_min, x_max = x_train.iloc[:, 0].min() - 1, x_train.iloc[:, 0].max() + 1
        y_min, y_max = x_train.iloc[:, 1].min() - 1, x_train.iloc[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
        
        #rerun KNN because it will be on the k = x_train.shape[0] model from above
        knn = KNeighborsClassifier(n_neighbors = (best_k.index(max(best_k))+1))
        knn.fit(x_train,y_train)
        Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    
        plt.scatter(x_train.iloc[:, 0], x_train.iloc[:, 1], c=y_train, cmap=cmap_bold,
                edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xlabel("Attribute 1")
        plt.ylabel("Attribute 2")
        plt.title("KNN Classification (k = %i)" % (best_k.index(max(best_k))+1))

        plt.show()

In [20]:
#MAIN CODE BLOCK 

#import file and convert to Pandas dataframe
train_file = get_file(file_2019)
train_file = train_file.drop(['team'], axis=1)
#print(train_file.shape)
#print(train_file.head)
test_file = get_file(file_2018)
test_file = test_file.drop(['team'], axis=1)
#test_file
#print(test_file.shape)
#print(test_file.head)

In [21]:
#run SVM
SVM_model(train_file.iloc[:, :-1], test_file.iloc[:, :-1], train_file.iloc[:,-1], test_file.iloc[:,-1], 0)

Accuracy rate for SVM: 0.7615384615384615 with C of 0.1


In [22]:
#run neural network
neural_net_model(train_file.iloc[:, :-1], test_file.iloc[:, :-1], train_file.iloc[:,-1], test_file.iloc[:,-1], 0)

Accuracy rate for neural network:  0.8076923076923077
The learning rate associated with this is:  0.001


In [23]:
#run Naive Bayes model 
NB_model(train_file.iloc[:, :-1], test_file.iloc[:, :-1], train_file.iloc[:,-1], test_file.iloc[:,-1], 0)

Accuracy rate for naive Bayes modeling:  0.7538461538461538


In [24]:
#run logistic regression model 
log_reg_model(train_file.iloc[:, :-1], test_file.iloc[:, :-1], train_file.iloc[:,-1], test_file.iloc[:,-1], 0)

Accuracy rate for logistic regression modeling:  0.8307692307692307


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [25]:
#run k-nearest neighbors model 
KNN_model(train_file.iloc[:, :-1], test_file.iloc[:, :-1], train_file.iloc[:,-1], test_file.iloc[:,-1], 0)

Accuracy rate for the KNN model:  0.8153846153846154
The K associated with this is:  4


In [None]:
#using 2019 as training and 2018 as test
#Accuracy rate for SVM: 0.7769230769230769 with C of 0.1
#Accuracy rate for neural network:  0.8076923076923077
#The learning rate associated with this is:  0.001
#Accuracy rate for naive Bayes modeling:  0.6692307692307693
#Accuracy rate for logistic regression modeling:  0.7923076923076923
#Accuracy rate for the KNN model:  0.8153846153846154
#The K associated with this is:  3

In [26]:
#run all again, reversing the test/train samples
SVM_model(test_file.iloc[:, :-1], train_file.iloc[:, :-1], test_file.iloc[:,-1], train_file.iloc[:,-1], 0)
neural_net_model(test_file.iloc[:, :-1], train_file.iloc[:, :-1], test_file.iloc[:,-1], train_file.iloc[:,-1], 0)
NB_model(test_file.iloc[:, :-1], train_file.iloc[:, :-1], test_file.iloc[:,-1], train_file.iloc[:,-1], 0)
log_reg_model(test_file.iloc[:, :-1], train_file.iloc[:, :-1], test_file.iloc[:,-1], train_file.iloc[:,-1], 0)
KNN_model(test_file.iloc[:, :-1], train_file.iloc[:, :-1], test_file.iloc[:,-1], train_file.iloc[:,-1], 0)

Accuracy rate for SVM: 0.6793893129770993 with C of 10
Accuracy rate for neural network:  0.8091603053435115
The learning rate associated with this is:  0.001
Accuracy rate for naive Bayes modeling:  0.7786259541984732
Accuracy rate for logistic regression modeling:  0.7022900763358779


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy rate for the KNN model:  0.8244274809160306
The K associated with this is:  6


In [None]:
#using 2018 as training and 2019 as test
#Accuracy rate for SVM: 0.8091603053435115 with C of 0.1
#Accuracy rate for neural network:  0.816793893129771
#The learning rate associated with this is:  0.687
#Accuracy rate for naive Bayes modeling:  0.7175572519083969
#Accuracy rate for logistic regression modeling:  0.83206106870229
#Accuracy rate for the KNN model:  0.8091603053435115
#The K associated with this is:  10