In [83]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import random as rand
import sys
import collections
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

In [47]:
def readInCSV(train_path="../data/PCA_split_data_train.csv",
              test_path="../data/PCA_split_data_test.csv"):
    """Load the train/test CSV files and split each into predictors and targets.

    For both files the columns are selected purely by position:

    * column 0 is skipped (presumably an index/id column -- TODO confirm),
    * columns 1..12 (12 columns) become the prediction targets ("labels"),
    * columns 13 onwards become the predictors (features).

    Note carried over from the original code: the actual training set was
    split 70-30 because the real testing set has no outcomes.

    Parameters
    ----------
    train_path : str, optional
        Path of the training CSV. Defaults to the location the notebook
        has always read from.
    test_path : str, optional
        Path of the testing CSV. Defaults to the location the notebook
        has always read from.

    Returns
    -------
    tuple of four lists of lists
        ``(X_tr, y_tr, X_test, y_test)`` with one inner list per CSV row.
    """
    def _split(frame):
        # One-line purpose: positional split of a frame into (predictors, targets).
        # ``Series.ix`` (used here originally) no longer exists in pandas >= 1.0.
        # On the string-labelled rows produced by ``iterrows`` its integer
        # slices were positional, so plain positional slicing of the value
        # matrix selects exactly the same cells. ``frame.values`` is also the
        # matrix ``iterrows`` walks over, so dtype upcasting matches as well.
        values = frame.values
        return values[:, 13:].tolist(), values[:, 1:13].tolist()

    X_tr, y_tr = _split(pd.read_csv(train_path))        # training predictors / targets
    X_test, y_test = _split(pd.read_csv(test_path))     # testing predictors / targets

    return X_tr, y_tr, X_test, y_test

In [None]:
# Load the train/test predictors (X_*) and targets (y_*) shared by the model cells below.
(X_tr, y_tr, X_test, y_test) = readInCSV()

## Decision Tree Stuff

In [57]:
# Fit one decision tree on all target columns at once; random_state=0 pins the seed.
regressor = DecisionTreeRegressor(random_state=0).fit(X_tr, y_tr)
y_pred = regressor.predict(X_test)

# Print the MSLE per target column first, then the uniform average over all targets
# ('uniform_average' is the metric's default aggregation).
for multioutput_mode in ('raw_values', 'uniform_average'):
    print(mean_squared_log_error(y_test, y_pred, multioutput=multioutput_mode))
# Disabled experiment (X and y are not defined in this notebook):
#cross_val_score(regressor, X, y, cv=10)

[ 3.53910076  3.45860838  3.09696     2.63942553  5.29946485  7.97723658
  8.89574233  9.05689771 10.86028886 11.57716424 12.05406518 12.97057244]
7.6187939056021206


## SVM/SVR Stuff

In [66]:
# SVR fits a single target, so MultiOutputRegressor trains one copy of it per target column.
clf = SVR(kernel="rbf", gamma='auto', C=1.0, epsilon=0.2)
multi_clf = MultiOutputRegressor(clf).fit(X_tr, y_tr)
y_pred = multi_clf.predict(X_test)

# Per-target MSLE first, then the uniform average over all targets (the metric's default).
for multioutput_mode in ('raw_values', 'uniform_average'):
    print(mean_squared_log_error(y_test, y_pred, multioutput=multioutput_mode))
# Disabled experiment (X and y_col_1 are not defined in this notebook):
#cross_val_score(clf, X, y_col_1, cv=10)

[1.81985953 1.99420302 1.89384106 1.73935281 2.91556383 3.83330683
 4.20553817 4.44171339 4.99353444 6.26566281 6.72376065 7.07012926]
3.9913721514098413


## MLP (NN) Stuff -- Note: np.abs() is applied to the predictions because the model sometimes predicts negative values, which mean_squared_log_error does not accept

In [74]:
# Very basic BPNN/MLP with default architecture.
# random_state is pinned (0, matching the decision-tree cell) because the MLP's
# weight initialisation and batch shuffling are random: without a seed every
# "Restart & Run All" reports different scores.
bpnn = MLPRegressor(max_iter=50000, random_state=0)
bpnn.fit(X_tr, y_tr)
y_pred = bpnn.predict(X_test)

# The network sometimes predicts negative values, which mean_squared_log_error
# rejects, so the scores are computed on the absolute values of the predictions.
# NOTE(review): abs() turns a prediction of -5 into +5; clipping at zero
# (np.clip(y_pred, 0, None)) may be the more defensible choice -- confirm.
y_pred_nonneg = np.abs(y_pred)
print(mean_squared_log_error(y_test, y_pred_nonneg, multioutput='raw_values'))  # per target column
print(mean_squared_log_error(y_test, y_pred_nonneg))                            # averaged over targets

[6.19473084 6.32898738 5.58531851 4.69080572 6.1831937  7.41246978
 6.93758257 7.47236649 7.45918365 9.07976858 9.4174763  9.56094075]
7.1935686892313555


## kNN Stuff


In [82]:
# k-nearest-neighbours regression using the 5 closest training rows for each prediction.
neigh = KNeighborsRegressor(n_neighbors=5).fit(X_tr, y_tr)
y_pred = neigh.predict(X_test)

# Per-target MSLE first, then the uniform average over all targets (the metric's default).
for multioutput_mode in ('raw_values', 'uniform_average'):
    print(mean_squared_log_error(y_test, y_pred, multioutput=multioutput_mode))

[1.88676265 2.32916487 2.33920056 2.08201809 2.9966589  4.1162746
 4.16615512 5.00810449 5.32552754 7.05017885 7.04512805 7.28511477]
4.3025240412773655


## Linear Regression -- Note: np.abs() is applied to the predictions because the model sometimes predicts negative values, which mean_squared_log_error does not accept

In [87]:
# Ordinary least-squares fit on all target columns at once.
reg = LinearRegression().fit(X_tr, y_tr)
y_pred = reg.predict(X_test)

# Scores are computed on the absolute values of the predictions because the
# model sometimes predicts negative values.
y_pred_abs = np.abs(y_pred)
for multioutput_mode in ('raw_values', 'uniform_average'):
    print(mean_squared_log_error(y_test, y_pred_abs, multioutput=multioutput_mode))

[3.83255946 4.26625897 3.58306293 2.95515938 4.10494008 5.14054079
 5.36255821 6.10773284 6.9171529  7.91628175 8.18611934 9.10135665]
5.622810274376421
