In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import random as rand
import sys
import collections
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
def readInCSV(train_path="../data/PCA_split_data_train.csv",
              test_path="../data/PCA_split_data_test.csv"):
    """Load the train/test CSV files and split each into predictors and targets.

    Columns are selected purely by position: column 0 is skipped, columns
    1 through 12 are returned as the targets ("labels") and every column from
    13 onward is returned as the predictors.

    Original author's note: the actual training set was split 70-30 because
    the real testing set has no outcomes.

    Parameters
    ----------
    train_path : str
        Path of the training CSV. Defaults to the path the notebook always used.
    test_path : str
        Path of the held-out CSV. Defaults to the path the notebook always used.

    Returns
    -------
    X_tr, y_tr, X_test, y_test : list of list
        Predictors and targets for the training file, then for the test file.
        Each is a list with one inner list per CSV row.
    """
    def _split(path):
        # One vectorised positional slice per side. This replaces the old
        # row-by-row `row.ix[...]` loop: `.ix` no longer exists in pandas >= 1.0,
        # and on a string-labelled row it behaved positionally, exactly like iloc.
        frame = pd.read_csv(path)
        targets = frame.iloc[:, 1:13].values.tolist()
        predictors = frame.iloc[:, 13:].values.tolist()
        return predictors, targets

    X_tr, y_tr = _split(train_path)
    X_test, y_test = _split(test_path)
    return X_tr, y_tr, X_test, y_test

In [None]:
# Load both splits once; every model cell below reads these four names.
X_tr, y_tr, X_test, y_test = readInCSV()

## Decision Tree Stuff

In [4]:
# Single decision tree fitted on all target columns at once (seeded for repeatability).
regressor = DecisionTreeRegressor(random_state=0).fit(X_tr, y_tr)
y_pred = regressor.predict(X_test)

# First line: RMSLE per target column. Second line: sqrt of the MSLE averaged
# over all targets (not the mean of the per-target RMSLEs).
msle_per_target = mean_squared_log_error(y_test, y_pred, multioutput='raw_values')
msle_overall = mean_squared_log_error(y_test, y_pred)
print(np.sqrt(msle_per_target))
print(np.sqrt(msle_overall))
# Scored on the single held-out split only; no cross-validation is run here.

[1.88124979 1.85973342 1.75981817 1.62463089 2.30205666 2.82440022
 2.98257311 3.00946801 3.29549524 3.40252322 3.47189648 3.6014681 ]
2.7602162787727558


## Random Forest Stuff

In [7]:
# Random forest: 1000 trees, each capped at depth 10, seeded for repeatability.
RFregr = RandomForestRegressor(
    max_depth=10,
    random_state=0,
    n_estimators=1000,
).fit(X_tr, y_tr)
y_pred = RFregr.predict(X_test)

# Per-target RMSLE first, then sqrt of the target-averaged MSLE.
msle_per_target = mean_squared_log_error(y_test, y_pred, multioutput='raw_values')
msle_overall = mean_squared_log_error(y_test, y_pred)
print(np.sqrt(msle_per_target))
print(np.sqrt(msle_overall))

[1.58148569 1.68412589 1.5351987  1.43564457 1.74710074 1.96884323
 2.07025245 2.20303742 2.3352524  2.63922237 2.70155399 2.76537859]
2.1053092064943013


## SVM/SVR Stuff

In [7]:
# SVR predicts a single output, so MultiOutputRegressor wraps it to fit one
# independent RBF-kernel SVR per target column.
clf = SVR(kernel="rbf", gamma='auto', C=1.0, epsilon=0.2)
multi_clf = MultiOutputRegressor(clf)
multi_clf.fit(X_tr, y_tr)
y_pred = multi_clf.predict(X_test)

# Per-target RMSLE first, then sqrt of the target-averaged MSLE.
msle_per_target = mean_squared_log_error(y_test, y_pred, multioutput='raw_values')
msle_overall = mean_squared_log_error(y_test, y_pred)
print(np.sqrt(msle_per_target))
print(np.sqrt(msle_overall))
# Scored on the single held-out split only; no cross-validation is run here.

[1.34902169 1.41216253 1.37616898 1.31884526 1.70750222 1.95788325
 2.05074088 2.10753728 2.23462177 2.5031306  2.59302153 2.65897147]
1.9978418734749357


## MLP (NN) Stuff -- Note: np.abs() is applied to the predictions before scoring because the model sometimes predicts negative values, which mean_squared_log_error does not accept.

In [8]:
# Very basic BPNN/MLP with default architecture. MLP training is stochastic
# (random weight initialisation and shuffling), so it is seeded like the tree
# and forest models to make the score repeatable across Restart & Run All.
# NOTE: the numbers recorded under this cell came from an unseeded run and
# will not match exactly after re-execution.
bpnn = MLPRegressor(max_iter=50000, random_state=0)
bpnn.fit(X_tr, y_tr)
y_pred = bpnn.predict(X_test)

# The network occasionally predicts negative values, which
# mean_squared_log_error rejects; np.abs() is the original workaround and is
# kept so scores stay comparable with the linear-regression cell.
y_pred_nonneg = np.abs(y_pred)
print(np.sqrt(mean_squared_log_error(y_test, y_pred_nonneg, multioutput='raw_values')))
print(np.sqrt(mean_squared_log_error(y_test, y_pred_nonneg)))

[2.56396127 2.50369399 2.48554303 2.19349642 2.50821333 2.74924817
 2.60702612 2.80357631 2.77247622 2.95671674 2.98332751 3.06660366]
2.693680194902107


## kNN Stuff


In [9]:
# k-nearest-neighbours regression using the 5 closest training rows.
neigh = KNeighborsRegressor(n_neighbors=5).fit(X_tr, y_tr)
y_pred = neigh.predict(X_test)

# Per-target RMSLE first, then sqrt of the target-averaged MSLE.
msle_per_target = mean_squared_log_error(y_test, y_pred, multioutput='raw_values')
msle_overall = mean_squared_log_error(y_test, y_pred)
print(np.sqrt(msle_per_target))
print(np.sqrt(msle_overall))

[1.37359479 1.52616017 1.52944453 1.44291999 1.73108605 2.02886042
 2.04111615 2.23787946 2.30771045 2.65521729 2.65426601 2.69909518]
2.07425264644338


## Linear Regression -- Note: np.abs() is applied to the predictions before scoring because the model sometimes predicts negative values, which mean_squared_log_error does not accept.

In [11]:
# Ordinary least squares fitted on all target columns at once.
reg = LinearRegression().fit(X_tr, y_tr)
y_pred = reg.predict(X_test)

# The linear model can predict negative values, which mean_squared_log_error
# rejects, so the absolute value of each prediction is scored instead.
y_pred_nonneg = np.abs(y_pred)
msle_per_target = mean_squared_log_error(y_test, y_pred_nonneg, multioutput='raw_values')
msle_overall = mean_squared_log_error(y_test, y_pred_nonneg)
print(np.sqrt(msle_per_target))
print(np.sqrt(msle_overall))

[1.95769238 2.06549243 1.89289802 1.7190577  2.02606517 2.26727607
 2.3157198  2.47138278 2.63004808 2.81358877 2.86113952 3.01684548]
2.371246565495967
