In [1]:
import  os
import numpy as np
import pandas as pd
import scipy.stats
import sklearn
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [2]:
class DataModels:
    data=[]
    results=[]
    x_train=[]
    x_test=[]
    y_train=[]
    y_test=[]
    filepath='Regression_data'
    
    def __init__(self):
        filepath='Regression_data'
        load_data=np.genfromtxt(os.path.join(filepath, filename), delimiter=',' ,\
                             usecols=np.arange(unpredictable, columns), skip_header=skiprows)
        
        self.data=load_data[:,:-1]
        self.results = load_data[:,-1]
        
    def pre_process(self):
        
        #self.data=self.missing_values(self.data)
        np.random.seed(0)
        self.x_train, self.x_test,\
        self.y_train, self.y_test =sklearn.model_selection.train_test_split(self.data, self.results, test_size=0.3, \
                                                                                random_state=15)
        scaler=sklearn.preprocessing.StandardScaler().fit(self.x_train)
        self.x_train=scaler.transform(self.x_train)
        self.x_test=scaler.transform(self.x_test)
        
    def missing_values(self, data):
        imputer = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
        imputer.fit(data)
        return imputer.transform(data)
        
class Communities(DataModels):
    
    def __init__(self):
        
        load_data=np.genfromtxt(os.path.join(self.filepath, 'communities.data'), delimiter=',' ,\
                                usecols=np.arange(5, 128), skip_header=0)
        
        self.data=load_data[:,:-1]
        self.data=self.missing_values(self.data)
        self.results = load_data[:,-1]
        self.pre_process()
        
class Bike(DataModels):
    def __init__(self):
        
        load_data=np.loadtxt(os.path.join(self.filepath, 'hour.csv'), delimiter=',' ,\
                             usecols=np.arange(2, 17), skiprows=1)
        
        self.data=load_data[:,:-1]
        self.results = load_data[:,-1]
        self.pre_process()

class Facebook(DataModels):
    def __init__(self):
        
        def to_int(s):
            if s.decode('utf-8')=="Photo":
                return 0
            elif s.decode('utf-8')=="Status":
                return 1
            elif s.decode('utf-8')=="Link":
                return 2
            else:
                return 3
        
        load_data=np.genfromtxt(os.path.join(self.filepath, 'dataset_Facebook.csv'), delimiter=';' , converters={1:to_int},\
                                skip_header=1, dtype=np.int32 )
        

       
        self.data=load_data[:, :-1]
        self.results = load_data[:,-1]
        self.pre_process()
        
class Concrete(DataModels):
    def __init__(self):
        
        load_data=np.loadtxt(os.path.join(self.filepath, 'Concrete_data.csv'), delimiter=',' ,\
                                 skiprows=1)
        
        self.data=load_data[:,:-1]
        self.results = load_data[:,-1]
        self.pre_process()
        
class Student(DataModels):
    def __init__(self):
        
        load_data=np.loadtxt(os.path.join(self.filepath, 'student-por.csv'), delimiter=';' ,\
                                 usecols=(2, 6, 7,12, 13, 14, 23,24,25,26,27,28,29,30,31,32 ),\
                              skiprows=1, dtype=np.int32, converters={30: lambda x: int(x.decode('utf-8').strip("\"")),\
                                                                      31:lambda x: int(x.decode('utf-8').strip("\""))})
        
        self.data=load_data[: ,:-1]
        self.results = load_data[:,-1]
        self.pre_process()
        
class Sgemm(DataModels):
    def __init__(self):
        
        load_data=np.genfromtxt(os.path.join(self.filepath, 'sgemm_product.csv'), delimiter=',' ,\
                                 skip_header=1)
        
        self.data=load_data[:,:14]
        self.results = load_data[:,-1]
        self.pre_process()
        
class WineQuality(DataModels):
    def __init__(self):
        
        load_data1=np.loadtxt(os.path.join(self.filepath, 'winequality-red.csv'), delimiter=';' ,\
                                 skiprows=1)
        load_data2=np.loadtxt(os.path.join(self.filepath, 'winequality-white.csv'), delimiter=';' ,\
                                 skiprows=1)
        load_data=np.vstack((load_data1, load_data2))
        self.data=load_data[:,:-1]
        self.results = load_data[:,-1]
        self.pre_process()
        

        
class Qsar(DataModels):
    def __init__(self):
        
        load_data=np.loadtxt(os.path.join(self.filepath, 'qsar_aquatic_toxicity.csv'), delimiter=';' ,\
                                skiprows=0)
        
        self.data=load_data[:,:-1]
        self.results = load_data[:,-1]
        self.pre_process()

In [3]:
obj=[]
obj.append(Communities())
obj.append(Bike())
obj.append(Facebook())
obj.append(WineQuality())
obj.append(Sgemm())
obj.append(Qsar())
obj.append(Concrete())
obj.append(Student())

titles = ["Communities and Crime", "Facebook Metrics", "Bike Sharing", "Wine Quality", "SGEMM GPU Kernel Performance",\
          "QSAR aquatic toxicity", "Concrete Compressive Strength", "Student Performance"]

In [4]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as sm

In [5]:
test_scores=[]
train_scores=[]
regressor_names = ["Linear Regression", "Support vector regression", "Decision tree regression", "Random forest regression",\
                  "k-nearest neighbours regression","AdaBoost regression","Gaussian process regression",\
                   "Neural network regression"]


def apply_regression(obj, train_score, test_score):
    regressors=[LinearRegression(), SVR(), DecisionTreeRegressor(), RandomForestRegressor(),KNeighborsRegressor(),\
                AdaBoostRegressor(), GaussianProcessRegressor(), MLPRegressor(max_iter=100) ]
    for i in range(len(regressors)):
        regressor=regressors[i];
        print(regressor_names[i])
        regressor.fit(obj.x_train, obj.y_train)
        y_predict=regressor.predict(obj.x_test)
        train_score.append(regressor.score(obj.x_train, obj.y_train))
        test_score.append(regressor.score(obj.x_test, obj.y_test))
        print(regressor.score(obj.x_train, obj.y_train))
        print(regressor.score(obj.x_test, obj.y_test))
        print("")


In [None]:

for i in range(len(obj)):
    train_scr=[]
    test_scr=[]
    
    print("Data Set: %s \n" % titles[i])
    apply_regression(obj[i], train_scr, test_scr)
    test_scores.append(test_scr)
    train_scores.append(train_scr)
    print("\n\n")

plot_results()
    

Data Set: Communities and Crime 

Linear Regression
0.7176516859671732
-13.567792057926543

Support vector regression
0.8466239502669322
0.5484174429852048

Decision tree regression
1.0
0.30563524577195655

Random forest regression
0.9501177238391615
0.6426970230370699

k-nearest neighbours regression
0.726298259300233
0.5562014281173444

AdaBoost regression
0.60915152653999
0.4639573471114167

Gaussian process regression
1.0
-1.033748403173643

Neural network regression
0.9323728888329681
0.1138597442663759




Data Set: Facebook Metrics 

Linear Regression
1.0
1.0

Support vector regression
0.8947240682999966
0.8973617231584267

Decision tree regression
1.0
0.9991445023238076

Random forest regression
0.9999624005219961
0.9998356422983286

k-nearest neighbours regression
0.9748460333220453
0.9587817287026642

AdaBoost regression
0.980307068900035
0.9796319707954219

Gaussian process regression
1.0
0.9510164414767213

Neural network regression




0.9995061697256841
0.9993952254325315




Data Set: Bike Sharing 

Linear Regression
0.999999965728199
0.9999998638103724

Support vector regression
-0.028686255681363404
-0.024394544066142387

Decision tree regression
1.0
0.5127227311312624

Random forest regression
0.9980636507253364
0.5510224503958738

k-nearest neighbours regression
0.9278751408481243
0.5076142141260527

AdaBoost regression
0.9866994749668593
0.5385456877714837

Gaussian process regression
1.0
-0.11781055431065801

Neural network regression




-0.2678216553751014
0.011382825031927335




Data Set: Wine Quality 

Linear Regression
0.285690484842979
0.3049950147008007

Support vector regression
0.47194341083283775
0.4069542686220885

Decision tree regression
1.0
0.036161659585560946

Random forest regression
0.9307014240973198
0.5099763556620643

k-nearest neighbours regression
0.5678927080658398
0.35397448972591783

AdaBoost regression
0.3334060621926691
0.31184576947607934

Gaussian process regression
1.0
-1.076196330707333

Neural network regression




0.40220058168262296
0.37365930247122126




Data Set: SGEMM GPU Kernel Performance 

Linear Regression
0.4066694686311795
0.4090655049007984

Support vector regression


In [None]:
def plot_results:
    for i in range(len(obj)):
    fig, ax = plt.subplots()
    ax.title(titles[i])
    ax.scatter(regressor_names, train_scores[i], label='Training Score', color='red', marker='o')
    ax.scatter(regressor_names, test_scores[i], label='Testing Score', color='black', marker='o')
    ax.legend()

In [None]:
plot_results() #for direct_fit regressors

In [7]:
from sklearn.gaussian_process.kernels import RBF

#params for grid search and random search cv

params = []
params.append({})
params.append({'C':1, 'kernel':'rbf', 'gamma':'auto', 'epsilon':0.1})
params.append({'max_depth':(None,), 'min_samples_leaf':(1,)})
params.append({'n_estimators':(10,), 'max_depth':(None,)})
params.append({})
params.append({'n_estimators':(10,),'learning_rate':(1.,)})
params.append({'kernel':(1.0* RBF(1.0)), 'alpha':(1e-10,), 'normalize_y':(True,)})
params.append({'early_stopping':(False,), 'hidden_layer_sizes':(100,), 'activation':('relu',), 'batch_size':('auto',),\
              'max_iter':200})


In [9]:
def apply_regression_rs(obj, train_score, test_score):
    regressors=[LinearRegression(), SVR(), DecisionTreeRegressor(), RandomForestRegressor(),KNeighborsRegressor(),\
                AdaBoostRegressor(), GaussianProcessRegressor(), MLPRegressor(max_iter=100) ]
    for i in range(len(regressors)):
        
        regressor=RandomizedSearchCV(estimators=regressors[i], param_distributions=params[i], cv=3, random_state=0);
        print(regressor_names[i])
        regressor.fit(obj.x_train, obj.y_train)
        y_predict=regressor.predict(obj.x_test)
        train_score.append(regressor.score(obj.x_train, obj.y_train))
        test_score.append(regressor.score(obj.x_test, obj.y_test))
        print(regressor.score(obj.x_train, obj.y_train))
        print(regressor.score(obj.x_test, obj.y_test))
        print("")


In [None]:
for i in range(len(obj)):
    train_scr=[]
    test_scr=[]
    
    print("Data Set: %s \n" % titles[i])
    apply_regression(obj[i], train_scr, test_scr)
    test_scores.append(test_scr)
    train_scores.append(train_scr)
    print("\n\n")


Data Set: Communities and Crime 

Linear Regression
0.7176516859671732
-13.567792057926543

Support vector regression
0.8466239502669322
0.5484174429852048

Decision tree regression
1.0
0.2580374344264894

Random forest regression
0.9501469242458166
0.6442525698258561

k-nearest neighbours regression
0.726298259300233
0.5562014281173444

AdaBoost regression
0.6199268672754379
0.4804591952411349

Gaussian process regression
1.0
-1.033748403173643

Neural network regression
0.937874366889564
0.11135161491388346




Data Set: Facebook Metrics 

Linear Regression
1.0
1.0

Support vector regression
0.8947240682999966
0.8973617231584267

Decision tree regression
1.0
0.9991196797144126

Random forest regression
0.9999577244284972
0.9998358714399964

k-nearest neighbours regression
0.9748460333220453
0.9587817287026642

AdaBoost regression
0.9804668600006925
0.9796728541713506

Gaussian process regression
1.0
0.9510164414767213

Neural network regression




0.9995709454726293
0.9994573275047457




Data Set: Bike Sharing 

Linear Regression
0.999999965728199
0.9999998638103724

Support vector regression
-0.028686255681363404
-0.024394544066142387

Decision tree regression
1.0
0.5934997741835728

Random forest regression
0.998185930302215
0.5542348525193883

k-nearest neighbours regression
0.9278751408481243
0.5076142141260527

AdaBoost regression
0.9905312273110678
0.5622754941323203

Gaussian process regression
1.0
-0.11781055431065801

Neural network regression




-0.21815199995795975
0.04929407424836274




Data Set: Wine Quality 

Linear Regression
0.285690484842979
0.3049950147008007

Support vector regression
0.47194341083283775
0.4069542686220885

Decision tree regression
1.0
0.040963701139227204

Random forest regression
0.9314858899475336
0.5086090086299547

k-nearest neighbours regression
0.5678927080658398
0.35397448972591783

AdaBoost regression
0.336072112232109
0.32409993457286557

Gaussian process regression
1.0
-1.076196330707333

Neural network regression




0.4135472906440293
0.3854295388229365




Data Set: SGEMM GPU Kernel Performance 

Linear Regression
0.4066694686311795
0.4090655049007984

Support vector regression


In [77]:
plot_results() #for randomizedSearch cv regressors

In [78]:
def apply_regression_gs(obj, train_score, test_score):
    regressors=[LinearRegression(), SVR(), DecisionTreeRegressor(), RandomForestRegressor(),KNeighborsRegressor(),\
                AdaBoostRegressor(), GaussianProcessRegressor(), MLPRegressor(max_iter=100) ]
    for i in range(len(regressors)):
        
        regressor=GridSearchCV(estimators=regressors[i], param_distributions=params[i], cv=3, random_state=0);
        print(regressor_names[i])
        regressor.fit(obj.x_train, obj.y_train)
        y_predict=regressor.predict(obj.x_test)
        train_score.append(regressor.score(obj.x_train, obj.y_train))
        test_score.append(regressor.score(obj.x_test, obj.y_test))
        print(regressor.score(obj.x_train, obj.y_train))
        print(regressor.score(obj.x_test, obj.y_test))
        print("")


In [None]:
plot_results() #for gridSearchCV regressors