In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, BayesianRidge, PassiveAggressiveRegressor, SGDRegressor
from sklearn import svm
from sklearn import tree
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Load the raw data; 'dataset.csv' is a relative path, so it is resolved
# against the current working directory of the kernel.
dataset = pd.read_csv('dataset.csv')

In [None]:
# Discard every row that contains at least one missing value, then preview
# the first five remaining rows as the cell output.
dataset = dataset.dropna(axis=0, how='any')
dataset.head(5)

In [None]:
# Summary statistics of `dataset`, shown as the cell output.
dataset.describe()

In [None]:
def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The estimator is scored on the notebook-level ``X`` and ``y`` variables,
    so both must be defined before this function is called.
    """
    return cross_val_score(model, X, y, cv=10).mean()

def print_evaluate(true: list, predicted: list):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    None
        The metrics are only written to stdout.
    """
    # Reuse evaluate() so the printed report and the returned metrics are
    # always computed the same way (and MSE is computed only once).
    mae, mse, rmse, r2_square = evaluate(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true: list, predicted: list):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    list
        ``[MAE, MSE, RMSE, R2]`` in exactly that order.
    """
    mean_abs_err = metrics.mean_absolute_error(true, predicted)
    mean_sq_err = metrics.mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE obtained on the line above.
    root_mean_sq_err = np.sqrt(mean_sq_err)
    r2 = metrics.r2_score(true, predicted)
    return [mean_abs_err, mean_sq_err, root_mean_sq_err, r2]

def create_folder(directory: str):
    """Create ``directory`` (including missing parents) if it does not exist.

    Parameters
    ----------
    directory : str
        Path of the directory to create.

    Notes
    -----
    Best-effort: an ``OSError`` raised while creating the directory (for
    example because the path already exists as a regular file) is reported
    on stdout instead of being propagated to the caller.
    """
    try:
        # exist_ok=True makes the call idempotent and removes the race between
        # a separate existence check and the creation itself.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' +  directory)

def clean_folder(directory: str):
    """Delete the regular files located directly inside ``directory``.

    Sub-directories and their contents are left alone, and a ``directory``
    that does not exist is silently ignored. A failure on an individual
    entry is printed and the loop moves on to the next entry; an ``OSError``
    raised outside the per-entry handling prints a generic error message.
    """
    try:
        if not os.path.exists(directory):
            return
        for entry_name in os.listdir(directory):
            target = os.path.join(directory, entry_name)
            try:
                if not os.path.isfile(target):
                    continue
                os.unlink(target)
            except Exception as err:
                print(err)
    except OSError:
        print('Error: Cleaning directory. ' +  directory)

In [None]:
# Column dtypes and non-null counts of the frame that is about to be plotted.
dataset.info()

# Global matplotlib text styling: 13.5 pt base font, bold text and axis labels.
font = {'size': 13.5}
plt.rc('font', **font)
plt.rcParams.update({"axes.labelweight": "bold", "font.weight": "bold"})

# Pairwise scatter plots of the columns with a kernel-density estimate on the diagonal.
sm = pd.plotting.scatter_matrix(dataset, alpha=0.5, figsize=(16, 16), diagonal='kde')
fig = plt.gcf()

# Remove files left in 'figures' by earlier runs, then make sure the folder exists.
clean_folder('figures')
create_folder('figures')

# Vertical x labels and horizontal, right-aligned y labels on every panel.
for ax in sm.flatten():
    x_label, y_label = ax.xaxis.label, ax.yaxis.label
    x_label.set_rotation(90)
    y_label.set_rotation(0)
    y_label.set_horizontalalignment('right')

# Fit everything into the canvas first, then remove the gaps between panels.
fig.tight_layout()
fig.subplots_adjust(wspace=0, hspace=0)
fig.savefig('figures/scatter_matrix.png')


In [None]:
# Show the column labels; the feature and target names used when building
# X and y must match these exactly.
dataset.columns

In [None]:
# Candidate models, keyed by the display name that is later used in figure
# titles, figure file names and as the row label of the results table.
# Every estimator with a stochastic fitting procedure gets random_state=0 so
# that re-running the notebook reproduces the same metrics.
regressors = {
    "Linear Regression": LinearRegression(),
    "Bayesian Ridge Regression": BayesianRidge(),
    "AdaBoost Regression": AdaBoostRegressor(random_state=0, n_estimators=3000),
    "Random Forest Regression": RandomForestRegressor(random_state=0, n_estimators=3000),
    # NOTE(review): "Agressive" is misspelled; the key is kept as-is because it
    # also ends up in the saved figure names and the results index.
    "Passive Agressive Regression": PassiveAggressiveRegressor(random_state=0),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=0, n_estimators=1500),
    "SGD Regression": SGDRegressor(max_iter=15000, tol=1e-3, random_state=0),
    "Artificial Neural Network Regression": MLPRegressor(max_iter=5000000, hidden_layer_sizes=(50, 50), learning_rate_init=0.01, random_state=0),
    "Support Vector Regression": svm.SVR(kernel='rbf', C=1e3, gamma=0.1),
    "Decision Tree Regression": tree.DecisionTreeRegressor(random_state=0, max_depth=1000),
    "Extra Tree Regression": tree.ExtraTreeRegressor(random_state=0, max_depth=1000),
    "PLS Regression": PLSRegression(n_components=3),
    "KNN Regression": KNeighborsRegressor(n_neighbors=3)
}

In [None]:
# Collects one [MAE, MSE, RMSE, R2] list per model, keyed by the model's display name.
result_dict = {}

# Feature matrix and target vector.
# NOTE(review): these column names look like template placeholders — confirm
# they match the labels in `dataset.columns`.
X = dataset[['input column name 1',
             'input column name 2',
             'input column name 3',
             'input column name 4',
             'input column name 5']]

y = dataset['output column name']

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4777)

# The min-max scaling is fitted on the training rows only and then applied
# unchanged to the test rows, so the test set does not influence the scaling.
pipeline = Pipeline([('std_scalar', MinMaxScaler())])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# The plots below are written to 'figures/'; make sure the folder exists even
# when this cell is executed without the earlier plotting cell.
os.makedirs('figures', exist_ok=True)

for j in regressors:
    regr = regressors[j].fit(X_train, y_train)
    # Flatten so that an estimator returning an (n, 1) column is handled
    # exactly like one returning a 1-D array.
    pred = np.ravel(regr.predict(X_test))

    # Predicted-vs-true scatter on log-log axes.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(y_test, pred, c='crimson', s=20, alpha=0.5)
    ax.set_yscale('log')
    ax.set_xscale('log')

    # Dashed diagonal = perfect prediction, spanning the joint value range.
    # NOTE(review): non-positive values cannot be placed on log axes — confirm
    # that targets and predictions are strictly positive.
    p1 = max(np.max(pred), np.max(y_test))
    p2 = min(np.min(pred), np.min(y_test))
    ax.plot([p1, p2], [p1, p2], 'k--', lw=2)

    ax.set_title("Prediction using " + j, fontsize=20, fontweight="bold")
    ax.set_xlabel('True Values', fontsize=20, fontweight="bold")
    ax.set_ylabel('Predictions', fontsize=20, fontweight="bold")
    ax.axis('equal')
    fig.tight_layout()
    fig.savefig(os.path.join('figures', j + '_Result.png'))

    result_dict[j] = evaluate(y_test, pred)

# One row per model, one column per metric.
results = pd.DataFrame.from_dict(result_dict, orient='index', columns=['MAE', 'MSE', 'RMSE', 'R2 Square'])

In [None]:
# Display the metrics table: one row per model, columns MAE, MSE, RMSE and R2 Square.
results