In [1]:
import pandas as pd 
import numpy as np
import math
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# read a remote .csv file
df = pd.read_csv('df_regress2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year_Encoded                  650 non-null    float64
 1   Month_Encoded                 650 non-null    float64
 2   Quarter_Encoded               650 non-null    float64
 3   Timelag(Years)_disc_Encoded   650 non-null    float64
 4   DataClassesCount_str_Encoded  650 non-null    float64
 5   Length_disc_Encoded           650 non-null    float64
 6   IsVerified_Encoded            650 non-null    float64
 7   IsFabricated_Encoded          650 non-null    float64
 8   IsSensitive_Encoded           650 non-null    float64
 9   IsRetired_Encoded             650 non-null    float64
 10  IsSpamList_Encoded            650 non-null    float64
 11  IsMalware_Encoded             650 non-null    float64
 12  Vader_Sentiment_Encoded       650 non-null    float64
 13  Passw

In [2]:
df.head()

Unnamed: 0,Year_Encoded,Month_Encoded,Quarter_Encoded,Timelag(Years)_disc_Encoded,DataClassesCount_str_Encoded,Length_disc_Encoded,IsVerified_Encoded,IsFabricated_Encoded,IsSensitive_Encoded,IsRetired_Encoded,IsSpamList_Encoded,IsMalware_Encoded,Vader_Sentiment_Encoded,PasswordStored_Encoded,Description_Cluster_Encoded,PwnCount
0,0.113846,0.072308,0.283077,0.623077,0.223077,0.324615,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.533846,0.301538,14936670
1,0.127692,0.072308,0.283077,0.623077,0.083077,0.332308,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.287692,0.238462,8661578
2,0.027692,0.127692,0.283077,0.376923,0.106154,0.343077,0.056923,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.533846,0.301538,6414191
3,0.146154,0.056923,0.229231,0.623077,0.14,0.324615,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.050769,0.287692,0.301538,4009640
4,0.026154,0.098462,0.24,0.376923,0.135385,0.343077,0.056923,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.287692,0.135385,7485802


In [3]:
df.tail()

Unnamed: 0,Year_Encoded,Month_Encoded,Quarter_Encoded,Timelag(Years)_disc_Encoded,DataClassesCount_str_Encoded,Length_disc_Encoded,IsVerified_Encoded,IsFabricated_Encoded,IsSensitive_Encoded,IsRetired_Encoded,IsSpamList_Encoded,IsMalware_Encoded,Vader_Sentiment_Encoded,PasswordStored_Encoded,Description_Cluster_Encoded,PwnCount
645,0.004615,0.127692,0.283077,0.623077,0.14,0.343077,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.533846,0.301538,756737
646,0.098462,0.069231,0.247692,0.623077,0.223077,0.324615,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.067692,0.238462,172869660
647,0.146154,0.078462,0.247692,0.623077,0.135385,0.324615,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.287692,0.129231,4946850
648,0.113846,0.084615,0.229231,0.623077,0.083077,0.332308,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.533846,0.195385,3474763
649,0.127692,0.072308,0.283077,0.623077,0.223077,0.332308,0.943077,0.995385,0.916923,0.998462,0.983077,0.993846,0.806154,0.533846,0.238462,1298651


### Standardisation

In [4]:
# Separate explanatory variables (x) from the response variable (y)
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# Split dataset into 60% training and 40% test sets 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# apply standardisation to explanatory variables
# Build scaler on the train set
std_scaler = preprocessing.StandardScaler().fit(X_train)

# Standardise the train set
X_train_std = std_scaler.transform(X_train)

# Standardise the test set
X_test_std = std_scaler.transform(X_test)

### PCA with standardisation

In [5]:
# fit a 2-component PCA on standardised training set
pca_std = PCA(n_components=2)
pca_std.fit(X_train_std)

# reduce dimensionality of standardised training and test sets
X_train_pca_std = pca_std.transform(X_train_std)
X_test_pca_std = pca_std.transform(X_test_std)

### Build a linear regression model

In [6]:
# Compare the dimensionality of the original data vs. its dimensionality reduced version
print("Dimension of original data:", X_train.shape)
print("Dimension of PCA-reduced data:", X_train_pca_std.shape)
print("\n")

# Build a linear regression model
print("Build Linear Regression Model")
model = LinearRegression()

# Train (fit) the linear regression model using the training set
model.fit(X_train_pca_std, y_train)

# Print the intercept and coefficient learned by the linear regression model
print("Intercept: ", model.intercept_)
print("Coefficient: ", model.coef_)

# Use linear regression to predict the values of (y) in the test set
# based on the values of x in the test set
y_pred = model.predict(X_test_pca_std)

# Compute standard performance metrics of the linear regression:

# Mean Absolute Error
mae = metrics.mean_absolute_error(y_test, y_pred)
# Mean Squared Error
mse = metrics.mean_squared_error(y_test, y_pred)
# Root Mean Square Error
rmse =  math.sqrt(metrics.mean_squared_error(y_test, y_pred))
# Normalised Root Mean Square Error
y_max = y_test.max()
y_min = y_test.min()
rmse_norm = rmse / (y_max - y_min)
# R-Squared
r_2 = metrics.r2_score(y_test, y_pred)

print("MLP performance:")
print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("RMSE (Normalised): ", rmse_norm)
print("R^2: ", r_2)

Dimension of original data: (455, 15)
Dimension of PCA-reduced data: (455, 2)


Build Linear Regression Model
Intercept:  21199885.797802195
Coefficient:  [11492906.74831821 -3929975.90425385]
MLP performance:
MAE:  24144879.194677226
MSE:  2581915508240376.0
RMSE:  50812552.66408465
RMSE (Normalised):  0.09973851736118655
R^2:  0.040906697283606186


### SVR without PCA for regression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# separate explanatory variables (x) from the response variable (y)
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# split dataset into 60% training and 40% test sets 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# create SVR with standardisation via standardisation --> classification pipeline
pipe = make_pipeline(StandardScaler(), SVR(kernel="linear"))
pipe.fit(X_train, y_train)
r2_std = pipe.score(X_test, y_test)

# print performance
print("SVR performance:")
print("R^2: ", r2_std)

SVR performance:
R^2:  -0.06429481408505255


### SVR with PCA for regression

In [8]:
from sklearn.svm import SVR

svr = SVR(kernel='linear')  

# train the model using the training set
svr.fit(X_train_pca_std, y_train)

# predict the classes in the test set
y_hat = svr.predict(X_test_pca_std)

# Mean Absolute Error
mae = metrics.mean_absolute_error(y_test, y_hat)
# Mean Squared Error
mse = metrics.mean_squared_error(y_test, y_hat)
# Root Mean Square Error
rmse =  math.sqrt(metrics.mean_squared_error(y_test, y_hat))

# R-Squared
r_2 = metrics.r2_score(y_test, y_hat)


# print performance
print("SVR performance:")
print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R^2: ", r_2)

SVR performance:
MAE:  14039082.819372779
MSE:  2865124255320182.0
RMSE:  53526855.458920635
R^2:  -0.06429566574029444


### Random Forest Regressor without PCA

In [9]:
from sklearn.ensemble import RandomForestRegressor

# separate explanatory variables (x) from the response variable (y)
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


# build, apply and evaluate the baseline predictor using the existing avg. temp. values
# y_base = X_test[:, df.columns.get_loc("average")]
# mae = metrics.mean_absolute_error(y_test, y_base)
# mse = metrics.mean_squared_error(y_test, y_base)
# rmse =  math.sqrt(mse)
# print("---Baseline---")
# print("MAE: %.2f " % mae)
# print("MSE: %.2f " % mse)
# print("RMSE: %.2f " % rmse)


# build, apply and evaluate RF Regressor 
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse =  math.sqrt(metrics.mean_squared_error(y_test, y_pred))

# Normalised Root Mean Square Error
y_max = y_test.max()
y_min = y_test.min()
rmse_norm = rmse / (y_max - y_min)
# R-Squared
r_2 = metrics.r2_score(y_test, y_pred)

print("---Random Forest Regressor---")
print("MAE: %.2f " % mae)
print("MSE: %.2f " % mse)
print("RMSE: %.2f " % rmse)
print("RMSE (Normalised): ", rmse_norm)
print("R^2: ", r_2)

---Random Forest Regressor---
MAE: 29825159.05 
MSE: 4789963123572678.00 
RMSE: 69209559.48 
RMSE (Normalised):  0.13584947986185855
R^2:  -0.7793074705252325


### Random Forest Regressor with PCA

In [10]:
from sklearn.ensemble import RandomForestRegressor

# build, apply and evaluate RF Regressor 
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
rf.fit(X_train_pca_std, y_train)
y_pred = rf.predict(X_test_pca_std)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse =  math.sqrt(metrics.mean_squared_error(y_test, y_pred))

# Normalised Root Mean Square Error
y_max = y_test.max()
y_min = y_test.min()
rmse_norm = rmse / (y_max - y_min)
# R-Squared
r_2 = metrics.r2_score(y_test, y_pred)

print("---Random Forest Regressor---")
print("MAE: %.2f " % mae)
print("MSE: %.2f " % mse)
print("RMSE: %.2f " % rmse)
print("RMSE (Normalised): ", rmse_norm)
print("R^2: ", r_2)

---Random Forest Regressor---
MAE: 25983319.45 
MSE: 3519239203137786.00 
RMSE: 59323175.94 
RMSE (Normalised):  0.1164437782162259
R^2:  -0.30727699632849137


### Random Forest Regressor vs. Linear Regression - Optimisation

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR

# separate explanatory variables (x) from the response variable (y)
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


# specify a list of estimators to investigate
estimators = [
    ("Random Forest Regressor", RandomForestRegressor(random_state=0)),
    ("Linear Regression", LinearRegression()),
    ("SVR", SVR())
]

# specify each estimator's hyperparameters
param_sets = [
    # RFR hyperparameters
    {
        "n_estimators": [100, 250, 500, 750, 1000],
        "max_depth": [10, 50, 100]
    },
    # LR hyperparameters
    {
        "fit_intercept": [True, False]
    },
    {
        'kernel': ['linear'], 
         'C': [1, 10, 100, 1000]
    }
]

# run GridSearchCV for each estimator and report its perfomance on the test set
for (est, params) in zip(estimators, param_sets):
    print("Finding the best hyperparameters for %s ..." % est[0])
    
    model = GridSearchCV(
        est[1],
        param_grid=params,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
        verbose=1
    )
    
    # train
    model.fit(X_train, y_train)
    print("Best estimator parameters: ", model.best_params_)
    
    # evaluate on test set
    y_pred = model.predict(X_test)
    
    # print metrics
    print("--- Performance of %s ---" % est[0])
    print("\tMAE: %.2f" % metrics.mean_absolute_error(y_test, y_pred))
    print("\tMSE: %.2f" % metrics.mean_squared_error(y_test, y_pred))
    print("\tRMSE: %.2f" % metrics.mean_squared_error(y_test, y_pred, squared=False))
    print("\tR^2: %.2f" % metrics.r2_score(y_test, y_pred))
    print()

    # put actual and predicted temps side-by-side
    data_out = {
        "Actual": y_test,
        "Predicted": np.round(y_pred, 0)
    }
    
    # print to .csv file
    pd.DataFrame(data_out).to_csv("%s output.csv" % est[0], index=None)

Finding the best hyperparameters for Random Forest Regressor ...
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best estimator parameters:  {'max_depth': 10, 'n_estimators': 250}
--- Performance of Random Forest Regressor ---
	MAE: 28830720.37
	MSE: 4573106802583914.00
	RMSE: 67624749.93
	R^2: -0.70

Finding the best hyperparameters for Linear Regression ...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best estimator parameters:  {'fit_intercept': False}
--- Performance of Linear Regression ---
	MAE: 29892499.27
	MSE: 3994224029073115.00
	RMSE: 63199873.65
	R^2: -0.48

Finding the best hyperparameters for SVR ...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best estimator parameters:  {'C': 1000, 'kernel': 'linear'}
--- Performance of SVR ---
	MAE: 14037640.55
	MSE: 2864999234259729.00
	RMSE: 53525687.61
	R^2: -0.06



- Conclusion: SVR performs the best

#### Comment:

Accuracy results is better than R-squared, suggesting that for this dataset, it's more appropriate to treat it as a classification problem instead of a regression problem.

If we treat it as a regression problem, we need to have more data or better features to improve the model's performance.

When it comes to treating it as a classification problem, SVM... performs the best with accuracy...
without any fine-tuning.

Next, fine-tuning the parameters separately. Then report the findings after optimisation.



In [13]:
"""
COMPARE THE PERFORMANCE OF THE LINEAR REGRESSION MODEL
VS.
A DUMMY MODEL (BASELINE) THAT USES MEAN AS THE BASIS OF ITS PREDICTION
"""

# Compute mean of values in (y) training set
y_base = np.mean(y_train)

# Replicate the mean values as many times as there are values in the test set
y_pred_base = [y_base] * len(y_test)


# Optional: Show the predicted values of (y) next to the actual values of (y)
df_base_pred = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_base})
print(df_base_pred)

# Compute standard performance metrics of the baseline model:

# Mean Absolute Error
mae = metrics.mean_absolute_error(y_test, y_pred_base)
# Mean Squared Error
mse = metrics.mean_squared_error(y_test, y_pred_base)
# Root Mean Square Error
rmse =  math.sqrt(metrics.mean_squared_error(y_test, y_pred_base))

# Normalised Root Mean Square Error
y_max = y.max()
y_min = y.min()
rmse_norm = rmse / (y_max - y_min)

# R-Squared
r_2 = metrics.r2_score(y_test, y_pred_base)

print("Baseline performance:")
print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("RMSE (Normalised): ", rmse_norm)
print("R^2: ", r_2)

       Actual     Predicted
0      254867  2.119989e+07
1     3122898  2.119989e+07
2     1476783  2.119989e+07
3      561991  2.119989e+07
4     5003937  2.119989e+07
..        ...           ...
190   7040725  2.119989e+07
191  66147869  2.119989e+07
192    819478  2.119989e+07
193     26596  2.119989e+07
194   1414677  2.119989e+07

[195 rows x 2 columns]
Baseline performance:
MAE:  25886045.14708369
MSE:  2739056805226860.0
RMSE:  52335999.13278488
RMSE (Normalised):  0.06771344193702854
R^2:  -0.017465919883335346
