In [64]:
# Feature Selection example for ENGR400
# Prepared by Dr. Raju Gottumukkala

import pandas as pd # Python module to work with tabular data
import numpy as np # Python module which supports MATLAB like matrix operation
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge # Linear Regression, Stocastic Gradient Decent
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error # Python modules to calculate different error metrics
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from matplotlib import pyplot as plt # Python plotting functions
import seaborn as sns # Special purpose plotting function
import yfinance as yf


In [65]:
# These are subfunctions to calculate rolling average of mean and standard deviation for  window length 'w'
#df is the data frame, col is the feature, and w is the window length
def rollm(df, w, col):
    return df[col].rolling(w).mean().shift(1)

def rolls(df, w, col):
    return df[col].rolling(w).std().shift(1)

In [66]:
def modelf (X_train, X_test, y_train, y_test):

    svr = SVR(kernel = 'sigmoid', C=0.1, gamma=0.1)
    svr.fit(X_train, y_train)
    ysvr_pred= svr.predict(X_test)                            #

    rfr = RandomForestRegressor()
    rfr.fit(X_train, y_train)
    yrfr_pred= rfr.predict(X_test)                           #

    regr = LinearRegression() 
    regr.fit(X_train, y_train)
    yLR_pred= regr.predict(X_test)
    R_LR = np.corrcoef(y_test, yLR_pred)
    
    
    # Calculate performance scores
    MAE_svr=mean_absolute_error(y_test,ysvr_pred)
    MSE_svr=mean_squared_error(y_test,ysvr_pred)
    MAPE_svr=mean_absolute_percentage_error(y_test,ysvr_pred)


    MAE_rfr=mean_absolute_error(y_test,yrfr_pred)
    MSE_rfr=mean_squared_error(y_test,yrfr_pred)
    MAPE_rfr=mean_absolute_percentage_error(y_test,yrfr_pred)

    MAE_LR=mean_absolute_error(y_test, yLR_pred)
    MSE_LR=mean_squared_error(y_test, yLR_pred)
    MAPE_LR=mean_absolute_percentage_error(y_test, yLR_pred)
    
        
    return np.array([[MAE_svr ,MAE_rfr ,MAE_LR],[MSE_svr ,MSE_rfr ,MSE_LR],[MAPE_svr ,MAPE_rfr ,MAPE_LR]])



In [67]:
#STEP 0a: Read data
# Here we are reading data and asking python to index the column as a date
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

# df = pd.read_csv('TSLA.csv',index_col=0,parse_dates=["Date"])
df = yf.download('CVX','2000-06-01','2022-09-29')
#print(df.to_string())
df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-01,46.21875,46.21875,44.71875,45.5,20.054611,2455200
2000-06-02,45.0,45.28125,44.40625,45.28125,19.958187,4548400
2000-06-05,45.375,45.625,45.125,45.21875,19.930641,2703200
2000-06-06,45.25,46.59375,45.21875,46.4375,20.467833,2582000
2000-06-07,46.9375,46.96875,45.84375,46.1875,20.357634,3026200


In [68]:
# step1: Data pre-processing
data = df.dropna(axis=0)
df.size

33708

In [69]:
# step 2a: Feature extraction/computation

#data = generate_new_features(df)
 # We will store the new features in this dataframe

df_new = pd.DataFrame()
df_new['Open'] = df['Open']

# Next day variables (note the shift operator)      #

# df_new['open+1'] = df['Open'].shift(-1)             #
df_new['close+1'] = df['Close'].shift(-1)           #
df_new['high+1'] = df['High'].shift(-1)             #
df_new['low+1'] = df['Low'].shift(-1)               #
df_new['volume+1'] = df['Volume'].shift(-1)         #

# Previous day variables (note the shift operator)

df_new['open_1'] = df['Open'].shift(1)
df_new['close_1'] = df['Close'].shift(1)
df_new['high_1'] = df['High'].shift(1)
df_new['low_1'] = df['Low'].shift(1)
df_new['volume_1'] = df['Volume'].shift(1)

# avergae price
df_new['avg_price_5'] = rollm(df, 5, 'Open')
df_new['avg_price_30'] = rollm(df, 21, 'Open') 
df_new['avg_price_365'] = rollm(df, 252, 'Open')
df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30']       ##
df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365']     ##
df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365']   ##

# average volume                                                                      ##
df_new['avg_volume_5'] = rollm(df, 5, 'Volume')
df_new['avg_volume_30'] = rollm(df, 21, 'Volume')
df_new['avg_volume_365'] = rollm(df, 252, 'Volume') 
df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30']
df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365']
df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365']


# Valume Std. Dev. 
df_new['std_volume_5'] = rolls(df, 5, 'Volume')     #
df_new['std_volume_30'] = rolls(df, 21, 'Volume')                                       ##
df_new['std_volume_365'] = rolls(df, 252, 'Volume')                                     ##
df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30']      ##
df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365']    ##
df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365']  ##

# standard deviation of prices                     ##
df_new['std_price_5'] = rolls(df, 5, 'Close')
df_new['std_price_30'] = rolls(df, 21, 'Close')
df_new['std_price_365'] = rolls(df, 252, 'Close')
df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30']
df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365']
df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365']

# return                                           ##
df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1)
df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1)
df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1)
df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1)
df_new['moving_avg_5'] = rollm(df_new, 5, 'return_5')
df_new['moving_avg_30'] = rollm(df_new, 21, 'return_30') 
df_new['moving_avg_365'] = rollm(df_new, 252, 'return_365')



# The target variable       #
df_new['open+1'] = df['Open'].shift(-1)            #

# Note that we will have some null values and nan, we have to clearn them

#check how many nan values
print("rows with null values=",df_new.isnull().sum().sum())

#drop all the naans and round data to 3 digits
data = df_new.dropna(axis=0)
data = df_new.round(decimals=3)

# Print the open values to make sure we understand the shift operator and target variable
data[['Open','open_1', 'open+1']].head(10)



rows with null values= 4064


Unnamed: 0_level_0,Open,open_1,open+1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-01,46.219,,45.0
2000-06-02,45.0,46.219,45.375
2000-06-05,45.375,45.0,45.25
2000-06-06,45.25,45.375,46.938
2000-06-07,46.938,45.25,46.219
2000-06-08,46.219,46.938,46.5
2000-06-09,46.5,46.219,46.406
2000-06-12,46.406,46.5,46.0
2000-06-13,46.0,46.406,46.031
2000-06-14,46.031,46.0,46.531


In [70]:
# step 2a: Feature extraction/computation

# Understand the range of values for each feature
df_new.describe()


Unnamed: 0,Open,close+1,high+1,low+1,volume+1,open_1,close_1,high_1,low_1,volume_1,...,ratio_std_price_5_365,ratio_std_price_30_365,return_1,return_5,return_30,return_365,moving_avg_5,moving_avg_30,moving_avg_365,open+1
count,5618.0,5617.0,5617.0,5617.0,5617.0,5617.0,5617.0,5617.0,5617.0,5617.0,...,5366.0,5366.0,5616.0,5612.0,5596.0,5365.0,5607.0,5575.0,5113.0,5617.0
mean,86.523671,86.537103,87.352382,85.675544,8288285.0,86.51378,86.51925,87.33454,85.658332,8287248.0,...,0.188193,0.365136,0.000356,0.001708,0.007185,0.07953,0.001748,0.007439,0.06926,86.530847
std,31.200147,31.209005,31.426711,30.962409,4318687.0,31.194115,31.203791,31.421608,30.958299,4319388.0,...,0.129444,0.211419,0.017574,0.036315,0.070806,0.214758,0.029308,0.055138,0.165575,31.198289
min,31.110001,30.924999,31.67,30.655001,1067000.0,31.110001,30.924999,31.67,30.655001,1067000.0,...,0.008554,0.069025,-0.221248,-0.336987,-0.502614,-0.560584,-0.252548,-0.309768,-0.284382,31.110001
25%,59.185,59.25,59.740002,58.5,5384000.0,59.18,59.240002,59.720001,58.490002,5383700.0,...,0.102304,0.216876,-0.008065,-0.016907,-0.031928,-0.049068,-0.013542,-0.023444,-0.046431,59.200001
50%,89.945,89.919998,91.029999,88.809998,7238400.0,89.940002,89.910004,90.980003,88.800003,7238300.0,...,0.156549,0.317191,0.000693,0.002717,0.008255,0.078723,0.00265,0.007496,0.086164,89.949997
75%,111.529999,111.580002,112.349998,110.68,10047800.0,111.529999,111.580002,112.339996,110.599998,10047800.0,...,0.234206,0.464857,0.008945,0.021584,0.047431,0.212182,0.018178,0.040034,0.182285,111.529999
max,180.949997,181.130005,182.399994,180.25,57231000.0,180.949997,181.130005,182.399994,180.25,57231000.0,...,1.671183,1.912089,0.227407,0.330894,0.58347,0.891372,0.232624,0.250792,0.494462,180.949997


In [71]:
# step 2b: Feature standardization / normalization

# store all the data in the feature table data frame named 'ft'
ft = data[['avg_price_5','std_volume_5','avg_price_30','close_1','volume_1','avg_price_365','ratio_avg_price_5_30','ratio_avg_price_5_365','ratio_avg_price_30_365'
           ,'avg_volume_5','avg_volume_30','avg_volume_365','ratio_avg_volume_5_30','ratio_avg_volume_5_365','ratio_avg_volume_30_365','std_volume_30','std_volume_365'
           ,'ratio_std_volume_5_30','ratio_std_volume_5_365','ratio_std_volume_30_365','std_price_5','std_price_30','std_price_365','ratio_std_price_5_30'
           ,'ratio_std_price_5_365','ratio_std_price_30_365','return_1','return_5','return_30','return_365','moving_avg_5','moving_avg_30','moving_avg_365'
           ,'close+1','high+1','low+1','volume+1','open_1','high_1','low_1','Open','open+1']]

len_idx = len(data.index)
print(len_idx)

#drop all the naans and round data to 3 digits
ft = ft.dropna(axis=0)
ft = ft.round(decimals=3)

#seperate the data into training and testing sets
ft2_X =ft.drop(['open+1', 'close+1', 'low+1', 'high+1'], axis=1)
ft2_Y =ft['open+1']

#x = ft.values #returns a numpy 
min_max_scaler = preprocessing.MinMaxScaler()
# standard_scaler =  preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(ft2_X)
#x_scaled = standard_scaler.fit_transform(ft)

ft2 = pd.DataFrame(x_scaled,columns=ft2_X.columns)

5618


In [127]:
# step 3: Fit the model with all the features (Without any selection)

X_train, X_test, y_train, y_test =train_test_split(ft2_X,ft2_Y,test_size=0.3, shuffle=0.1)
all_array = modelf (X_train, X_test, y_train, y_test)
all_feat = pd.DataFrame(my_array,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])


In [151]:
# step 2c: Feature selection with variance threshold

from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=.05)
selector.fit(X)
X_V=X.columns[selector.get_support()]

print("SELECTED FEATURES=",X_V)

#print(X[X_V].info())
X_train, X_test, y_train, y_test =train_test_split(X[X_V],Y,test_size=0.3, shuffle=0.1)
my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", len(X_V),"FEATURES")
print(sel_feat)

SELECTED FEATURES= Index(['avg_price_365', 'avg_volume_365', 'std_volume_365'], dtype='object')

PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  3 FEATURES
                                     SVR       RFR        LR
mean_absolute_error             0.060104  0.008435  0.051671
mean_squared_error              0.006641  0.000196  0.005409
mean_absolute_percentage_error  0.515685  0.030425  0.273743


In [176]:
# step 2c: Feature selection with mutual info

from sklearn.feature_selection import SelectKBest, mutual_info_regression
# #Select top k features based on mutual info regression

mutual_info = mutual_info_regression(X,Y)
selector = SelectKBest(mutual_info_regression, k =2)
selector.fit(X, Y)
X.columns[selector.get_support()]

mutual_info=pd.Series(mutual_info)
mutual_info.index = X.columns
T = mutual_info.sort_values(ascending=False)

# change k to change the features
k=10
print(T[:k])
#X[T[:].index]
F = X[T[:k].index]
X_train, X_test, y_train, y_test =train_test_split(F,Y,test_size=0.3, shuffle=0.1)
my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", k,"FEATURES")
print(sel_feat)


Open              3.017080
close_1           2.891619
high_1            2.818157
low_1             2.789252
open_1            2.696200
avg_price_5       2.509278
avg_price_30      2.164278
avg_price_365     1.955140
std_volume_365    1.677398
avg_volume_365    1.602198
dtype: float64

PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  10 FEATURES
                                     SVR       RFR        LR
mean_absolute_error             0.038399  0.007368  0.006923
mean_squared_error              0.002032  0.000122  0.000106
mean_absolute_percentage_error  0.135620  0.073381  0.098543


In [287]:
# step 2c: Feature selection with sequential feature selector

from sklearn.feature_selection import SequentialFeatureSelector

K=5

# Instantiate the estimator and the model
regr = LinearRegression() 
sfs = SequentialFeatureSelector(regr,n_features_to_select=K)

# Fit the data to determine the k_features which give the
# most optimal model performance
sfs.fit(X,Y)


#print(sfs.get_support())
# Transform the training data set to dataset having k_features
# giving most optimal model performance

print(list(X.columns[sfs.get_support()]))

F=X[X.columns[sfs.get_support()]]

X_train, X_test, y_train, y_test =train_test_split(F,Y,test_size=0.3, shuffle=0.1)

my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", K,"FEATURES")
print(sel_feat)


['std_price_30', 'return_5', 'moving_avg_5', 'moving_avg_30', 'Open']

PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  5 FEATURES
                                     SVR       RFR        LR
mean_absolute_error             0.049878  0.007682  0.007067
mean_squared_error              0.003446  0.000115  0.000099
mean_absolute_percentage_error  0.596324  0.032035  0.030616


In [182]:
print(X[X.columns[sfs.get_support()]])

      std_price_30  return_5  moving_avg_5  moving_avg_30      Open
0         0.018588  0.504491      0.483539       0.582888  0.083022
1         0.020515  0.501497      0.495885       0.579323  0.082355
2         0.022069  0.497006      0.506173       0.575758  0.081854
3         0.019707  0.497006      0.510288       0.573975  0.081854
4         0.016971  0.520958      0.512346       0.563280  0.081854
...            ...       ...           ...            ...       ...
5107      0.181649  0.474551      0.537037       0.614973  0.848905
5108      0.188052  0.428144      0.522634       0.606061  0.844167
5109      0.193584  0.450599      0.481481       0.598930  0.791778
5110      0.261780  0.392216      0.454733       0.586453  0.750534
5111      0.332774  0.351796      0.432099       0.568627  0.741991

[5112 rows x 5 columns]


In [288]:
# step 2c: Feature selection with RFE based feature selector

from sklearn.feature_selection import RFE
K=10
regr = LinearRegression() 
rfe = RFE(regr, n_features_to_select=K)
rfe.fit(X,Y)

#print(rfe.get_support())
# Transform the training data set to dataset having k_features
# giving most optimal model performance

print(list(X.columns[rfe.get_support()]))

F=X[X.columns[sfs.get_support()]]

X_train, X_test, y_train, y_test =train_test_split(F,Y,test_size=0.3, shuffle=0.1)

my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", K,"FEATURES")
print(sel_feat)


['avg_price_5', 'close_1', 'volume_1', 'ratio_avg_price_5_30', 'ratio_avg_price_5_365', 'ratio_avg_price_30_365', 'volume+1', 'high_1', 'low_1', 'Open']

PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  10 FEATURES
                                     SVR       RFR        LR
mean_absolute_error             0.052428  0.007474  0.006889
mean_squared_error              0.003761  0.000123  0.000107
mean_absolute_percentage_error  1.322868  0.088724  0.096918


In [280]:
# step 2c: Feature selection with RF approach (using GINI index)

rfr = RandomForestRegressor()
rfr.fit(X, Y)
importances = rfr.feature_importances_
indices = np.argsort(importances)

k=12

X1 = X.columns[sorted_idx]
X2 = rfr.feature_importances_[sorted_idx]
A = pd.DataFrame(columns=['X1','X2'])
A.X1=X1
A.X2=X2
B=A.sort_values('X2', axis=0,ascending=False).reset_index()
C=B.X1[0:K].values.tolist()

X_train, X_test, y_train, y_test =train_test_split(X[C],Y,test_size=0.3, shuffle=0.1)

my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", k,"FEATURES")
print(sel_feat)


#plt.barh(X.columns, rfr.feature_importances_)

# sorted_idx = rfr.feature_importances_.argsort()
# plt.barh(X.columns[sorted_idx], rfr.feature_importances_[sorted_idx])
# plt.xlabel("Random Forest Feature Importance")

# sorted_idx = rfr.feature_importances.argsort()
# rfr.feature_importances_[sorted_idx]



PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  12 FEATURES
                                         SVR           RFR            LR
mean_absolute_error             4.137171e-02  7.475027e-03  6.808347e-03
mean_squared_error              2.340014e-03  1.076066e-04  9.243188e-05
mean_absolute_percentage_error  2.278100e+11  1.861262e+10  1.578077e+10


In [283]:
from sklearn.decomposition import PCA
# define transform
pca = PCA(n_components=5)

#pca = PCA(n_components = 5)
# prepare transform on dataset
pca.fit(X)
# apply transform to dataset
transformed = pca.transform(X)
data_pca = pd.DataFrame(transformed,columns=['PC1','PC2','PC3','PC4','PC5'])
data_pca.head()

X_train, X_test, y_train, y_test =train_test_split(data_pca,Y,test_size=0.3, shuffle=0.1)

my_array2 = modelf (X_train, X_test, y_train, y_test)
sel_feat = pd.DataFrame(my_array2,index=['mean_absolute_error' ,'mean_squared_error' ,'mean_absolute_percentage_error'],columns=['SVR' ,'RFR' ,'LR'])

print("\nPERFORMANCE WITH ALL FEATURES\n")
print(all_feat)
print(" \nPERFORMANCE WITH ", '5',"FEATURES")
print(sel_feat)



PERFORMANCE WITH ALL FEATURES

                                         SVR           RFR            LR
mean_absolute_error             5.762633e-02  8.328509e-03  4.864857e-02
mean_squared_error              6.057956e-03  2.641730e-04  4.928911e-03
mean_absolute_percentage_error  3.297115e+11  1.666113e+10  1.499604e+11
 
PERFORMANCE WITH  5 FEATURES
                                     SVR       RFR        LR
mean_absolute_error             0.046543  0.009084  0.012051
mean_squared_error              0.002863  0.000169  0.000273
mean_absolute_percentage_error  0.956350  0.094490  0.065482


                                         SVR           RFR            LR
mean_absolute_error             4.381486e-02  8.636371e-03  1.188400e-02
mean_squared_error              2.779750e-03  1.682787e-04  3.074177e-04
mean_absolute_percentage_error  2.020490e+11  4.170131e+10  4.682058e+10

**after feature selection using variance threshold**

                                     SVR       RFR        LR
mean_absolute_error             0.040470  0.008344  0.011891
mean_squared_error              0.002473  0.000158  0.000299
mean_absolute_percentage_error  0.274373  0.032087  0.045612
