In [1]:
# KPMG Forecasting Project
# Support Vector Regression
# Author: Taotao Jiang
# Date: June 21, 2021

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
# Load the raw quarterly data set.
# NOTE(review): absolute local path; consider a configurable data directory.
DATA_PATH = 'C:/Users/jtt/Desktop/KPMG/data_michelle.csv'
df = pd.read_csv(DATA_PATH)
# Build a '<year>q<quarter>' label per row with vectorised string ops rather
# than a row-wise apply using x[0]/x[1]: integer keys on a label-indexed
# Series are deprecated positional access in recent pandas.
df['date'] = df['year'].astype(str) + 'q' + df['quarter'].astype(str)
# The two source columns are now redundant; reassign instead of inplace=True.
df = df.drop(columns=['year', 'quarter'], errors='ignore')
# infer_datetime_format is deprecated (pandas >= 2.0), so it is no longer
# passed; the quarter labels are parsed by to_datetime's default parser.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

In [3]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without missing or infinite rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that contain no NaN, +inf or -inf, cast to
        ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-inplace dropna: the caller's frame is left untouched.
    without_missing = df.dropna()
    # `axis` must be passed by keyword; the positional form `.any(1)` is
    # rejected by pandas >= 2.0.
    has_bad_value = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_bad_value].astype(np.float64)
# Rebind df to the cleaned frame returned by clean_dataset (values cast to float64).
df = clean_dataset(df)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both inputs are converted to NumPy arrays, the absolute error is divided
    element-wise by ``y_true``, and the mean of those ratios is returned
    multiplied by 100.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

In [4]:
# Hold out the last `split` fraction of rows (in their existing order) for testing.
split = 0.3
y = df['dv']
x = df.drop(columns=['dv'])
# Row position where the training rows end and the test rows begin.
i = int(len(x) * (1 - split))
x_train, x_test = x.iloc[:i], x.iloc[i:]
y_train, y_test = y.iloc[:i], y.iloc[i:]
# Z-score the features; the scaler is fitted on the training rows only and
# then applied to both partitions.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [5]:
# Fit one SVR per kernel (all other hyper-parameters left at their defaults)
# on the scaled training features, then predict the scaled test features.

# linear kernel
linear_svr = SVR(kernel='linear').fit(x_train_scaled, y_train)
linear_svr_y_predict = linear_svr.predict(x_test_scaled)

# Poly kernel
poly_svr = SVR(kernel='poly').fit(x_train_scaled, y_train)
poly_svr_y_predict = poly_svr.predict(x_test_scaled)

# rbf kernel
rbf_svr = SVR(kernel='rbf').fit(x_train_scaled, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test_scaled)

# Report MAPE, MSE and MAE on the test rows, one kernel at a time.
for kernel_label, kernel_prediction in (
    ('linear', linear_svr_y_predict),
    ('Poly', poly_svr_y_predict),
    ('RBF', rbf_svr_y_predict),
):
    print('Mean_absolute_percentage_error of {} SVR is'.format(kernel_label),
          mean_absolute_percentage_error(y_test, kernel_prediction))
    print('The mean squared error of {} SVR is'.format(kernel_label),
          mean_squared_error(y_test, kernel_prediction))
    print('The mean absoluate error of {} SVR is'.format(kernel_label),
          mean_absolute_error(y_test, kernel_prediction))

Mean_absolute_percentage_error of linear SVR is 608.522449819207
The mean squared error of linear SVR is 0.7083673179914985
The mean absoluate error of linear SVR is 0.3930673536789827
Mean_absolute_percentage_error of Poly SVR is 541.0399631348165
The mean squared error of Poly SVR is 17.19491618639979
The mean absoluate error of Poly SVR is 1.223691556781063
Mean_absolute_percentage_error of RBF SVR is 196.81399496024147
The mean squared error of RBF SVR is 0.09201762189586334
The mean absoluate error of RBF SVR is 0.19905414193432144
