# Model Development - OLS model

In [12]:
import pandas as pd

import statsmodels.api as sm

from datetime import datetime

import matplotlib.pyplot as plt

from sklearn.metrics import r2_score,accuracy_score,mean_squared_error

from math import sqrt

from statsmodels.tsa.vector_ar.vecm import coint_johansen

from sklearn.model_selection import train_test_split

# Input the data in the CSV format

df1 = pd.read_csv('HistoricalData.csv',parse_dates = True)

# Converting the "date" column (values in YYYYMM form) to quarterly periods.
# The whole column is parsed in one vectorised call instead of a Python loop,
# and the resulting key is named 'Date' so the aggregated index keeps that name.

quarter = (pd.to_datetime(df1['date'].astype(str), format='%Y%m')
             .dt.to_period('Q')
             .rename('Date'))

# Converting Monthly data to Quarterly data: average the months of each quarter.
# The raw 'date' column is dropped before aggregating and numeric_only=True is
# passed explicitly, so only numeric columns are averaged and no date-like
# helper column can end up among the features.

df = (df1.drop(columns='date')
         .groupby(quarter)
         .mean(numeric_only=True))

df = df.dropna()  # Dropping NaN values

# Data splitting into train and test

X = df.drop('VIX',axis =1) # Independent Features

y = df['VIX'] # Target or Dependent Feature

# Splitting data into training and testing (random_state fixes the split)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,
                                                 random_state = 6)

# Fitting Linear model
# X_train is passed as-is (no constant column is added), so the regression
# is fitted without an intercept term.

model = sm.OLS(y_train,X_train)

result = model.fit()

# NOTE(review): this is not the last expression of the cell, so the summary
# object it returns is presumably not displayed - confirm whether that is intended.
result.summary()

# Prediction VALUES FOR X_test

y_test_predicts = result.predict(X_test)

# Calculating Root Mean Square Error (RMSE) and R2-Value

rms = sqrt(mean_squared_error(y_test,y_test_predicts))

# The score is stored under its own name: assigning it to `r2_score` would
# overwrite the imported function and make this cell fail when re-run.
r2_value = r2_score(y_test,y_test_predicts)

# Printing The values

print("The coefficients are :\n")

print(result.params)

print('The RMSE is :',rms)

print('The R2-Value is:',r2_value)

The coefficients are :

SP500      0.002560
NASDAQ    -0.007412
LIBOR3M    5.467375
CPI       -0.020006
HPI        0.204270
CDS5Y      0.141404
dtype: float64
The RMSE is : 2.4326432397658615
The R2-Value is: 0.8392880085125279


- The coefficients are calculated above. Based on them, we can conclude that two variables,
viz. 'NASDAQ' and 'CPI', have a negative relationship with the VIX index.


- The above model has an RMSE of about 2.43 on the test set.


- The R2 score on the test set is about 0.84, which indicates a good fit of the predicted values.

# Model Implementation

In [93]:
import numpy as np

import pandas as pd

from statsmodels.iolib.smpickle import load_pickle


class MacroVariable:
    """Base class for a macro variable whose values come from a scenario CSV file.

    The scenario file is read with ``pandas.read_csv`` and must contain a
    ``Variable`` column, which is used to select rows.
    """

    def __init__(self,
                 my_path,
                 metadata_file,
                 scenario):
        """Store the three locations on the instance; nothing is read here.

        Parameters
        ----------
        my_path, metadata_file :
            Stored as attributes; not used by this class itself.
        scenario :
            Path (or anything ``pandas.read_csv`` accepts) of the scenario file
            read by :meth:`get_initial_values`.
        """
        self.my_path = my_path

        self.metadata_file = metadata_file

        self.scenario = scenario


    def get_initial_values(self, variables):
        """Return the scenario rows belonging to one or several variables.

        Parameters
        ----------
        variables : str or tuple/list of str
            A single variable name, or a sequence of names.

        Returns
        -------
        pandas.DataFrame
            * ``str``: the rows whose ``Variable`` equals the name, keeping
              their original row index.
            * ``tuple`` / ``list``: the rows of each name, concatenated in the
              order the names were given, with a fresh 0..n-1 index.
            * anything else, or an empty sequence: an empty DataFrame.
        """
        dataframe = pd.read_csv(self.scenario)

        if isinstance(variables, str):
            return dataframe.loc[dataframe['Variable'] == variables]

        if isinstance(variables, (tuple, list)):
            # Collect the per-variable slices and concatenate once:
            # pd.concat expects a sequence of frames as its first argument.
            frames = [dataframe[dataframe['Variable'] == variable]
                      for variable in variables]

            if frames:
                return pd.concat(frames, ignore_index=True)

        return pd.DataFrame()

    

class VIX(MacroVariable):
    """VIX projection built from a pickled regression result and a scenario file."""

    # Location used by get_value() when no explicit path is given.
    DEFAULT_COEFFICIENTS_PATH = "C:/Users/nitin/Dropbox/PC/Downloads/model.pickle"

    def __init__(self,
                 my_path,
                 metadata_file,
                 scenario):
        """Store the paths and define the explanatory variables of the model."""
        super().__init__(my_path, metadata_file, scenario)

        # Explanatory variables; their order defines the column order of the
        # matrix passed to the model's predict() in get_value().
        self.indep_variable = ("SP500","NASDAQ", "LIBOR3M","CPI","HPI","CDS5Y")


    def get_coefficients(self, coefficients):
        """Load and return the pickled object stored at ``coefficients``.

        NOTE(review): unpickling can execute arbitrary code - only load files
        from a trusted source.
        """
        coefficients_matrix = load_pickle(coefficients)

        return coefficients_matrix

    def get_value(self, variable, coefficients_path=None):
        """Predict the VIX for every time step of the scenario.

        Parameters
        ----------
        variable :
            Label repeated in the ``Variable`` column of the result.
        coefficients_path : optional
            Path of the pickled model; defaults to ``DEFAULT_COEFFICIENTS_PATH``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Variable``, ``TimeStamp`` ('0', '1Q', '2Q', ...) and
            ``VIX`` (the predictions), one row per time step.

        Raises
        ------
        ValueError
            If the explanatory variables do not all have the same number of values.
        """
        if coefficients_path is None:
            coefficients_path = self.DEFAULT_COEFFICIENTS_PATH

        results = self.get_coefficients(coefficients_path)

        # One column of scenario values per explanatory variable.
        columns = [np.asarray(self.get_initial_values(var)['Value'], dtype=float)
                   for var in self.indep_variable]

        # Shape (time steps, variables); the number of time steps comes from
        # the scenario data rather than a hard-coded constant.
        design = np.column_stack(columns)

        predictions = list(results.predict(design))

        n_steps = len(predictions)

        timestamps = ['0'] + [str(quarter) + 'Q' for quarter in range(1, n_steps)]

        labels = [variable] * n_steps

        return pd.DataFrame(list(zip(labels, timestamps, predictions)),
                            columns=['Variable','TimeStamp','VIX'])

if __name__ == '__main__':

    # The same scenario file is used for all three constructor arguments.
    scenario = "C:/Users/nitin/OneDrive/Quant Assignment[656]/Quant Assignment/EconomicScenario.csv"

    my_path = "C:/Users/nitin/OneDrive/Quant Assignment[656]/Quant Assignment/EconomicScenario.csv"

    metadata_file = "C:/Users/nitin/OneDrive/Quant Assignment[656]/Quant Assignment/EconomicScenario.csv"

    vix_model = VIX(my_path, metadata_file, scenario)

    # Predict, discard the constant 'Variable' label and index the rows by time step.
    Y = vix_model.get_value('VIX').drop(columns='Variable')

    Y.index = Y.TimeStamp

# 'TimeStamp' now lives in the index, so the duplicate column is removed.
Y = Y.drop(columns='TimeStamp')

print('The Output of OSL-Model is:')

Y

The Output of OSL-Model is:


Unnamed: 0_level_0,VIX
TimeStamp,Unnamed: 1_level_1
0,23.276488
1Q,39.774727
2Q,42.438568
3Q,41.225141
4Q,36.68585
5Q,31.430648
6Q,26.409331
7Q,21.777895
8Q,17.221134
9Q,14.566825


- Although this model predicts well, we will also look at other models such as linear regression, Lasso, Ridge and Elastic Net.


- Finally, we will compare the RMSE value and R2 score to get the best model for the prediction.

# IMPLEMENTING -  'LINEAR REGRESSION MODEL'

In [59]:
from sklearn import model_selection

import numpy as np

import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_squared_error

from math import sqrt

# Same split parameters as before (test_size=0.2, random_state=6).
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(X,y,test_size=0.2,random_state=6)

lin = LinearRegression()

lin.fit(xtrain, ytrain)

predictions = lin.predict(xtest)

# Score against `ytest`, the target split created in this cell, and keep the
# result under its own name so the imported `r2_score` function is not overwritten.
r2_value = r2_score(ytest,predictions)

# The value printed here is the square root of the MSE (i.e. the RMSE).
print('Mean-Squared-Error:',sqrt(mean_squared_error(ytest, predictions)))

print('The R2 Score is:',r2_value)

Mean-Squared-Error: 2.3598013662529795
The R2 Score is: 0.8487684737034524


# IMPLEMENTING -  'LASSO MODEL'

In [60]:
from sklearn.linear_model import Lasso

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the installed version before re-running this cell.
lassoreg = Lasso(alpha=0.001, normalize=True)

lassoreg.fit(xtrain, ytrain)

# Square root of the test-set MSE (printed under the label 'Mean-Squared-Error').
lasso_test_predictions = lassoreg.predict(xtest)
lasso_rmse = sqrt(mean_squared_error(ytest, lasso_test_predictions))
print('Mean-Squared-Error:', lasso_rmse)

# R2 on the test set, as returned by the estimator's own score().
lasso_r2 = lassoreg.score(xtest, ytest)
print('R2 Value/Coefficient of Determination: {}'.format(lasso_r2))

Mean-Squared-Error: 2.1546493831177242
R2 Value/Coefficient of Determination: 0.873920447837161


# IMPLEMENTING -  'RIDGE MODEL'

In [61]:
from sklearn.linear_model import Ridge

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the installed version before re-running this cell.
ridgeReg = Ridge(alpha=0.001, normalize=True)

ridgeReg.fit(xtrain,ytrain)

# Square root of the test-set MSE (printed under the label 'Mean-Squared-Error').
ridge_test_predictions = ridgeReg.predict(xtest)
ridge_rmse = sqrt(mean_squared_error(ytest, ridge_test_predictions))
print('Mean-Squared-Error:', ridge_rmse)

# R2 on the test set, as returned by the estimator's own score().
ridge_r2 = ridgeReg.score(xtest, ytest)
print('R2 Value/Coefficient of Determination: {}'.format(ridge_r2))

Mean-Squared-Error: 2.1758734039398115
R2 Value/Coefficient of Determination: 0.8714243624946346


# IMPLEMENTING -  'ELASTIC NET'

In [89]:
from sklearn.linear_model import ElasticNet

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the installed version before re-running this cell.
Elas = ElasticNet(alpha=0.001, normalize=True)

Elas.fit(xtrain, ytrain)

# Square root of the test-set MSE (printed under the label 'Mean-Squared-Error').
elastic_test_predictions = Elas.predict(xtest)
elastic_rmse = sqrt(mean_squared_error(ytest, elastic_test_predictions))
print('Mean-Squared-Error:', elastic_rmse)

# R2 on the test set, as returned by the estimator's own score().
elastic_r2 = Elas.score(xtest, ytest)
print('R2 Value/Coefficient of Determination: {}'.format(elastic_r2))

Mean-Squared-Error: 1.915034032745153
R2 Value/Coefficient of Determination: 0.9004034186463632


In [69]:
# Test-set metrics of the five models, one row per model:
# (name, R2 score, error value shown in the 'MSE' column).
# The numbers are hard-coded here rather than read from the fitted models.
comparison_rows = [
    ('OLS', 0.8392, 2.4326),
    ('Linear Regression', 0.8487, 2.3598),
    ('Lasso', 0.8739, 2.1546),
    ('Ridge', 0.8714, 2.1758),
    ('ElasticNet', 0.9004, 1.9150),
]

comparison_columns = ['Model', 'R2-Score', 'MSE']

# Transpose the rows into a column -> list-of-values mapping.
model = {column: list(values)
         for column, values in zip(comparison_columns, zip(*comparison_rows))}

best_model = pd.DataFrame(model)

best_model

Unnamed: 0,Model,R2-Score,MSE
0,OLS,0.8392,2.4326
1,Linear Regression,0.8487,2.3598
2,Lasso,0.8739,2.1546
3,Ridge,0.8714,2.1758
4,ElasticNet,0.9004,1.915


- Therefore, we see that the 'ElasticNet' model is the best of the models compared above.