## CQF Exam 3 - Machine Learning 

In [1]:
import project_path
from builtins import staticmethod
from abc import abstractmethod, ABCMeta
from sklearn import linear_model, svm, model_selection
from sklearn.metrics import confusion_matrix, classification_report
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import logging.config
import os
import json
%cd /home/nilesh/workspace/CQFMachineLearning/
%matplotlib inline

/home/nilesh/workspace/CQFMachineLearning


In [2]:
from main import ConfigurationFactory, DataModel, LogisticalRegression, SupportVectorMachine, GraphLib, SupervisedLearning

In [3]:
ConfigurationFactory._configure_log()
log = logging.getLogger("cqf_logger")
log.info("Initialising Program For CQF Exam 3 Machine Learning with Python")
#initialise data model
data = DataModel()

20191106 23:50:31 - INFO : Initialising Program For CQF Exam 3 Machine Learning with Python


In [4]:
class LogisticalRegression(SupervisedLearning):
    
    def __init__(self):
        super(LogisticalRegression, self).__init__()
        self._init_classifier()

    def _init_classifier(self):
        self.lm = linear_model.LogisticRegression(C = 1e5)
    
    def fit_model(self, x_param, y_param):
        log.info("Fitting model against specified parameters")
        self.lm.fit(x_param, y_param)
        
    def run_classifier(self, data):
        log.info("Running Logisitical Regression Classifier")
        #perform the regression on returns here
        returns_sign = np.sign(data.model.log_return)
        lagged_headers = [header for header in list(data.model) if "lagged" in header]
        #fitting done here
        self.fit_model(data.model[lagged_headers], returns_sign)
        self.run_prediction(data.model[lagged_headers])
        data.model["logistic_return"] = abs(data.model.log_return) * data.model.log_pred
        
        self.output_results()
        self.split_training_set(lagged_headers, data.model.log_pred)

    def run_prediction(self, model):
        log.info("Running Logistical Regression Prediction")
        data.model["log_pred"] = self.lm.predict(model)
    
    def split_training_set(self, x_features, y_param):
        #Test on training set
        self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(data.model[x_features], y_param,
                                                                            test_size=0.7, shuffle=False)
        
        print("Running second logistic classifier on training set")
        logit2 = linear_model.LogisticRegression(C = 1e5)
        logit2.fit(self.x_train, self.y_train)
        y_pred = logit2.predict(self.x_test)
        self.c_matrix = confusion_matrix(self.y_test, y_pred)
        print(self.c_matrix)
    
    def output_results(self):
        print("Logistic Return Prediction Data Summary: {}".format(data.model.log_pred.value_counts()))
        print("Logistic Regression data model: {}, with size {}".format(data.model.head(), len(data.model.log_pred)))
        
        

class SupportVectorMachine(SupervisedLearning):
    
    def __init__(self):
        super(SupportVectorMachine, self).__init__()
        self._init_classifier()

    def _init_classifier(self):
        self.svm = svm.SVC(C = 1e5, probability=True)
    
    def fit_model(self, x_param, y_param):
        self.svm.fit(x_param, y_param)
        
    def run_classifier(self, data):
        log.info("Running SVM Classifier")
        #perform the regression on returns here
        returns_sign = np.sign(data.model.log_return)
        lagged_headers = [header for header in list(data.model) if "lagged" in header]
        #fitting done here
        log.info("Fitting model against specified parameters")
        self.fit_model(data.model[lagged_headers], returns_sign)
        self.run_prediction(data.model[lagged_headers])
        data.model["svm_return"] = abs(data.model.log_return) * data.model.svm_pred
        print("SVM score: {}".format(self.svm.score(data.model[lagged_headers], returns_sign)))
        
    def run_prediction(self, model):
        log.info("Running SVM Prediction")
        data.model["svm_pred"] = self.svm.predict(model)
        


In [5]:
logit = LogisticalRegression()
logit.run_classifier(data)
        
svm = SupportVectorMachine()
svm.run_classifier(data)

20191106 23:50:31 - INFO : Running Logisitical Regression Classifier
20191106 23:50:31 - INFO : Fitting model against specified parameters
20191106 23:50:31 - INFO : Running Logistical Regression Prediction
Logistic Return Prediction Data Summary:  1.0    4942
-1.0      89
Name: log_pred, dtype: int64
Logistic Regression data model:          Date    Open    High     Low  Settle  Volume  log_return  \
6  2000-01-11  6945.0  6981.0  6861.0  6938.5   58944   -0.003094   
7  2000-01-12  6860.0  6946.5  6828.5  6938.0   50030   -0.000072   
8  2000-01-13  6940.0  7145.0  6923.5  7009.0   67693    0.010181   
9  2000-01-14  7053.0  7246.0  7053.0  7229.0   63855    0.030906   
10 2000-01-17  7264.5  7326.0  7146.5  7307.0   47015    0.010732   

    lagged_return_1  lagged_return_2  lagged_return_3  lagged_return_4  \
6          0.018855         0.048677        -0.004065        -0.014892   
7         -0.003094         0.018855         0.048677        -0.004065   
8         -0.000072        -

In [6]:
data.model.head()

Unnamed: 0,Date,Open,High,Low,Settle,Volume,log_return,lagged_return_1,lagged_return_2,lagged_return_3,lagged_return_4,lagged_return_5,log_pred,logistic_return,svm_pred,svm_return
6,2000-01-11,6945.0,6981.0,6861.0,6938.5,58944,-0.003094,0.018855,0.048677,-0.004065,-0.014892,-0.025979,1.0,0.003094,1.0,0.003094
7,2000-01-12,6860.0,6946.5,6828.5,6938.0,50030,-7.2e-05,-0.003094,0.018855,0.048677,-0.004065,-0.014892,1.0,7.2e-05,1.0,7.2e-05
8,2000-01-13,6940.0,7145.0,6923.5,7009.0,67693,0.010181,-7.2e-05,-0.003094,0.018855,0.048677,-0.004065,1.0,0.010181,1.0,0.010181
9,2000-01-14,7053.0,7246.0,7053.0,7229.0,63855,0.030906,0.010181,-7.2e-05,-0.003094,0.018855,0.048677,-1.0,-0.030906,1.0,0.030906
10,2000-01-17,7264.5,7326.0,7146.5,7307.0,47015,0.010732,0.030906,0.010181,-7.2e-05,-0.003094,0.018855,1.0,0.010732,1.0,0.010732


In [7]:
classification_report(logit.y_test, data.model.log_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3522, 5031]