# Experiments

## Data Processing

In [103]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

class DataProcessor(object):
    def __init__(self, data_path, features, target, feature_type, train_ratio=0.7, test_ratio=0.2, random_state=42):
        self.data_path = data_path
        self.features = features
        self.target = target
        self.feature_type = feature_type
        
        self.train_ratio = train_ratio
        self.test_ratio = test_ratio
        self.val_ratio = 1 - train_ratio - test_ratio
        
        self.random_state = random_state
        self.__load_data__()
    
    def __load_data__(self):
        self.df_raw = pd.read_csv(self.data_path, parse_dates=True, index_col='date')
        self.df_raw = self.df_raw.sort_values(by='date', ascending=True)
        
    def get_data(self):
        self.scaler = preprocessing.StandardScaler()
        
        if self.target == 'trend':
            self.get_target_trend()
        
        X_train, X_test, y_train, y_test = train_test_split(self.df_raw[self.features], 
                                                            self.df_raw[self.target], 
                                                            test_size=self.test_ratio, 
                                                            random_state=self.random_state,
                                                            shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                          y_train, 
                                                          test_size=self.val_ratio, 
                                                          random_state=self.random_state,
                                                          shuffle=False)
        self.scaler.fit(X_train)
        X_train = self.scaler.transform(X_train)
        X_test = self.scaler.transform(X_test)
        
        return X_train, X_val, X_test, y_train, y_val, y_test       
            
    def get_target_trend(self):
        self.df_raw['trend'] = self.df_raw['close'].diff()
        self.df_raw['trend'] = self.df_raw['trend'].apply(lambda x: 1 if x > 0 else 0)
        self.df_raw['trend'] = self.df_raw['trend'].shift(-1, fill_value=1)
    
    def add_technical_indicators(self):
        # moving average
        self.df_raw['ma5'] = self.df_raw['close'].rolling(5).mean()
        # # exponential moving average
        # self.df_raw['ema5'] = self.df_raw['close'].ewm(span=5, adjust=False).mean()
        # # moving average convergence divergence
        # self.df_raw['macd'] = self.df_raw['close'].ewm(span=12, adjust=False).mean() - self.df_raw['close'].ewm(span=26, adjust=False).mean()
        # # relative strength index
        # self.df_raw['rsi'] = self.df_raw['close'].ewm(span=14, adjust=False).mean() / self.df_raw['close'].ewm(span=14, adjust=False).mean()
        # # stochastic oscillator
        # self.df_raw['stoch'] = (self.df_raw['close'] - self.df_raw['low'].rolling(14).min()) / (self.df_raw['high'].rolling(14).max() - self.df_raw['low'].rolling(14).min())
        # # williams %R
        # self.df_raw['williams'] = (self.df_raw['high'].rolling(14).max() - self.df_raw['close']) / (self.df_raw['high'].rolling(14).max() - self.df_raw['low'].rolling(14).min())
    
    def missing_value(self):
        if self.df_raw.isnull().values.any():
            self.df_raw = self.df_raw.dropna()

In [104]:
data_processor = DataProcessor(data_path='./datasets/A_price_data.csv',
                               features=['open', 'high', 'low', 'close', 'volume', 'vwap', 'ma5'],
                               target='trend',
                               feature_type='daily_point')
data_processor.add_technical_indicators()
data_processor.missing_value()
X_train, X_val, X_test, y_train, y_val, y_test = data_processor.get_data()

In [105]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((4038, 7), (449, 7), (1122, 7), (4038,), (449,), (1122,))

In [114]:
list(y_test).count(1.0) / len(list(y_test))

0.5597147950089126

## Experiment 1: Logistic Regression

In [106]:
from sklearn.linear_model import LogisticRegression

In [107]:
lr = LogisticRegression(penalty='l2', tol=0.0001, C=1.0, fit_intercept=True, verbose=10, max_iter=100, random_state=0, solver='lbfgs', n_jobs=1, multi_class='auto', warm_start=False)

In [108]:
lr.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

 L =  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00
      0.0000D+00  0.0000D+00

X0 =  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00
      0.0000D+00  0.0000D+00

 U =  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00  0.0000D+00
      0.0000D+00  0.0000D+00

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.79893D+03    |proj g|=  8.19104D+01


ITERATION     1

---------------- CAUCHY entered-------------------
 There are            0   breakpoints 

 GCP found in this segment
Piece      1 --f1, f2 at start point  -3.9796D+04  3.9796D+04
Distance to the stationary point =   1.0000D+00
Cauchy X =  
     -8.0446D+01 -8.1910D+01 -7.7618D+01 -7.9972D+01  3.9973D+01 -7.9956D+01
     -7.8739D+01  2.0000D+00

---------------- exit CAUCHY----------------------

           8  variables are fre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished



At iterate   49    f=  2.79382D+03    |proj g|=  4.38745D-02

 X = -3.2750D-01 -4.5016D-01  5.8006D-01 -4.3594D-02  4.1999D-02  1.8075D-02
      1.4643D-01  1.6202D-03

 G = -3.5085D-02 -3.8892D-02 -2.8948D-02 -3.6401D-02 -4.3875D-02 -3.4884D-02
     -2.9824D-02 -2.7419D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     49     55      1     0     0   4.387D-02   2.794D+03

 X = -3.2750D-01 -4.5016D-01  5.8006D-01 -4.3594D-02  4.1999D-02  1.8075D-02
      1.4643D-01  1.6202D-03
  F =   2793.8234426823992     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


In [109]:
# calculate accuracy
from sklearn.metrics import accuracy_score

y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)



0.5597147950089126

In [111]:
y_pred[:1000]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Experiment 2: XGBoost

In [66]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

y_pred = xgb_cl.predict(X_test)

In [67]:
accuracy_score(y_test, y_pred)

0.4835262689225289

In [68]:
y_pred[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Experiment 3: Neural Network 

In [69]:
from sklearn.neural_network import MLPClassifier

In [86]:
mlp_clf = MLPClassifier(random_state=1, max_iter=10000, learning_rate_init=0.000001, verbose=3, tol=0.00000001).fit(X_train, y_train)

Iteration 1, loss = 0.70825623
Iteration 2, loss = 0.70820678
Iteration 3, loss = 0.70815769
Iteration 4, loss = 0.70810908
Iteration 5, loss = 0.70805787
Iteration 6, loss = 0.70800589
Iteration 7, loss = 0.70795551
Iteration 8, loss = 0.70790495
Iteration 9, loss = 0.70785538
Iteration 10, loss = 0.70780466
Iteration 11, loss = 0.70775703
Iteration 12, loss = 0.70771103
Iteration 13, loss = 0.70766388
Iteration 14, loss = 0.70761368
Iteration 15, loss = 0.70756539
Iteration 16, loss = 0.70751563
Iteration 17, loss = 0.70746957
Iteration 18, loss = 0.70742193
Iteration 19, loss = 0.70737621
Iteration 20, loss = 0.70732992
Iteration 21, loss = 0.70728269
Iteration 22, loss = 0.70723240
Iteration 23, loss = 0.70718617
Iteration 24, loss = 0.70713762
Iteration 25, loss = 0.70709141
Iteration 26, loss = 0.70704263
Iteration 27, loss = 0.70699279
Iteration 28, loss = 0.70694395
Iteration 29, loss = 0.70689784
Iteration 30, loss = 0.70685200
Iteration 31, loss = 0.70680528
Iteration 32, los



In [87]:
pred = mlp_clf.predict(X_test)



In [88]:
mlp_clf.score(X_test, y_test)



0.4835262689225289

In [89]:
pred[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])