In [None]:
!mkdir data

In [None]:
!gsutil cp gs://mm_dataflow_bucket/inputs/technical_indicators_dataset.csv data
    
!ls data

<h3> Useful function </h3>

In [None]:
!pip install pandas-datareader

In [1]:
from datetime import date
from google.cloud import storage
from googleapiclient.discovery import build
from apiclient.http import MediaIoBaseDownload
import pandas as pd
import numpy as np

<h3> Useful Functions </h3>

In [2]:
def read_from_bucket(filename, bucket_name):
  gcs_service = build('storage', 'v1')
  # The name for the new bucket
  holder = 'data/{}'.format(filename.split('/')[-1])
  print('Reading {} from {}'.format(filename,bucket_name))
  with open(holder, 'wb') as f:
    request = gcs_service.objects().get_media(bucket=bucket_name,
                                              object=filename)
    media = MediaIoBaseDownload(f, request)

    done = False
    while not done:
      # _ is a placeholder for a progress object that we ignore.
      # (Our file is small, so we skip reporting progress.)
      _, done = media.next_chunk()

  print('Download complete')
  return pd.read_csv(holder)



In [3]:


  
def get_price_var(symbol):
    '''
    Get historical price data for a given symbol leveraging the power of pandas_datareader and Yahoo.
    Compute the difference between first and last available time-steps in terms of Adjusted Close price..
    Input: ticker symbol
    Output: price variation 
    '''
    # read data
    prices = dr.get_data_yahoo(symbol, '2019-01-01', '2020-03-01')['Adj Close']

    # get all timestamps for specific lookups
    today = prices.index[-1]
    start = prices.index[0]

    # calculate percentage price variation
    price_var = ((prices[today] - prices[start]) / prices[start]) * 100
    return price_var

def read_from_bucket(filename, bucket_name):
  gcs_service = build('storage', 'v1')
  # The name for the new bucket
  holder = 'data/{}'.format(filename.split('/')[-1])
  print('Reading {} from {}'.format(filename,bucket_name))
  with open(holder, 'wb') as f:
    request = gcs_service.objects().get_media(bucket=bucket_name,
                                              object=filename)
    media = MediaIoBaseDownload(f, request)

    done = False
    while not done:
      # _ is a placeholder for a progress object that we ignore.
      # (Our file is small, so we skip reporting progress.)
      _, done = media.next_chunk()

  print('Download complete')
  return pd.read_csv(holder)

def get_latest_price_yahoo(symbol, as_of_date=date.today()):
  try:#
    print('--latest price for{}'.format(symbol))
    res = dr.get_data_yahoo(symbol, as_of_date, as_of_date)[['Close']]
    res['Symbol'] = symbol
    return res.tail(1)
  except Exception as e :

    return pd.DataFrame(columns=[symbol])





<h3> Scaling data </h3>

In [4]:
!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
# Machine learning (preprocessing, models, evaluation)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

def create_train_and_test_split(dataset, label):
  print('Creating Train and Test Split where label is :{}'.format(label))
  train_split, test_split = train_test_split(dataset, test_size=0.2, random_state=1, stratify=dataset[label])
  X_train = train_split.drop([label], axis=1).values
  y_train = train_split[label].values
  X_test = test_split.drop([label], axis=1).values
  y_test = test_split[label].values
  print()
  print(f'Number of training samples: {X_train.shape[0]}')
  print()
  print(f'Number of testing samples: {X_test.shape[0]}')
  print()
  print(f'Number of features: {X_train.shape[1]}')
  return X_train, y_train, X_test, y_test, test_split





In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

def create_algo(classifier):
  steps = [('scaler', StandardScaler()), 
           #('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
           ('algo', classifier)]
  pipeline = Pipeline(steps) # define the pipeline object.
  return pipeline


def create_gridsearch(classifier, tuned_parameters, randomized=False):
  print('Creating Gridsearch for {}'.format(classifier))
  ppln = create_algo(classifier)

  input_params = tuned_parameters if isinstance(tuned_parameters, dict) else tuned_parameters[0]
  new_params = dict(('algo__{}'.format(k), v) for k,v in input_params.items())
  if randomized:
    print('Creating randomized/')
    return RandomizedSearchCV(ppln,
                      [new_params],
                      n_jobs=4,
                      scoring='precision_weighted',
                      cv=3)
  return GridSearchCV(ppln,
                      [new_params],
                      n_jobs=4,
                      scoring='precision_weighted',
                      cv=3)


<h3> Testing Various Algorithms </h3>

In [6]:
def run_with_SVM2(x, y, randomized):
  print('Running SVM')
  from sklearn.svm import SVC
  
  tuned_parameters = [{'kernel': ['rbf', 'linear', 'poly'],
                      'gamma': [1e-3, 1e-4],
                     'C': [0.01, 0.1, 1, 10, 100]}]
  print('finding best grid search')
  clf1 = create_gridsearch(SVC(random_state=1),tuned_parameters, randomized)
  clf1.fit(x, y)
  
  
  bs,bp = clf1.best_score_, clf1.best_params_
  
  #print('Best score and parameters found on development set:')
  #print()
  #print('%0.3f for %r' % (bs, bp))
  #print()
  return bs, bp, clf1

from sklearn.ensemble import RandomForestClassifier
def run_with_random_forest(x, y):
  # Parameter grid to be tuned
  tuned_parameters = {'n_estimators': [32, 256],
                      'max_features': ['auto', 'sqrt'],
                      'max_depth': [4, 5, 6],
                      'criterion': ['gini', 'entropy']}
  clf2 = create_gridsearch(RandomForestClassifier(random_state=1),
                      tuned_parameters)
  clf2.fit(x, y)
  
  bs,bp = clf2.best_score_, clf2.best_params_
  
  #print('Best score and parameters found on development set:')
  #print()
  #print('%0.3f for %r' % (bs, bp))
  #print()
  return bs, bp, clf2
  
  
def run_with_XGBClassifer(x, y):
  # Parameter grid to be tuned
  tuned_parameters = {'learning_rate': [0.01, 0.001],
                      'max_depth': [4, 5, 6],
                      'n_estimators': [32, 128]}

  #clf3 = GridSearchCV(xgb.XGBClassifier(random_state=1),
  #                  tuned_parameters,
  #                  n_jobs=6,
  #                  scoring='precision_weighted', 
  #                  cv=5)
  clf3 = create_gridsearch(xgb.XGBClassifier(random_state=1),
                      tuned_parameters)
  
  clf3.fit(x, y)
  bs,bp = clf3.best_score_, clf3.best_params_
  
  return bs, bp, clf3

def run_with_mpl(x, y):
  tuned_parameters = {'hidden_layer_sizes': [(32,), (64,)],
                    'activation': ['tanh', 'relu'],
                    'solver': ['lbfgs', 'adam']}
  #clf4 = GridSearchCV(MLPClassifier(random_state=1, batch_size=4, early_stopping=True), 
  #                    tuned_parameters,
  #                    n_jobs=6,
  #                    scoring='precision_weighted',
  #                    cv=5)
  clf4 = create_gridsearch(MLPClassifier(random_state=1, batch_size=4, early_stopping=True), 
                      tuned_parameters)
  
  clf4.fit(x, y)
  bs,bp = clf4.best_score_, clf4.best_params_
  
  #print('Best score, and parameters, found on development set:')
  #print()
  #print('%0.3f for %r' % (bs, bp))
  #print()
  return bs, bp, clf4
  




In [8]:
from collections import OrderedDict
def run_ml_algorithms(features, labels):
  mps, _ , clf2= run_with_mpl(features, labels)#
  rbs, _, clf1 = run_with_random_forest(features, labels)
  xbs,_, clf3 = run_with_XGBClassifer(features, labels)

  return [('MPL', clf2,mps), ('RF', clf1, rbs), ('XGB', clf3, xbs)]
  #print('Random Forest:{}, MPLClassifier:{} , XGB:{}'.format(rbs, mps, xbs))
  #return [('RandomForest',rbs, clf1), ('MPL CLASSIFIER', mps, clf2), ('XGBClassifier',xbs, clf3)]

<h3> Now Testing few algorithms FOR TECHNICAL INDICATORS </h3>

In [9]:
def get_dataset():
    !gsutil cp gs://mm_dataflow_bucket/inputs/tech_cyclical.csv . data
    dataset = pd.read_csv('data/tech_cyclical.csv')\
               .dropna().drop('Date', axis=1).drop('ticker', axis=1)
    print('Columns are:{}'.format(dataset.columns))
    return dataset

In [10]:
tech_indic = get_dataset()# Too big. We need to bring it to  10k - 20k samples if we want to use this
tech_indic.columns
tech_indic['increase_15'] = tech_indic['result'] > 1.15
print('True:{}'.format(tech_indic[tech_indic['increase_15'] == True].shape))
print('False:{}'.format(tech_indic[tech_indic['increase_15'] == False].shape))
tech_indic = tech_indic.drop('result', axis=1)

Copying gs://mm_dataflow_bucket/inputs/tech_cyclical.csv...
Omitting directory "file://.". (Did you mean to do cp -r?)                      

Operation completed over 1 objects/124.7 MiB.                                    
Columns are:Index(['high', 'low', 'open', 'close', 'volume', 'adj_close', 'SMA_10',
       'SMA_20', 'SMA_50', 'SMA_100', 'SMA_200', 'ema_10', 'ema_20', 'ema_50',
       'ema_100', 'ema_200', 'ATR', 'ADX', 'CCI', 'ROC', 'rsi', 'Williams % R',
       'SO%K', 'result'],
      dtype='object')
True:(34595, 25)
False:(248415, 25)


In [None]:

X_train, y_train, X_test, y_test, test_split = create_train_and_test_split(tech_indic, 'increase_15')


In [None]:
tech_indic.head(10)

In [None]:
def run_all_algorithms(X, y):
    algo_list = run_ml_algorithms(X, y)
    for nm, clf,scr in algo_list:
        print('{} scored:{}'.format(nm, scr))

<h3> Training on tech indic </h3>

In [None]:
run_all_algorithms(X_train, y_train)

<h3> Running on Test </h3>

<h3> Creatin gTensor flow </h3>

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [25]:
tech_indic = tech_indic.rename({'Williams % R':'Williams','SO%K':'SOK' }, axis=1)
tech_indic.columns

Index(['high', 'low', 'open', 'close', 'volume', 'adj_close', 'SMA_10',
       'SMA_20', 'SMA_50', 'SMA_100', 'SMA_200', 'ema_10', 'ema_20', 'ema_50',
       'ema_100', 'ema_200', 'ATR', 'ADX', 'CCI', 'ROC', 'rsi', 'Williams',
       'SOK', 'increase_15'],
      dtype='object')

<h3>Creating Train Validation and Split </h3>

In [26]:
train, test = train_test_split(tech_indic, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

181126 train examples
45282 validation examples
56602 test examples


In [27]:
def df_to_dataset(dataframe, label, shuffle=True, batch_size=32):
  print('Crating dataset for label :{}'.format(label))
  dataframe = dataframe.copy()
  labels = dataframe.pop(label)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [28]:
batch_size = 50 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, 'increase_15', batch_size=batch_size)
val_ds = df_to_dataset(val, 'increase_15',shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, 'increase_15',shuffle=False, batch_size=batch_size)

Crating dataset for label :increase_15
Crating dataset for label :increase_15
Crating dataset for label :increase_15


<h3> Create a Feature Layer </h3>

In [29]:
def create_feature_layer(label_col,dataframe):
    print('Creating feture layer for {}'.format(label_col))
    valid_feats = [c for c in  dataframe.columns.values.tolist() if label_col not in c] 
    feature_columns = []
    for f in valid_feats:
        feature_columns.append(feature_column.numeric_column(f))
    return  tf.keras.layers.DenseFeatures(feature_columns)
feature_layer = create_feature_layer('increase_15', tech_indic)

Creating feture layer for increase_15


In [30]:
batch_size = 32
train_ds = df_to_dataset(train,'increase_15', batch_size=batch_size)
val_ds = df_to_dataset(val, 'increase_15',shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test,'increase_15', shuffle=False, batch_size=batch_size)
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=20)


Crating dataset for label :increase_15
Crating dataset for label :increase_15
Crating dataset for label :increase_15
Train for 5661 steps, validate for 1416 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff0a1463410>

<h3>Evaluating</h3>

In [33]:
model.evaluate(test_ds)



[0.3896771559629688, 0.87783116]

<h3> Predict </h3>

In [35]:
#1 Load the Data into dataframe

#2 Call DF to Dataset

#3 Call model.predict
print("Generate predictions for 3 samples")
model.predict(x_test[:3])

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`

print("predictions shape:", predictions.shape)

tensorflow.python.data.ops.dataset_ops.BatchDataset

<h3>Reading Quartely Reports </h3>

In [None]:
import pandas as pd
!gsutil cp gs://mm_dataflow_bucket/inputs/top10_corr_df_1008.pkl  .
!gsutil cp gs://mm_dataflow_bucket/inputs/top10_df_classifier_1008.pkl .

df = pd.read_pickle('top10_corr_df_1008.pkl')
df = df.fillna(0.0)
print('Shape is:{}'.format(df.shape))
print('Columns are:{}'.format(df.columns))

<h3> Running Technical indicators Algs </h3>

In [None]:
X_train, y_train, X_test, y_test, test_split = create_train_and_test_split(df, 'Decision')



<h3> Performance on Training. We need to sort out data. we only get 50% performance. </h3>

In [None]:
run_ml_algorithms(X_train, y_train)

<h3> Performance on test </h3>

In [None]:
run_ml_algorithms(X_test, y_test)

<p>Not reliable as it's to accurate. Let's attempt for example to predict if price is > 5%. Also we can try to widen the dataset by searching all shares </p>

<h3> Evlauatin gPerformance </h3>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
prices_df = pd.read_csv('data/prices.csv')
prices_df = prices_df.rename(columns={"Unnamed: 0": "ticker"})
tickers = prices_df['ticker'].values

In [None]:
def evaluate(clsfier_tpl, x_t, y_t):
    nm, clsf, t_score = clsfier_tpl
    y_pred = clsf.predict(x_t)
    a_s = accuracy_score(y_t, y_pred)
    print('{} has accuracy {} on test and {} on train'.format(nm, a_s, t_score) )
    print('--------------------------')
    print(print(classification_report(y_t, y_pred)))

In [None]:
for clsf_tpl in algo_list:
    evaluate(clsf_tpl,X_test, y_test )

<h3> TODO. Pass current technical indicator for a share and see what is the prediction ... </h3>

<h3> Checking how good classifier are on real prediction </h3>

In [None]:
pvar_test = prices_df.loc[test_split.index.values, :]
buy_amount = 100

clf2, clf3, clf4 = algo_list


In [None]:
# Initial investment can be $100 for each stock whose predicted class = 1
buy_amount = 100
# In new dataframe df1, store all the information regarding each model's predicted class and relative gain/loss in $USD
df1 = pd.DataFrame(y_test, index=test_split.index.values, columns=['ACTUAL']) # first column is the true class (BUY/INGORE)
df1['RF'] = clf2[1].predict(X_test)
df1['VALUE START RF [$]'] = df1['RF'] * buy_amount
df1['VAR RF [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START RF [$]']
df1['VALUE END RF [$]'] = df1['VALUE START RF [$]'] + df1['VAR RF [$]']
df1['XGB'] = clf4[1].predict(X_test)
df1['VALUE START XGB [$]'] = df1['XGB'] * buy_amount
df1['VAR XGB [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START XGB [$]']
df1['VALUE END XGB [$]'] = df1['VALUE START XGB [$]'] + df1['VAR XGB [$]']
df1['MLP'] = clf3[1].predict(X_test)
df1['VALUE START MLP [$]'] = df1['MLP'] * buy_amount
df1['VAR MLP [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START MLP [$]']
df1['VALUE END MLP [$]'] = df1['VALUE START MLP [$]'] + df1['VAR MLP [$]']


<h3> Evaluate Performance </h3>

In [None]:
start_value_rf = df1['VALUE START RF [$]'].sum()
final_value_rf = df1['VALUE END RF [$]'].sum()
net_gain_rf = final_value_rf - start_value_rf
percent_gain_rf = (net_gain_rf / start_value_rf) * 100
start_value_xgb = df1['VALUE START XGB [$]'].sum()
final_value_xgb = df1['VALUE END XGB [$]'].sum()
net_gain_xgb = final_value_xgb - start_value_xgb
percent_gain_xgb = (net_gain_xgb / start_value_xgb) * 100
start_value_mlp = df1['VALUE START MLP [$]'].sum()
final_value_mlp = df1['VALUE END MLP [$]'].sum()
net_gain_mlp = final_value_mlp - start_value_mlp
percent_gain_mlp = (net_gain_mlp / start_value_mlp) * 100
percent_gain_sp500 = get_price_var('^GSPC') # get percent gain of S&P500 index
percent_gain_dj = get_price_var('^DJI') # get percent gain of DOW JONES index


MODELS_COMPARISON = pd.DataFrame([start_value_rf, final_value_rf, net_gain_rf, percent_gain_rf],
                    index=['INITIAL COST [USD]', 'FINAL VALUE [USD]', '[USD] GAIN/LOSS', 'ROI'], columns=['RF'])

MODELS_COMPARISON['XGB'] = [start_value_xgb, final_value_xgb, net_gain_xgb, percent_gain_xgb]
MODELS_COMPARISON['MLP'] = [start_value_mlp, final_value_mlp, net_gain_mlp, percent_gain_mlp]
MODELS_COMPARISON['S&P 500'] = ['', '', '', percent_gain_sp500]
MODELS_COMPARISON['DOW JONES'] = ['', '', '', percent_gain_dj]

In [None]:
MODELS_COMPARISON