# I. Khai báo thư viện + GPU

## 1. Khai báo thư viện

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# standard library
import math

# common libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# model library
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
import scipy.integrate as integrate
from tensorflow.keras.models import load_model
from keras.callbacks import EarlyStopping
from tensorflow.python.keras import backend as K
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

base_path = '/content/drive/MyDrive/Build_model_stock'

Mounted at /content/drive


## 2. Khai báo GPU

In [2]:
# Session setup for TensorFlow's TF1-compat API — adjust values to your needs.
# The config passes device_count={'GPU': 1}
# (NOTE(review): per the TF docs this caps how many GPU devices are used — confirm).
config = tf.compat.v1.ConfigProto(device_count = {'GPU': 1})
# Create a session from that config and register it with the Keras backend `K`.
sess = tf.compat.v1.Session(config=config) 
K.set_session(sess)

# II. Function





In [3]:
def split_value(df,posotion,num_back):
  """Build a one-row frame with the `num_back` closing prices that end at row `posotion - 1`.

  Rows are addressed purely by position (`.iloc`), so the result does not
  depend on `df` carrying a default 0..n-1 index.

  Args:
    df: frame with a `close_price` column and a `date` column.
    posotion: exclusive end position of the window; the window covers
      positions `posotion - num_back` .. `posotion - 1`.
    num_back: number of consecutive prices in the window.

  Returns:
    A single-row DataFrame whose columns are `x(t-{num_back-1})`, ...,
    `x(t-1)`, `x(t)` (oldest price first) followed by `date`, the date of the
    last row in the window.
  """
  window = df['close_price'].iloc[posotion-num_back:posotion].to_numpy()
  # Oldest value first; lag 0 is labelled plain 'x(t)' rather than 'x(t-0)'.
  lag_names = ['x(t)' if lag == 0 else f'x(t-{lag})' for lag in range(num_back-1, -1, -1)]
  row = pd.DataFrame(window[np.newaxis, :], columns=lag_names)
  row['date'] = df['date'].iloc[posotion-1]
  return row



def _read_data(time, name_stock, num_back):  # time: w-week, m-month, 6m-6month; name_stock: stock name (sheet suffix); num_back: number of look-back steps
  """Load closing prices from the workbook and turn them into a sliding-window table.

  Args:
    time: sampling frequency. 'w' reads sheet `Fri{name_stock}`, 'm' reads
      sheet `EOM`, '6m' reads sheet `EOM` and keeps the last row plus every
      6th row before it.
    name_stock: suffix of the weekly sheet name. It is only used for 'w';
      the 'm' and '6m' branches always read the fixed `EOM` sheet.
    num_back: window length handed to `split_value` (number of price columns
      per output row).

  Returns:
    DataFrame with one row per window, in ascending date order, as produced by
    `split_value` (price columns followed by `date`). Empty when the sheet has
    fewer than `num_back` rows.

  Raises:
    ValueError: if `time` is not one of 'w', 'm', '6m'.
  """
  workbook = f'{base_path}/SHFE_max_volume_week_month__Mar12.xlsx'
  wanted = ['Date','CLOSEPRICE']

  if time == 'w':
    df = pd.read_excel(workbook, sheet_name=f'Fri{name_stock}')[wanted]
  elif time == 'm':
    df = pd.read_excel(workbook, sheet_name='EOM')[wanted]
  elif time == '6m':
    df_m = pd.read_excel(workbook, sheet_name='EOM')[wanted]
    # Walk backwards from the last row in steps of 6. The slice also keeps the
    # very first row when it lands on the 6-step grid (the previous
    # arange(-1, -len, -6) stopped one position short and dropped it).
    # NOTE(review): sampling happens before the date sort below, so this
    # assumes the sheet rows are already in chronological order — confirm.
    df = df_m.iloc[::-6].copy()
  else:
    # Fail loudly instead of printing and then crashing on an unbound `df`.
    raise ValueError(f'Không tồn tại data bạn đã đọc: time={time!r}')

  df.columns = ['date','close_price']
  df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
  df = df.sort_values(by='date',ascending=True).reset_index(drop=True)

  # Build every window once and concatenate in a single pass;
  # DataFrame.append no longer exists in pandas >= 2.0 and was quadratic here.
  rows = [split_value(df, end, num_back) for end in range(num_back, len(df)+1)]
  if not rows:
    return pd.DataFrame()
  return pd.concat(rows, ignore_index=True)


def _scale_value(df_table):
  """Fit one StandardScaler on the feature columns and one on the label column.

  The feature columns are every column except the last two; the label is the
  second-to-last column. The final column is not used.

  Args:
    df_table: window table (e.g. the output of `_read_data`).

  Returns:
    Tuple `(feature_scaler, label_scaler)` of fitted StandardScaler objects.
  """
  columns = df_table.columns
  features = df_table[columns[:-2]].values
  labels = df_table[columns[-2:-1]].values

  # Both scalers are fitted on the full table passed in.
  return StandardScaler().fit(features), StandardScaler().fit(labels)


def _split_data(df_table, feature_label_scaler_fit, label_scaler_fit):
  """Scale the table and split it into train / validation / test arrays.

  Features are all columns except the last two, the label is the
  second-to-last column. 20% of the rows go to the test set, then 20% of the
  remainder goes to the validation set; both splits use random_state=42.

  NOTE(review): train_test_split is called with its defaults, which (per the
  scikit-learn docs) shuffle rows — confirm that a shuffled split is intended
  for these overlapping time windows.

  Args:
    df_table: window table (e.g. the output of `_read_data`).
    feature_label_scaler_fit: fitted scaler applied to the feature columns.
    label_scaler_fit: fitted scaler applied to the label column.

  Returns:
    Tuple `(X_train, y_train, X_val, y_val, X_test, y_test)` of scaled arrays.
  """
  columns = df_table.columns
  scaled_x = feature_label_scaler_fit.transform(df_table[columns[:-2]].values)
  scaled_y = label_scaler_fit.transform(df_table[columns[-2:-1]].values)

  # First cut: hold out the test set.
  X_rest, X_test, y_rest, y_test = train_test_split(
      scaled_x, scaled_y, test_size=0.2, random_state=42)
  # Second cut: carve the validation set out of what is left.
  X_train, X_val, y_train, y_val = train_test_split(
      X_rest, y_rest, test_size=0.2, random_state=42)

  return X_train, y_train, X_val, y_val, X_test, y_test

def _build_model(x_train, y_train, x_val, y_val, node_hiddent_layer):
  """Create and train a one-hidden-layer dense network for price regression.

  Architecture: `x_train.shape[1]` inputs -> `node_hiddent_layer` units -> 1
  output. Both Dense layers are created with `activation=None`. The model is
  compiled with MAE loss and the Adam optimizer and trained for at most 150
  epochs with batch size 5.

  Args:
    x_train, y_train: training features / labels.
    x_val, y_val: validation features / labels, passed to `fit` as
      `validation_data`.
    node_hiddent_layer: number of units in the hidden layer.

  Returns:
    The fitted Keras model.
  """
  # Drop any graph/state left over from a previously built model.
  tf.keras.backend.clear_session()

  model = Sequential()
  # The first Dense layer doubles as the input layer via `input_dim`.
  model.add(Dense(units=node_hiddent_layer, input_dim= x_train.shape[1], activation=None))
  # Single fully connected output neuron.
  model.add(Dense(1, activation=None))
  model.compile(loss='mae', optimizer='adam')

  # Early stopping watches the *training* loss ('loss'), not 'val_loss'.
  # NOTE(review): confirm that is intended given validation data is supplied.
  early_stopping = EarlyStopping(monitor='loss',patience=5)
  # The fit history is not needed by callers, so it is no longer captured
  # (the old `loss_val` local was never used).
  model.fit(x_train, y_train ,batch_size = 5, epochs = 150, validation_data=(x_val, y_val),callbacks=[early_stopping],verbose=0)
  return model

def _predict_values(model, x_test, y_test, feature_label_scaler_fit, label_scaler_fit):

  # Generating Predictions on testing data
  predict_test = model.predict(x_test)

  # Scaling the predicted Price data back to original price scale
  predict_test = label_scaler_fit.inverse_transform(predict_test)
  
  # Scaling the y_test Price data back to original price scale
  y_test_orig = label_scaler_fit.inverse_transform(y_test)
  # Computing the absolute percent error
  APE=100*(abs((y_test_orig-predict_test)/y_test_orig))
  
  return np.mean(APE)

# III. Chạy tìm best params cho model

## 1. Khai báo biến

In [4]:
# Numbers of input nodes to scan: every value from 1 to 15.
# (An earlier, sparser grid was [1, 2, 3, 4, 5, 8, 10, 12, 15].)
node_input_s = list(range(1, 16))
# Passed to _read_data as `name_stock` (the weekly sheet-name suffix).
# Original note listing other values: 1, 3, 4, 5, 6, 7, 8, 9, 10, 11
name_data = 'day'
# Sampling frequency understood by _read_data: w, m, 6m
name_time = 'w'

## 2. Chạy

In [7]:
# Grid search over (number of inputs, number of hidden units).
# Every trained network is saved to disk and scored by MAPE on the validation set.
result_rows = []  # one {'Architecture', 'MAPE'} record per trained network

# Make sure the target folder exists before any (slow) training starts,
# so model.save cannot fail on a missing directory.
model_dir = f'{base_path}/ann_model_history/{name_time}/{name_data}'
tf.io.gfile.makedirs(model_dir)

for n_input in node_input_s:
  # num_back = n_input + 1: n_input feature columns plus the label column x(t).
  data_table = _read_data(name_time, name_data, n_input+1)

  feature_label_scaler_fit, label_scaler_fit = _scale_value(data_table)
  x_train, y_train, x_val, y_val, x_test, y_test = _split_data(data_table, feature_label_scaler_fit, label_scaler_fit)

  input_shape = x_train.shape[1]
  # Hidden-layer width ranges from input_shape to 2 * input_shape inclusive.
  for node_hiddent_layer in range(input_shape, input_shape*2+1):
    # create model:
    model = _build_model(x_train, y_train, x_val, y_val,node_hiddent_layer)
    model.save(f'{model_dir}/model_{input_shape}_{node_hiddent_layer}_1.h5')

    mape = _predict_values(model, x_val, y_val, feature_label_scaler_fit, label_scaler_fit)

    row = {'Architecture':str(input_shape)+"-"+str(node_hiddent_layer)+"-"+'1','MAPE':mape}
    print(row)
    result_rows.append(row)
  print('Done', n_input)

# Build the result frame once; DataFrame.append no longer exists in pandas >= 2.0.
df_result = pd.DataFrame(result_rows, columns=['Architecture', 'MAPE'])
df_result.sort_values(by='MAPE',ascending=True).head()

Output hidden; open in https://colab.research.google.com to view.

# IV. Dự đoán và tính sai số trên tập test

In [8]:
# Architecture chosen for the final evaluation: 6 inputs, 11 hidden units, weekly data.
name_time = 'w'
name = 'Fri'
n_input = 6
node_hiddent_layer = 11

# Rebuild the window table, the scalers and the same train/val/test split.
df_table = _read_data(name_time, name_data, n_input + 1)
feature_label_scaler_fit, label_scaler_fit = _scale_value(df_table)
(x_train, y_train,
 x_val, y_val,
 x_test, y_test) = _split_data(df_table, feature_label_scaler_fit, label_scaler_fit)

input_shape = x_train.shape[1]

# The saved model is loaded from disk here; retraining on train + validation
# via _build_model was tried previously and is left disabled.
model_path = f'{base_path}/ann_model_history/{name_time}/{name_data}/model_{input_shape}_{node_hiddent_layer}_1.h5'
model = load_model(model_path)

# MAPE on the held-out test set.
mape = _predict_values(model, x_test, y_test, feature_label_scaler_fit, label_scaler_fit)
print(mape)

  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_table.append(row_append)
  df_table = df_

3.4789978805609305


# V. Phân loại mức độ dao động

In [9]:
class ProbaError:
  """Probability that today's price change falls inside a given rate range.

  The prediction error (predicted price minus actual price) is modelled as a
  normal distribution with mean `params_normal[0]` and standard deviation
  `params_normal[1]`. Today's actual price lying between
  `yesterday * (1 + lower_rate)` and `yesterday * (1 + upper_rate)` is
  equivalent to the error lying between `predict - yesterday * (1 + upper_rate)`
  and `predict - yesterday * (1 + lower_rate)`; the probability of that error
  interval is what `density_proba_normal` returns.

  Args:
    price_actual_yesterday: last known actual price (float or size-1 array).
    price_predict_today: predicted price for today (float or size-1 array).
    params_normal: `(mean, sigma)` of the error distribution; each may be a
      float or a size-1 array.
    range_change_rate: `(lower, upper)` relative change bounds; either bound
      may be `-inf` / `inf`.
  """

  def __init__(self,
               price_actual_yesterday=97.555,
               price_predict_today=98.555,
               params_normal=(0.2, 0.005),
               range_change_rate=(-0.05, 0.03)):
    # assign params
    self.price_actual_yesterday = price_actual_yesterday
    self.price_predict_today = price_predict_today
    self.mean_error = params_normal[0]
    self.sigma_error = params_normal[1]
    self.lower_change_rate = range_change_rate[0]
    self.upper_change_rate = range_change_rate[1]

  @staticmethod
  def _to_scalar(value):
    """Collapse a Python number or a size-1 array to a plain float."""
    return float(np.asarray(value).item())

  @staticmethod
  def _normal_cdf(x, mu, sigma):
    """Cumulative distribution function of N(mu, sigma) at `x`; accepts +/-inf."""
    return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))

  # normal probability density; kept for callers that still use it
  def _buil_normal(self, x, mu, sigma):
    """Density of N(mu, sigma) evaluated at `x`."""
    return np.exp(-(x-mu)**2 / (2*sigma**2))/(sigma*np.sqrt(2*np.pi))

  # find range error: (price_predict_today) with (price_actual_yesterday in range_change_rate)
  def _find_range_error(self, price_actual_yesterday, price_predict_today, lower_change_rate, upper_change_rate):
    """Translate the change-rate range into an ordered `(lower, upper)` error interval."""
    # price thresholds implied by the two change rates
    thresold_price_today_1 = price_actual_yesterday * (1 + lower_change_rate)
    thresold_price_today_2 = price_actual_yesterday * (1 + upper_change_rate)

    # corresponding prediction errors (predicted - actual)
    thresold_error_today_1 = price_predict_today - thresold_price_today_1
    thresold_error_today_2 = price_predict_today - thresold_price_today_2

    if thresold_error_today_1 < thresold_error_today_2:
      return thresold_error_today_1, thresold_error_today_2

    return thresold_error_today_2, thresold_error_today_1

  # probability mass of the error distribution inside the error interval
  def density_proba_normal(self):
    """Return P(lower_error <= error <= upper_error), rounded to 3 decimals.

    The probability is computed in closed form as the difference of two normal
    CDF values. Adaptive quadrature over the density is unreliable here: on a
    wide or infinite interval it can sample entirely outside a narrow peak and
    report ~0.

    Raises:
      ValueError: if sigma is negative, or an input holds more than one element.
    """
    lower_error_today, upper_error_today = self._find_range_error(
        self.price_actual_yesterday, self.price_predict_today,
        self.lower_change_rate, self.upper_change_rate)
    lower = self._to_scalar(lower_error_today)
    upper = self._to_scalar(upper_error_today)
    mu = self._to_scalar(self.mean_error)
    sigma = self._to_scalar(self.sigma_error)

    if sigma < 0:
      raise ValueError(f'sigma must be non-negative, got {sigma!r}')
    if sigma == 0:
      # Degenerate distribution: all mass sits exactly at mu.
      val = 1.0 if lower <= mu <= upper else 0.0
    else:
      val = self._normal_cdf(upper, mu, sigma) - self._normal_cdf(lower, mu, sigma)
    return round(val,3)

In [10]:
class PredictPrice:
  """Turn a fitted price model into probabilities of change-rate buckets.

  On construction the feature columns (all but the last two) and the label
  column (second-to-last) of `df` are standardised with scalers fitted on `df`
  itself. `run()` then estimates the model's error distribution on all rows
  but the last, predicts the last row, and asks `ProbaError` for the
  probability of each change-rate bucket.

  NOTE(review): the scalers are refitted on whatever `df` is passed in; if the
  model was trained with scalers fitted on a different table, inputs are
  scaled differently from training — confirm this is acceptable.

  Args:
    df: window table whose last column is `date`.
    model: fitted model exposing `predict`.
    description_model: free-text note copied into the result's 'Ghi chú' column.
  """

  def __init__(self,
               df,
               model,
               description_model="Dự báo tuần và sai số trên tập test là 2.8%"
               ):
    # params
    self.df = df
    self.model = model
    self.description_model = description_model

    # Split feature and label variables:
    feature_label_name = df.columns[:-2]
    label_name = df.columns[-2:-1]

    x = df[feature_label_name].values
    y = df[label_name].values

    # Fit the scalers
    self.feature_label_scaler_fit = StandardScaler().fit(x)
    self.label_scaler_fit = StandardScaler().fit(y)

    # Scale all data:
    self.x = self.feature_label_scaler_fit.transform(x)
    self.y = self.label_scaler_fit.transform(y)

  # get params for error distribution
  def get_params_error(self):
    """Return `(mean, sigma)` of the prediction error on every row except the last.

    The error is predicted price minus actual price, on the original price
    scale. Sigma is the population standard deviation (divides by n). Both
    values are scalars.
    """
    predict_price_s = self.model.predict(self.x[:-1])
    predict_price_s = self.label_scaler_fit.inverse_transform(predict_price_s)

    # Scale the true labels back to the original price scale
    truth_price_s = self.label_scaler_fit.inverse_transform(self.y[:-1])

    error_s = predict_price_s - truth_price_s

    # np.std (ddof=0) matches the previous hand-written formula but returns a
    # scalar instead of a shape-(1,) array.
    return np.mean(error_s), np.std(error_s)

  # get pair price
  def get_pair_price(self):
    """Return `(actual price of the second-to-last row, predicted price of the last row)`.

    Both values are returned as arrays on the original price scale, exactly as
    `inverse_transform` yields them.
    """
    price_actual_yesterday = self.label_scaler_fit.inverse_transform(self.y[-2:-1])
    price_predict_today = self.model.predict(self.x[-1:])
    price_predict_today = self.label_scaler_fit.inverse_transform(price_predict_today)

    return price_actual_yesterday, price_predict_today

  # run class
  def run(self):
    """Compute the percentage probability of each change-rate bucket.

    Returns:
      One-row DataFrame with the note ('Ghi chú'), one column per bucket
      (values in percent, rounded to 2 decimals) and 'Date', the date of the
      last row of `df`.
    """
    mean_error, sigma_error = self.get_params_error()
    price_actual_yesterday, price_predict_today = self.get_pair_price()
    # Collapse the size-1 arrays to plain floats before they are used as
    # interval bounds downstream.
    price_actual_yesterday = float(np.asarray(price_actual_yesterday).item())
    price_predict_today = float(np.asarray(price_predict_today).item())

    dict_range_error_s = {
        'Nhỏ hơn -5%': [-np.inf, -0.05],
        'Từ -5% đến -3%': [-0.05, -0.03],
        'Từ -3% đến 0%': [-0.03, 0],
        'Từ 0% đến 3%': [0, 0.03],
        'Từ 3% đến 5%': [0.03, 0.05],
        'Lớn hơn 5%': [0.05, np.inf]
        }

    result = {
        'Ghi chú': self.description_model
        }
    for description, range_change_rate in dict_range_error_s.items():
      density_proba = ProbaError(price_actual_yesterday,
                                 price_predict_today,
                                 params_normal=(mean_error, sigma_error),
                                 range_change_rate=range_change_rate).density_proba_normal()
      result[description] = round(density_proba*100,2)

    # Build the frame once, after every bucket has been filled in.
    df_result = pd.DataFrame(result, index=[0])
    df_result['Date'] = self.df['date'].values[-1]

    return df_result

In [11]:
# Bucket probabilities for the last 10 truncated histories plus the full history.
# Frames are collected in a list and concatenated once; DataFrame.append no
# longer exists in pandas >= 2.0.
classifi_frames = []
for i in range(-10,0,1):
    # Drop the last |i| rows so the table's last row is an earlier date.
    split_data = df_table[:i]
    classifi_point = PredictPrice(split_data,
                                  model = model,
                                  description_model="Dự báo tuần và sai số trên tập test là 8%").run()
    classifi_frames.append(classifi_point)


# Predict max point (the full table, i.e. the latest available date)
classifi_point = PredictPrice(df_table,
                              model = model,
                              description_model="Dự báo tuần và sai số trên tập test là 2.8%").run()
classifi_frames.append(classifi_point)

# Each per-date frame keeps its own index label 0, as before.
classifi_df = pd.concat(classifi_frames)

# Classification trend: total probability of a non-negative vs. a negative change
classifi_df['Increase'] = classifi_df[['Từ 0% đến 3%', 'Từ 3% đến 5%', 'Lớn hơn 5%',]].sum(axis=1)
classifi_df['Decrease'] = classifi_df[['Nhỏ hơn -5%', 'Từ -5% đến -3%', 'Từ -3% đến 0%']].sum(axis=1)



  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)
  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)
  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)
  classifi_df = classifi_df.append(classifi_point)




  classifi_df = classifi_df.append(classifi_point)


In [12]:

# Display the date, the six bucket probabilities (in percent) and the two trend totals.
classifi_df[[
    'Date',
    'Nhỏ hơn -5%', 'Từ -5% đến -3%', 'Từ -3% đến 0%',
    'Từ 0% đến 3%', 'Từ 3% đến 5%', 'Lớn hơn 5%',
    'Increase', 'Decrease',
]]

Unnamed: 0,Date,Nhỏ hơn -5%,Từ -5% đến -3%,Từ -3% đến 0%,Từ 0% đến 3%,Từ 3% đến 5%,Lớn hơn 5%,Increase,Decrease
0,2022-11-25,13.4,7.6,15.1,17.7,11.6,34.6,63.9,36.1
0,2022-12-02,16.1,8.4,16.0,17.8,11.2,30.5,59.5,40.5
0,2022-12-09,18.6,9.3,17.0,18.0,10.8,26.2,55.0,44.9
0,2022-12-16,18.2,9.4,17.3,18.4,11.0,25.7,55.1,44.9
0,2022-12-23,15.9,8.6,16.5,18.3,11.4,29.2,58.9,41.0
0,2022-12-30,16.2,8.3,15.8,17.6,11.1,30.9,59.6,40.3
0,2023-01-06,20.1,9.4,17.0,17.6,10.5,25.4,53.5,46.5
0,2023-01-13,19.5,9.5,17.2,17.9,10.6,25.3,53.8,46.2
0,2023-02-03,15.0,8.3,16.1,18.2,11.5,30.8,60.5,39.4
0,2023-02-10,15.2,8.1,15.7,17.8,11.3,31.8,60.9,39.0
