In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers import Dense
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV
from datetime import datetime, timedelta


In [None]:
# Load the raw table from 'dataset.csv' (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Preview the first rows of the raw frame to check column names and value formats.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,12/29/2014,8214.700195,8279.150391,8214.700195,8246.299805,8246.299805,82100.0
1,12/30/2014,8260.299805,8268.25,8220.549805,8248.25,8248.25,77700.0
2,12/31/2014,8243.900391,8291.0,8243.75,8282.700195,8282.700195,84500.0
3,1/1/2015,,,,,,
4,1/2/2015,8288.700195,8410.599609,8288.700195,8395.450195,8395.450195,101900.0


In [None]:
# (rows, columns) of the raw frame as loaded.
data.shape

(1377, 7)

# Handling missing values

In [None]:
# Calendar bounds of the study period; pd.date_range includes both end points.
start_date = datetime(2014, 12, 29)
end_date = datetime(2020, 7, 31)
date_range = pd.date_range(start=start_date, end=end_date)

# Keep only Monday-Friday (weekday codes 0-4); Saturdays and Sundays are dropped.
filtered_dates = list(date_range[date_range.weekday < 5])

# One row per retained weekday; this frame is the left side of the join below.
full_date = pd.DataFrame({'Date': filtered_dates})

# Convert 'Date' to datetime64 so it can be matched against the calendar.
data['Date'] = data['Date'].astype('datetime64[ns]')

# Left join: every weekday of the calendar is kept, and weekdays without a
# matching row in `data` get NaN in all remaining columns.
data = full_date.merge(data, on='Date', how='left')

In [None]:
# Shape after the left join onto the weekday calendar.
data.shape

(1460, 7)

In [None]:
# Number of missing values per column after the join.
data.isnull().sum()

Date          0
Open         91
High         91
Low          91
Close        91
Adj Close    91
Volume       91
dtype: int64

In [None]:
# Fill the gaps (weekdays without a quote) by carrying the last known values
# forward. A whole-period column mean is unsuitable for a chronological price
# series: it places a value computed from later dates into earlier rows
# (look-ahead leakage) and creates artificial jumps between neighbouring days.
# The trailing backward fill only affects leading gaps, which have no earlier
# observation to carry forward.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
data = data.ffill().bfill()

In [None]:
# Work on an independent (deep) copy so the steps below do not modify `data`.
df = data.copy(deep=True)

In [None]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-12-29,8214.700195,8279.150391,8214.700195,8246.299805,8246.299805,82100.0
1,2014-12-30,8260.299805,8268.25,8220.549805,8248.25,8248.25,77700.0
2,2014-12-31,8243.900391,8291.0,8243.75,8282.700195,8282.700195,84500.0
3,2015-01-01,9719.67988,9765.624733,9655.095867,9710.01206,9710.01206,303269.028488
4,2015-01-02,8288.700195,8410.599609,8288.700195,8395.450195,8395.450195,101900.0


### Feature engineering



In [None]:
# Remove the 'Adj Close' column.
# NOTE(review): presumably dropped because it duplicates 'Close' (the two match
# in the rows previewed earlier) - confirm this holds for the whole dataset.
df = df.drop(columns=['Adj Close'])

In [None]:
# Daily range feature: difference between the 'High' and 'Low' columns.
df['Range'] = df['High'].sub(df['Low'])

In [None]:
# Largest value in the 'Open' column.
df['Open'].max()

12430.5

In [None]:
# Preview the frame after dropping 'Adj Close' and adding 'Range'.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Range
0,2014-12-29,8214.700195,8279.150391,8214.700195,8246.299805,82100.0,64.450196
1,2014-12-30,8260.299805,8268.25,8220.549805,8248.25,77700.0,47.700195
2,2014-12-31,8243.900391,8291.0,8243.75,8282.700195,84500.0,47.25
3,2015-01-01,9719.67988,9765.624733,9655.095867,9710.01206,303269.028488,110.528866
4,2015-01-02,8288.700195,8410.599609,8288.700195,8395.450195,101900.0,121.899414


### Data normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this scaler is instantiated but neither fitted nor applied in
# this cell; the normalisation below is computed by hand.
scaler = MinMaxScaler()

# Scale a copy so the unscaled values stay available in `df`.
df_scaled = df.copy()

# Every column except 'Date' is rescaled with (x - min) / (max - min).
# NOTE(review): min and max are taken over all rows, so rows that are later
# assigned to the test split also influence the scaling - confirm this is intended.
columns_to_scale = [name for name in df.columns if name != 'Date']
df_scaled[columns_to_scale] = df_scaled[columns_to_scale].apply(
    lambda series: (series - series.min()) / (series.max() - series.min())
)

In [None]:
# Preview the first rows of the normalised frame.
df_scaled.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Range
0,2014-12-29,0.220285,0.230704,0.25273,0.236604,0.045334,0.025042
1,2014-12-30,0.228719,0.228684,0.253794,0.236966,0.042904,0.014436
2,2014-12-31,0.225686,0.2329,0.258016,0.243356,0.046659,0.014151
3,2015-01-01,0.498632,0.506166,0.514829,0.508079,0.167459,0.054218
4,2015-01-02,0.233972,0.255064,0.266195,0.264267,0.056267,0.061417


### Data Splitting

In [None]:
# Inclusive date bounds of the two chronological partitions.
training_start_date = '2014-12-29'
training_end_date = '2018-12-28'
testing_start_date = '2018-12-31'
testing_end_date = '2020-07-31'

# Series.between includes both end points by default, which matches a paired
# ">= start" and "<= end" comparison.
train_data = df_scaled.loc[
    df_scaled['Date'].between(training_start_date, training_end_date)
]
test_data = df_scaled.loc[
    df_scaled['Date'].between(testing_start_date, testing_end_date)
]

In [None]:
# Preview the first rows of the training partition.
train_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Range
0,2014-12-29,0.220285,0.230704,0.25273,0.236604,0.045334,0.025042
1,2014-12-30,0.228719,0.228684,0.253794,0.236966,0.042904,0.014436
2,2014-12-31,0.225686,0.2329,0.258016,0.243356,0.046659,0.014151
3,2015-01-01,0.498632,0.506166,0.514829,0.508079,0.167459,0.054218
4,2015-01-02,0.233972,0.255064,0.266195,0.264267,0.056267,0.061417
