In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
import statsmodels.api as sm

In [2]:
cof_df = pd.read_excel("coffee_data.xlsx")

In [3]:
cof_df

Unnamed: 0,Tên_mặt_hàng,Đơn_vị_tính,Loại_giá,Loại_tiền,Ngày,Giá
0,Cà phê Robusta nhân xô,VNĐ/kg,Thu mua,VNĐ,2021-01-01,31667.5
1,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2021-01-04,32717.0
2,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2021-01-05,32417.0
3,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2021-01-06,32317.0
4,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2021-01-07,32117.0
...,...,...,...,...,...,...
619,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2023-05-15,55566.5
620,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2023-05-16,56066.5
621,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2023-05-17,56666.5
622,Cà phê Robusta nhân xô,Vnđ/kg,Thu mua,VNĐ,2023-05-18,57000.0


In [4]:
# Convert 'Ngày' column to datetime type
cof_df['Ngày'] = pd.to_datetime(cof_df['Ngày'])
# Sort the DataFrame by date in ascending order
cof_df.sort_values(by='Ngày', inplace=True)
cof_df['Giá Cà Phê'] =  cof_df['Giá']

cof_df = cof_df.drop(["Loại_tiền", "Loại_giá", "Tên_mặt_hàng", "Đơn_vị_tính", "Giá"], axis = 1)
cof_df

Unnamed: 0,Ngày,Giá Cà Phê
0,2021-01-01,31667.5
1,2021-01-04,32717.0
2,2021-01-05,32417.0
3,2021-01-06,32317.0
4,2021-01-07,32117.0
...,...,...
619,2023-05-15,55566.5
620,2023-05-16,56066.5
621,2023-05-17,56666.5
622,2023-05-18,57000.0


In [5]:
gas_df = pd.read_excel("fuel_data.xlsx")
gas_df

Unnamed: 0,Giá nhiên liệu,Xăng RON 95 1 lít,Xăng E5 RON 92 1 lít,Dầu Diesel 1 lít,Dầu Hỏa 1 lít
0,12.06.2023,22.01,20.87,18.02,17.82
1,01.06.2023,22.01,20.87,17.94,17.77
2,22.05.2023,21.49,20.48,17.95,17.96
3,11.05.2023,21.00,20.13,17.65,17.97
4,04.05.2023,22.32,21.43,18.25,18.52
...,...,...,...,...,...
114,02.03.2019,18.54,17.21,15.86,14.88
115,15.02.2019,17.60,16.27,14.90,14.18
116,31.01.2019,17.60,16.27,14.90,14.18
117,16.01.2019,17.60,16.27,14.90,14.18


In [6]:
gas_df["Ngày"] = pd.to_datetime(gas_df["Giá nhiên liệu"], format="%d.%m.%Y")
gas_df.drop(gas_df.columns[[0,1,2,4]], axis='columns', inplace=True)
gas_df

Unnamed: 0,Dầu Diesel 1 lít,Ngày
0,18.02,2023-06-12
1,17.94,2023-06-01
2,17.95,2023-05-22
3,17.65,2023-05-11
4,18.25,2023-05-04
...,...,...
114,15.86,2019-03-02
115,14.90,2019-02-15
116,14.90,2019-01-31
117,14.90,2019-01-16


In [7]:
# Merge the dataframes with an outer join
merged_df = pd.merge(cof_df, gas_df, left_on='Ngày', right_on='Ngày', how="outer")

# Fill missing dates from both datasets with each other
merged_df.fillna(method='bfill', inplace=True)

In [8]:
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,Ngày,Giá Cà Phê,Dầu Diesel 1 lít
0,2021-01-01,31667.5,12.64
1,2021-01-04,32717.0,12.64
2,2021-01-05,32417.0,12.64
3,2021-01-06,32317.0,12.64
4,2021-01-07,32117.0,12.64
...,...,...,...
619,2023-05-15,55566.5,18.02
620,2023-05-16,56066.5,18.02
621,2023-05-17,56666.5,18.02
622,2023-05-18,57000.0,18.02


In [9]:
data = merged_df

In [10]:
# scaler = MinMaxScaler()
# df[["Giá Cà Phê", "Dầu Diesel 1 lít"]] = scaler.fit_transform(df[["Giá Cà Phê", "Dầu Diesel 1 lít"]])

In [11]:
data

Unnamed: 0,Ngày,Giá Cà Phê,Dầu Diesel 1 lít
0,2021-01-01,31667.5,12.64
1,2021-01-04,32717.0,12.64
2,2021-01-05,32417.0,12.64
3,2021-01-06,32317.0,12.64
4,2021-01-07,32117.0,12.64
...,...,...,...
619,2023-05-15,55566.5,18.02
620,2023-05-16,56066.5,18.02
621,2023-05-17,56666.5,18.02
622,2023-05-18,57000.0,18.02


In [12]:
# drop date column
data.drop(data.columns[[0]], axis=1, inplace=True)
data.dropna()

Unnamed: 0,Giá Cà Phê,Dầu Diesel 1 lít
0,31667.5,12.64
1,32717.0,12.64
2,32417.0,12.64
3,32317.0,12.64
4,32117.0,12.64
...,...,...
619,55566.5,18.02
620,56066.5,18.02
621,56666.5,18.02
622,57000.0,18.02


In [13]:
df = merged_df
# Step 3: Feature Extraction with Random Forest (RF)
X_rf = df[['Dầu Diesel 1 lít']]
y_rf = df['Giá Cà Phê']

rf_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_rf, y_rf)

# Extract the feature from RF model
extracted_feature = rf_model.predict(X_rf)

# Step 4: Data Preparation for RNN
# Combine the diesel price and extracted feature for RNN input
X_rnn = np.column_stack((X_rf, extracted_feature))
y_rnn = y_rf.values

# Normalize the features (optional but often beneficial for RNNs)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_rnn = scaler.fit_transform(X_rnn)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rnn, y_rnn, test_size=0.2, random_state=42)

# Step 5: Build and Train the RNN Model
rnn_model = Sequential()
rnn_model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
rnn_model.add(Dense(1))
rnn_model.compile(optimizer='adam', loss='mean_squared_error')

# Reshape X_train and X_test to (samples, timesteps, features) for LSTM input
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

rnn_model.fit(X_train, y_train, epochs=100, batch_size=16)

# Step 6: Evaluate the RNN Model
y_pred_rnn = rnn_model.predict(X_test)
mse_rnn = mean_squared_error(y_test, y_pred_rnn)
print(f'RNN Mean Squared Error: {mse_rnn}')

# Step 7: SARIMA Modeling
y_sarima = df['Giá Cà Phê']
sarima_model = sm.tsa.SARIMA(y_sarima, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
sarima_model_fit = sarima_model.fit()

# Step 8: Make Predictions with SARIMA
# Assuming you have new data with diesel price and you want to predict the coffee price
new_diesel_price = 18.02
new_feature = rf_model.predict(np.array([[new_diesel_price]]))

# Combine the new diesel price and new feature for RNN input
new_input = np.column_stack((new_diesel_price, new_feature))
new_input = scaler.transform(new_input)

# Reshape new_input to (samples, timesteps, features) for LSTM input
new_input = new_input.reshape((1, new_input.shape[0], 1))

# Predict using RNN
predicted_coffee_price_rnn = rnn_model.predict(new_input)[0][0]

# Predict using SARIMA
predicted_coffee_price_sarima = sarima_model_fit.forecast(steps=1)[0]

print(f'Predicted Coffee Price (RNN): {predicted_coffee_price_rnn}')
print(f'Predicted Coffee Price (SARIMA): {predicted_coffee_price_sarima}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
RNN Mean Squared Error: 33745574.65636951


AttributeError: module 'statsmodels.tsa.api' has no attribute 'SARIMA'

# ---------------------------------------------------------------------------------

In [None]:


# X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2, random_state=42)

# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train_rf)

In [None]:
# # Step 4: Train the Random Forest Model for feature extraction
# rf_model = RandomForestRegressor(random_state=42)
# rf_model.fit(X_train_scaled, y_train_rf)

# # Step 5: Extract the Feature from Random Forest Model
# extracted_feature_train = rf_model.predict(X_train_scaled)
# extracted_feature_test = rf_model.predict(X_test_rf)

# # Step 6: Data Preparation for RNN
# # Combine the extracted feature with 'Dầu Diesel 1 lít' as input for RNN
# X_train_rnn = np.column_stack((X_train_scaled, extracted_feature_train))
# X_test_rnn = np.column_stack((X_test_rf, extracted_feature_test))

# # Reshape X_train_rnn and X_test_rnn to (samples, timesteps, features) for LSTM input
# X_train_rnn = X_train_rnn.reshape((X_train_rnn.shape[0], 2, 1))
# X_test_rnn = X_test_rnn.reshape((X_test_rnn.shape[0], 2, 1))

In [None]:
# # Step 7: Build and Train the RNN Model
# rnn_model = Sequential()
# rnn_model.add(LSTM(50, activation='relu', input_shape=(2, 1)))
# rnn_model.add(Dense(1))
# rnn_model.compile(optimizer='adam', loss='mean_squared_error')

# rnn_model.fit(X_train_rnn, y_train_rf, epochs=100, batch_size=16)


In [None]:
# # Step 8: Evaluate the RNN Model
# y_pred_rnn = rnn_model.predict(X_test_rnn)
# mse_rnn = mean_squared_error(y_test_rf, y_pred_rnn)
# print(f'RNN Model Mean Squared Error: {mse_rnn}')


In [None]:
# # Step 9: Make Predictions
# # Assuming you have new gas price data
# new_gas_price = 18.02
# new_feature = rf_model.predict(np.array([[new_gas_price]]))

# # Reshape new_feature to match the shape of the extracted_feature from training data
# new_feature = new_feature.reshape((-1, 1))

# # Concatenate the new_gas_price and new_feature as input to RNN
# new_input_rnn = np.concatenate((np.array([[new_gas_price]]), new_feature), axis=1)

# # Reshape new_input_rnn to (samples, timesteps, features) for LSTM input
# new_input_rnn = new_input_rnn.reshape((1, 2, 1))

# predicted_vegetable_price = rnn_model.predict(new_input_rnn)[0][0]
# print(f'Predicted Vegetable Price: {predicted_vegetable_price}')