# <center>Stock Price Fluctation</center>

# Phân tích yêu cầu của công ty
## Yêu cầu của công ty
Hãy sử dụng dữ liệu được cung cấp trong email này (dữ liệu giá và khối lượng của một vài mã cổ phiếu) để xây dựng một số mô hình dự đoán biến động giá cổ phiếu.
**Biến động giá cổ phiếu = Giá cổ phiếu N (phút hoặc giờ hoặc ngày) sau - Giá cổ phiếu hiện tại.**
Công ty cung cấp 4 file dữ liệu từ các công ty FPT, MSN, PNJ và VIC bao gồm các trường
- Open: Giá mở bán cổ phiếu
- High: Giá cao nhất của cổ phiếu trong khoảng thời gian đó.
- Low: Giá thấp nhất trong khoảng thời gian đó.
- Close: Giá đóng
- Volume: Khối lượng giao dịch
# Phân tích
## Xác định target value
$StockPriceFluctuation = Close_{t+1} - Close_{t}$

# Import library

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# read data

In [29]:

def load_and_process_stock_data(file_paths, N):
    dataframes = []

    for file_path in file_paths:
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Ensure the dataframe has the required columns
        required_columns = ['Ticker', 'Date/Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Open Interest']
        if not all(col in df.columns for col in required_columns):
            print(f"Warning: {file_path} is missing some required columns. Skipping this file.")
            continue

        # Convert Date/Time to datetime
        df['Date/Time'] = pd.to_datetime(df['Date/Time'], format='%m/%d/%Y %H:%M')

        # Sort by Date/Time in descending order (newest first)
        df = df.sort_values('Date/Time', ascending=False).reset_index(drop=True)

        # Calculate price fluctuation
        df['StockFluctuation'] = df['Close'] - df['Close'].shift(-N)

        # Drop rows with NaN values resulting from the shift operation
        df = df.dropna()

        dataframes.append(df)

    # Combine all dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

# List of file paths (adjust these to match your actual file paths)
file_paths = [
    'data/FPT.csv',
    'data/MSN.csv',
    'data/PNJ.csv',
    'data/VIC.csv'
]

# Set N for the number of periods to look ahead (e.g., 1 for next minute, 5 for 5 minutes ahead, etc.)
N = 1

# Process the data
result_df = load_and_process_stock_data(file_paths, N)

# Display the first few rows and info of the resulting dataframe
print(result_df.head())
print(result_df.info())

# Optional: Save the processed data to a new CSV file
result_df.to_csv('processed_stock_data.csv', index=False)
print("Processed data saved to 'processed_stock_data.csv'")

  Ticker           Date/Time  Open  High   Low  Close  Volume  Open Interest  \
0    FPT 2020-12-22 14:46:00  58.1  58.1  58.1   58.1   11170              0   
1    FPT 2020-12-22 14:29:00  58.1  58.1  58.1   58.1    2500              0   
2    FPT 2020-12-22 14:26:00  58.2  58.2  58.2   58.2     500              0   
3    FPT 2020-12-22 14:25:00  58.2  58.2  58.2   58.2   14820              0   
4    FPT 2020-12-22 14:24:00  58.2  58.2  58.2   58.2   27470              0   

   StockFluctuation  
0               0.0  
1              -0.1  
2               0.0  
3               0.0  
4               0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459331 entries, 0 to 459330
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Ticker            459331 non-null  object        
 1   Date/Time         459331 non-null  datetime64[ns]
 2   Open              459331 non-null  float64       
 3

In [30]:
# Optional: Group by Ticker and display summary statistics
summary_stats = result_df.groupby('Ticker').agg({
    'Open': 'mean',
    'High': 'max',
    'Low': 'min',
    'Close': 'mean',
    'Volume': 'sum',
    'StockFluctuation': ['mean', 'std']
})
print("\nSummary Statistics by Ticker:")
print(summary_stats)


Summary Statistics by Ticker:
              Open    High    Low       Close     Volume StockFluctuation  \
              mean     max    min        mean        sum             mean   
Ticker                                                                      
FPT      45.071776   58.50  30.22   45.071777  783515270         0.000279   
MSN      74.878822  118.00  46.40   74.880772  769290770         0.000072   
PNJ      72.182005  100.16  44.04   72.179317  379915372        -0.000006   
VIC     105.376528  126.50  68.00  105.378859  322292790         0.000071   

                  
             std  
Ticker            
FPT     0.063485  
MSN     0.216196  
PNJ     0.690948  
VIC     0.199338  


# Data preprocessing


## Scaling data

In [31]:
# Initialize scalers for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Define feature columns to scale
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# Apply scaling for each company
scaled_dfs = []
for ticker, group in result_df.groupby('Ticker'):
    # Scale features
    group[feature_columns] = feature_scaler.fit_transform(group[feature_columns])
    # Scale target (StockFluctuation)
    group['StockFluctuation'] = target_scaler.fit_transform(group[['StockFluctuation']])
    scaled_dfs.append(group)

# Combine back the scaled data
scaled_combined_df = pd.concat(scaled_dfs, axis=0)


## Create Time Series Sequences for LSTM

In [32]:
def create_sequences(data, feature_columns, target_column, lookback):
    X, y = [], []
    for i in range(len(data) - lookback):
        X.append(data[feature_columns].iloc[i:i+lookback].values)
        y.append(data[target_column].iloc[i+lookback])
    return np.array(X), np.array(y)

X_list, y_list = [], []
lookback_period = 30

for ticker, group in scaled_combined_df.groupby('Ticker'):
    X, y = create_sequences(group, feature_columns, 'StockFluctuation', lookback_period)
    X_list.append(X)
    y_list.append(y)

# Concatenate all the sequences from different companies
X_combined = np.concatenate(X_list, axis=0)
y_combined = np.concatenate(y_list, axis=0)


## Train-Test Split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, shuffle=False)

# Build LSTM model


In [8]:

model = Sequential()

# Add LSTM layers
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))

# Output layer to predict stock fluctuation
model.add(Dense(units=1))

  super().__init__(**kwargs)


In [9]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [10]:
# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 7ms/step - loss: 0.0084 - val_loss: 0.0061
Epoch 2/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 7ms/step - loss: 0.0059 - val_loss: 0.0067
Epoch 3/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 7ms/step - loss: 0.0055 - val_loss: 0.0067
Epoch 4/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - loss: 0.0051 - val_loss: 0.0074
Epoch 5/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 7ms/step - loss: 0.0049 - val_loss: 0.0080
Epoch 6/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 7ms/step - loss: 0.0048 - val_loss: 0.0088
Epoch 7/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 7ms/step - loss: 0.0047 - val_loss: 0.0084
Epoch 8/100
[1m11481/11481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 7ms/step - loss: 0.0047 - val_loss

In [11]:
model.save('keras_model.h5')



# Evaluate

In [34]:
y_pred = model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred)
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

[1m2871/2871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step


### MSE

In [35]:
mse = mean_squared_error(y_pred_rescaled, y_test_rescaled)
print(f'Mean Squared Error: {mse}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Mean Squared Error: 2.2274699621398075
Root Mean Squared Error: 1.4924710925642102


### MAE

In [36]:
mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 1.3556994579401462


### R2 score

In [37]:
r2 = r2_score(y_test_rescaled, y_pred_rescaled)
print("R2 score:", r2)

R2 score: -55.43829290611654
