In [41]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
# Import and pre-process 2023 data of DSE
import os
import pandas as pd

# Path to your CSV files
csv_folder_path = '/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/DSE_data_2023'

# Read CSV files and store in a dictionary
DSE_dict_2023 = {}
for filename in os.listdir(csv_folder_path):
    if filename.endswith('.csv'):
        # Extract date from filename
        date = filename.split('_')[-1].split('.')[0]
        # Read CSV file into DataFrame
        df = pd.read_csv(os.path.join(csv_folder_path, filename))
        # Add DataFrame to dictionary with date as key
        DSE_dict_2023[date] = df

In [43]:
# Example: Access DataFrame of DSE for a specific date
date = '20230101'  # Change this to the date you want
if date in DSE_dict_2023:
    print("Data for", date)
    print(DSE_dict_2023[date])
else:
    print("Data for", date, "not found")

Data for 20230101
         Date       Scrip        Open        High         Low       Close  \
0    20230101      00DS30  2195.30158  2205.14579  2192.68497  2193.60229   
1    20230101      00DSES  1358.83640  1365.93691  1355.43267  1355.59149   
2    20230101      00DSEX  6206.81389  6222.89281  6194.37735  6195.37281   
3    20230101  1STPRIMFMF    16.40000    16.40000    16.40000    16.40000   
4    20230101    AAMRANET    52.80000    52.80000    49.70000    50.30000   
..        ...         ...         ...         ...         ...         ...   
329  20230101    WATACHEM   200.20000   200.20000   200.20000   200.20000   
330  20230101  WMSHIPYARD    11.00000    11.00000    11.00000    11.00000   
331  20230101         YPL    18.70000    18.70000    18.70000    18.70000   
332  20230101  ZAHEENSPIN    12.30000    12.30000    12.30000    12.30000   
333  20230101  ZEALBANGLA   170.80000   170.80000   170.80000   170.80000   

         Volume  
0    1784259000  
1    1784259000  
2  

In [44]:
# Keep data of only indices DS30, DSEX and 30 stocks in DS30 index
# List of desired stock codes
DS30_stocks = [
    'IFIC',
    'BATBC',
    'BEACONPHAR',
    'BEXIMCO',
    'ORIONPHARM',
    'BRACBANK',
    'BSC',
    'DELTALIFE',
    'BSRMLTD',
    'POWERGRID',
    'FORTUNE',
    'GP',
    'BSCCL',
    'ROBI',
    'SOUTHEASTB',
    'ISLAMIBANK',
    'BBSCABLES',
    'BXPHARMA',
    'LHBL',
    'CITYBANK',
    'MPETROLEUM',
    'OLYMPIC',
    'GPHISPAT',
    'RENATA',
    'SEAPEARL',
    'TITASGAS',
    'SQURPHARMA',
    'UPGDCL',
    'UNIQUEHRL',
    'IDLC','00DS30','00DSEX'
]

# Iterate over dataframes and filter rows
for date, df in DSE_dict_2023.items():
    # Keep rows with desired stock codes
    df = df[df['Scrip'].isin(DS30_stocks)]
    # Update dataframe in dictionary
    DSE_dict_2023[date] = df

In [45]:
# Recheck dataframe to see if non-neccesary stocks have been removed
date = '20230101'  # Change this to the date you want
if date in DSE_dict_2023:
    print("Data for", date)
    print(DSE_dict_2023[date])
else:
    print("Data for", date, "not found")

Data for 20230101
         Date       Scrip        Open        High         Low       Close  \
0    20230101      00DS30  2195.30158  2205.14579  2192.68497  2193.60229   
2    20230101      00DSEX  6206.81389  6222.89281  6194.37735  6195.37281   
42   20230101       BATBC   518.70000   518.70000   518.70000   518.70000   
45   20230101   BBSCABLES    49.90000    49.90000    49.90000    49.90000   
53   20230101  BEACONPHAR   304.50000   304.50000   282.10000   284.00000   
57   20230101     BEXIMCO   115.60000   115.60000   115.60000   115.60000   
62   20230101    BRACBANK    38.50000    38.50000    38.50000    38.50000   
63   20230101         BSC   115.90000   116.50000   112.80000   113.20000   
64   20230101       BSCCL   218.90000   218.90000   218.90000   218.90000   
65   20230101     BSRMLTD    90.00000    90.00000    90.00000    90.00000   
67   20230101    BXPHARMA   146.20000   147.00000   146.20000   146.20000   
71   20230101    CITYBANK    21.80000    21.80000    21.80

In [46]:
# Concatenate all DataFrames in the dictionary
dse_df_2023 = pd.concat(DSE_dict_2023.values(), ignore_index=True)

# Print the resulting DataFrame
print(dse_df_2023)
dse_df_2023.to_csv('/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/dse_df_2023.csv', index=False)


          Date       Scrip        Open        High         Low       Close  \
0     20230101      00DS30  2195.30158  2205.14579  2192.68497  2193.60229   
1     20230101      00DSEX  6206.81389  6222.89281  6194.37735  6195.37281   
2     20230101       BATBC   518.70000   518.70000   518.70000   518.70000   
3     20230101   BBSCABLES    49.90000    49.90000    49.90000    49.90000   
4     20230101  BEACONPHAR   304.50000   304.50000   282.10000   284.00000   
...        ...         ...         ...         ...         ...         ...   
7220  20231228    SEAPEARL    81.70000    99.70000    81.70000    99.70000   
7221  20231228  SOUTHEASTB    13.30000    13.30000    13.30000    13.30000   
7222  20231228  SQURPHARMA   210.80000   211.20000   209.90000   210.30000   
7223  20231228    TITASGAS    40.90000    40.90000    40.90000    40.90000   
7224  20231228   UNIQUEHRL    55.40000    56.90000    55.20000    56.70000   

          Volume  
0     1784259000  
1     1784259000  
2     

In [47]:
!pip install xlwt
import os
import requests
import pandas as pd
# Make a scraping code to extract stock price data for VN market in 2019
def scrape_and_save_to_excel(symbol, start_date, end_date, page_index=1, page_size=365):
    # API endpoint URL
    api_url = f"https://s.cafef.vn/Ajax/PageNew/DataHistory/PriceHistory.ashx?Symbol={symbol}&StartDate={start_date}&EndDate={end_date}&PageIndex={page_index}&PageSize={page_size}"

    # Send HTTP GET request to the API
    response = requests.get(api_url)

    # Check if request was successful
    if response.status_code == 200:
        # Parse JSON response
        data = response.json()

        # Extract historical price data
        historical_data = data['Data']['Data']

        # Create DataFrame
        df = pd.DataFrame(historical_data)

        # Define target directory
        target_directory = '/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019'

        # Create target directory if it doesn't exist
        os.makedirs(target_directory, exist_ok=True)

        # Save DataFrame to Excel file in target directory
        # Replace slashes with underscores in the filename
        excel_filename = f"{symbol}_{start_date.replace('/', '_')}_{end_date.replace('/', '_')}.xls"
        excel_filepath = os.path.join(target_directory, excel_filename)
        df.to_excel(excel_filepath, index=False)
        print(f"Excel file saved: {excel_filepath}")
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")

# Run iteration through all VN30 stocks and 2 indices
tickers = [
    'CII', 'CTD', 'CTG', 'DHG', 'DPM', 'EIB', 'FPT', 'GAS', 'GMD', 'HDB',
    'HPG', 'MBB', 'MSN', 'MWG', 'NVL', 'PNJ', 'REE', 'ROS', 'SAB', 'SBT',
    'SSI', 'STB', 'TCB', 'VCB', 'VHM', 'VIC', 'VJC', 'VNM', 'VPB', 'VRE',
    'VNINDEX', 'VN30INDEX'
]

start_date = '01/01/2019'
end_date = '12/31/2019'

for symbol in tickers:
    scrape_and_save_to_excel(symbol, start_date, end_date)




  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/CII_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/CTD_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/CTG_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/DHG_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/DPM_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/EIB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/FPT_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/GAS_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/GMD_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/HDB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/HPG_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/MBB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/MSN_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/MWG_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/NVL_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/PNJ_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/REE_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/ROS_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/SAB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/SBT_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/SSI_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/STB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/TCB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VCB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VHM_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VIC_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VJC_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VNM_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VPB_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VRE_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VNINDEX_01_01_2019_12_31_2019.xls
Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/VN30INDEX_01_01_2019_12_31_2019.xls


  df.to_excel(excel_filepath, index=False)


In [48]:
# Make a scraping code to extract stock price data for VN market in 2023
def scrape_and_save_to_excel(symbol, start_date, end_date, page_index=1, page_size=365):
    # API endpoint URL
    api_url = f"https://s.cafef.vn/Ajax/PageNew/DataHistory/PriceHistory.ashx?Symbol={symbol}&StartDate={start_date}&EndDate={end_date}&PageIndex={page_index}&PageSize={page_size}"

    # Send HTTP GET request to the API
    response = requests.get(api_url)

    # Check if request was successful
    if response.status_code == 200:
        # Parse JSON response
        data = response.json()

        # Extract historical price data
        historical_data = data['Data']['Data']

        # Create DataFrame
        df = pd.DataFrame(historical_data)

        # Define target directory
        target_directory = '/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023'

        # Create target directory if it doesn't exist
        os.makedirs(target_directory, exist_ok=True)

        # Save DataFrame to Excel file in target directory
        # Replace slashes with underscores in the filename
        excel_filename = f"{symbol}_{start_date.replace('/', '_')}_{end_date.replace('/', '_')}.xls"
        excel_filepath = os.path.join(target_directory, excel_filename)
        df.to_excel(excel_filepath, index=False)
        print(f"Excel file saved: {excel_filepath}")
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")

# Run iteration through all VN30 stocks and 2 indices
tickers = [
    'ACB', 'BCM', 'BID', 'BVH', 'CTG', 'FPT', 'GAS', 'GVR', 'HDB', 'HPG',
    'MBB', 'MSN', 'MWG', 'NVL', 'PDR', 'PLX', 'POW', 'SAB', 'SSI', 'STB',
    'TCB', 'TPB', 'VCB', 'VHM', 'VIB', 'VIC', 'VJC', 'VNM', 'VPB', 'VRE',
    'VNINDEX', 'VN30INDEX'
]

start_date = '01/01/2023'
end_date = '12/31/2023'

for symbol in tickers:
    scrape_and_save_to_excel(symbol, start_date, end_date)

  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/ACB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/BCM_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/BID_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/BVH_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/CTG_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/FPT_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/GAS_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/GVR_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/HDB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/HPG_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/MBB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/MSN_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/MWG_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/NVL_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/PDR_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/PLX_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/POW_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/SAB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/SSI_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/STB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/TCB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/TPB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VCB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VHM_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VIB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VIC_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VJC_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VNM_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VPB_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VRE_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VNINDEX_01_01_2023_12_31_2023.xls
Excel file saved: /content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/VN30INDEX_01_01_2023_12_31_2023.xls


  df.to_excel(excel_filepath, index=False)


In [49]:
# Import 2019 data of HOSE and pre-process
import os
import pandas as pd

# Function to process each file
def process_file(file_path):
    # Read the file into a DataFrame
    df = pd.read_excel(file_path)

    # Remove specified columns
    df.drop(columns=['GiaDieuChinh', 'ThayDoi', 'GiaTriKhopLenh', 'GtThoaThuan'], inplace=True)

    # Extract ticker from file name
    ticker = os.path.basename(file_path).split('_')[0]

    # Add 'ticker' column
    df.insert(1, 'ticker', ticker)

    # Calculate 'volume' and add as a new column
    df['volume'] = df['KhoiLuongKhopLenh'] + df['KLThoaThuan']

    # Remove columns 'KhoiLuongKhopLenh' and 'KLThoaThuan'
    df.drop(columns=['KhoiLuongKhopLenh', 'KLThoaThuan'], inplace=True)

    # Rename columns
    df.rename(columns={'Ngay': 'date', 'GiaDongCua': 'close', 'GiaMoCua': 'open', 'GiaCaoNhat': 'high', 'GiaThapNhat': 'low'}, inplace=True)

    return df

# Path to the folder containing the files
folder_path = '/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2019/'

# List to store DataFrames
dfs = []

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.xls'):
        file_path = os.path.join(folder_path, file_name)
        # Process each file and append the DataFrame to the list
        dfs.append(process_file(file_path))

# Concatenate all DataFrames into a single DataFrame
hose_df_2019 = pd.concat(dfs, ignore_index=True)


In [50]:
# Print the resulting DataFrame hose_df_2019 to check
print(hose_df_2019.head(n = 20))
print(hose_df_2019.tail(n = 20))

# Export cleaned data to another .csv file for backup
hose_df_2019.to_csv('/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/hose_df_2019.csv', index=False)

          date ticker  close   open   high    low   volume
0   31/12/2019    CII  22.50  22.40  22.50  22.00   285680
1   30/12/2019    CII  22.60  22.00  22.60  21.50   232600
2   27/12/2019    CII  22.00  22.00  22.05  21.90   155750
3   26/12/2019    CII  22.30  22.60  22.60  22.00    96450
4   25/12/2019    CII  22.60  22.40  22.60  21.90   183110
5   24/12/2019    CII  22.00  22.10  22.35  21.90   112620
6   23/12/2019    CII  22.10  22.60  22.75  22.10   558860
7   20/12/2019    CII  22.65  22.50  22.65  22.35   144250
8   19/12/2019    CII  22.50  22.80  22.80  22.40    76240
9   18/12/2019    CII  22.70  22.75  22.80  22.55   143720
10  17/12/2019    CII  22.75  22.60  22.75  22.30  1537540
11  16/12/2019    CII  22.75  22.60  22.90  22.55   255430
12  13/12/2019    CII  22.75  22.85  23.00  22.60    65840
13  12/12/2019    CII  22.75  22.40  22.85  22.35  2280580
14  11/12/2019    CII  22.50  22.90  22.90  22.50    70810
15  10/12/2019    CII  22.85  23.10  23.15  22.55  28715

In [51]:
# Re run the importing and pre-processing with data of 2023, HOSE
# Path to the folder containing the files
folder_path = '/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/HOSE_data_2023/'

# List to store DataFrames
dfs = []

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.xls'):
        file_path = os.path.join(folder_path, file_name)
        # Process each file and append the DataFrame to the list
        dfs.append(process_file(file_path))

# Concatenate all DataFrames into a single DataFrame
hose_df_2023 = pd.concat(dfs, ignore_index=True)

In [52]:
# Print the resulting DataFrame hose_df_2023 to check
print(hose_df_2023.head(n = 20))
print(hose_df_2023.tail(n = 20))

# Export cleaned data to another .csv file for backup
hose_df_2023.to_csv('/content/drive/My Drive/BISS/EMTH0009 - Master Thesis/Data/hose_df_2023.csv', index=False)


          date ticker  close  open  high   low   volume
0   29/12/2023    BCM   62.9  62.7  62.9  62.1   608700
1   28/12/2023    BCM   62.7  62.6  62.7  61.8   529400
2   27/12/2023    BCM   62.6  62.2  63.0  62.0   464200
3   26/12/2023    BCM   62.3  61.9  63.4  61.4   517700
4   25/12/2023    BCM   61.9  61.2  62.0  61.0   515700
5   22/12/2023    BCM   61.8  61.1  61.8  60.8   398000
6   21/12/2023    BCM   61.8  60.5  61.8  60.3   781800
7   20/12/2023    BCM   61.3  61.0  61.8  60.3   814100
8   19/12/2023    BCM   61.0  60.0  61.0  58.7  1038000
9   18/12/2023    BCM   60.7  61.5  61.5  59.7  1454400
10  15/12/2023    BCM   62.0  63.0  63.3  62.0   871900
11  14/12/2023    BCM   63.7  63.8  64.0  63.0   348300
12  13/12/2023    BCM   63.9  64.2  64.5  63.5  1005200
13  12/12/2023    BCM   64.4  65.0  65.3  64.1   336500
14  11/12/2023    BCM   65.1  65.1  65.9  63.9   453600
15  08/12/2023    BCM   65.0  64.5  67.0  64.4   514300
16  07/12/2023    BCM   64.8  67.7  67.7  63.3  