In [1]:
# Mount Google Drive into the Colab runtime so the project files under
# /content/drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys

# Put the project's analysis folder first on the module search path so modules
# stored there can be imported. Any earlier copy of the entry is dropped first,
# so re-running this cell does not pile up duplicates in sys.path.
ANALYSIS_DIR = '/content/drive/MyDrive/stat_project/analysis'
sys.path[:] = [entry for entry in sys.path if entry != ANALYSIS_DIR]
sys.path.insert(0, ANALYSIS_DIR)

In [3]:
# Switch the working directory to the analysis folder on the mounted Drive.
%cd /content/drive/MyDrive/stat_project/analysis

/content/drive/MyDrive/stat_project/analysis


In [4]:
import http.client
import json
import pandas as pd
import os
from datetime import datetime

# Folder where the downloaded CSV files are written.
output_folder = '/content/drive/MyDrive/stat_project/analysis/trainingset'
# exist_ok=True creates the folder (and any missing parents) only when needed,
# without a separate exists() check that can race with another process.
os.makedirs(output_folder, exist_ok=True)

# Crawl paginated price history from the s.cafef.vn API for a customizable date range
def fetch_fpt_data(symbol, start_date, end_date, output_dir=None, page_size=100, max_pages=1000):
    """Download the price history of ``symbol`` from s.cafef.vn and save it as CSV.

    The endpoint is paginated: pages are requested one after another until a
    page comes back without rows, or until ``max_pages`` pages have been read.
    The date, close-price and open-price columns are then written to
    ``<symbol>_stock_data.csv``.

    Args:
        symbol: Ticker sent as the ``Symbol`` query parameter, e.g. ``"FPT"``.
        start_date: Value of the ``StartDate`` query parameter (this notebook
            passes ``YYYY-MM-DD`` strings).
        end_date: Value of the ``EndDate`` query parameter, same format.
        output_dir: Folder that receives the CSV file. Defaults to the
            notebook-level ``output_folder``. It is created when missing.
        page_size: Number of records requested per page.
        max_pages: Upper bound on the number of pages requested, so a server
            that keeps returning rows cannot make the loop run forever.

    Returns:
        None. Progress and failures are reported with ``print``. When a
        response has a status other than 200, or its body cannot be decoded,
        the function returns early and nothing is saved.

    Raises:
        OSError, http.client.HTTPException: network failures are not caught;
            the connection is still closed before they propagate.
        KeyError: if the returned rows lack one of the expected columns.
    """
    # Imported here so this cell does not depend on an edit to the import cell.
    from urllib.parse import urlencode

    headers = {
        "Accept": "application/json",
        "Referer": "https://s.cafef.vn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    endpoint = "/Ajax/PageNew/DataHistory/PriceHistory.ashx"
    pages = []

    # One connection serves every page; the timeout keeps a stalled server from
    # blocking the notebook indefinitely.
    conn = http.client.HTTPSConnection("s.cafef.vn", timeout=30)
    try:
        for page_index in range(1, max_pages + 1):
            # urlencode escapes any character that is not safe in a query string.
            query = urlencode({
                "Symbol": symbol,
                "StartDate": start_date,
                "EndDate": end_date,
                "PageIndex": page_index,
                "PageSize": page_size,
            })
            conn.request("GET", f"{endpoint}?{query}", headers=headers)
            res = conn.getresponse()
            # The body must be read in full before the next request is sent.
            body = res.read()

            if res.status != 200:
                print(f"status code: {res.status}")
                return None

            try:
                payload = json.loads(body.decode("utf-8"))
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Error decoding JSON: {e}")
                return None

            # Rows live under payload["Data"]["Data"]; either level may be
            # missing or null, which is treated the same as an empty page.
            outer = payload.get("Data") if isinstance(payload, dict) else None
            rows = outer.get("Data") if isinstance(outer, dict) else None
            if not rows:
                print("No more or invalid data.")
                break  # Exit the loop when no more data is available

            pages.append(pd.json_normalize(rows))
        else:
            # The loop ended without ever seeing an empty page.
            print(f"Stopped after {max_pages} pages; the saved data may be incomplete.")
    finally:
        # Release the socket on every exit path, including exceptions.
        conn.close()

    if not pages:
        print("No data to save.")
        return None

    # Combine all pages, keep Date / Close Price / Open Price, and rename them.
    full_data = (
        pd.concat(pages, ignore_index=True)[["Ngay", "GiaDongCua", "GiaMoCua"]]
        .rename(columns={"Ngay": "date", "GiaDongCua": "close_price", "GiaMoCua": "open_price"})
    )

    # Save the data to a CSV file
    target_dir = output_folder if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)
    output_path = os.path.join(target_dir, f'{symbol}_stock_data.csv')
    full_data.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Data has been saved to {output_path}")

# Download FPT's price history from the start of 2020 through today.
symbol = "FPT"
start_date = "2020-01-01"  # Start date (format: YYYY-MM-DD)
end_date = f"{datetime.now():%Y-%m-%d}"  # Today's date in the same format
fetch_fpt_data(symbol=symbol, start_date=start_date, end_date=end_date)


No more or invalid data.
Data has been saved to /content/drive/MyDrive/stat_project/analysis/trainingset/FPT_stock_data.csv
