# CRAWL STOCK API

## Nguồn dữ liệu: DStock, FireAnt, VietStock Finance

### 1. Chuẩn bị dữ liệu

Import các thư viện cần thiết cho việc thu thập dữ liệu.

In [9]:
import requests
import pandas as pd

Lấy thông tin các công ty đã từng được đưa lên sàn chứng khoán Việt Nam

In [3]:
def get_stock_list():
    """Fetch the list of securities from the VNDirect finfo API.

    A single GET request is sent with the query
    ``type:stock,ifc~floor:HOSE,HNX,UPCOM`` and a page size of 9999.

    Returns:
        pandas.DataFrame: one row per security with the columns ``code``,
        ``type``, ``status``, ``companyName``, ``listedDate`` and
        ``delistedDate``. When the API does not answer with HTTP 200 an
        empty DataFrame with the same columns is returned, so callers can
        use DataFrame methods unconditionally.
    """
    VNDIRECT_API = 'https://finfo-api.vndirect.com.vn/v4/stocks'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Content-Type': 'application/json',
    }

    params = {
        "q": "type:stock,ifc~floor:HOSE,HNX,UPCOM",
        "size": "9999"
    }

    field_to_get = ["code", "type", "status", "companyName", "listedDate", "delistedDate"]

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url = VNDIRECT_API, params = params, headers = headers, timeout = 30)

    if response.status_code != 200:
        # Same type and columns as the success path (previously a bare list).
        return pd.DataFrame(columns=field_to_get)

    # reindex keeps the requested column order and tolerates an empty payload
    # or a missing field (the column is then filled with NaN).
    return pd.DataFrame(response.json()["data"]).reindex(columns=field_to_get)

In [4]:
# Download the securities list once and preview its first rows.
stock_data = get_stock_list()
stock_data.head()

Unnamed: 0,code,type,status,companyName,listedDate,delistedDate
0,ENF,IFC,delisted,Quỹ Đầu tư Năng động Eastspring Investments Vi...,2001-01-01,2001-01-01
1,FUCTVGF3,IFC,listed,Quỹ đầu tư tăng trưởng Thiên Việt 3,2021-10-27,
2,VFMVFA,IFC,delisted,Quỹ Đầu tư năng động Việt Nam,2010-08-09,2013-03-11
3,MAFPF1,IFC,delisted,Quỹ đầu tư tăng trưởng Manulife,2007-12-28,2014-08-20
4,VFMVF4,IFC,delisted,Quỹ đầu tư Doanh nghiệp hàng đầu Việt Nam,2008-06-12,2013-11-26


Data clean up

In [5]:
import re

def cleanRoman(text):
    """Delete stand-alone Roman numerals (with an optional trailing dot) from ``text``.

    The numeral must be delimited by word boundaries. Besides the Latin
    letter ``I`` the pattern also accepts a second, visually identical
    character (presumably the Greek capital iota - TODO confirm).
    """
    roman_numeral = re.compile(
        r'\b(?=[MDCLXVIΙ])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})([IΙ]X|[IΙ]V|V?[IΙ]{0,3})\b\.?'
    )
    return roman_numeral.sub('', text)

def cleanText(text):
    """Keep only the ASCII letters A-Z / a-z of ``text``; drop everything else."""
    non_letters = re.compile(r'[^A-Za-z]+')
    return non_letters.sub('', text)

def cleanBullet(text):
    """Remove bullet markers such as ``1.`` or ``a)`` from ``text``.

    Every occurrence of one word character followed by ``.`` or ``)`` is
    deleted together with the whitespace after it - not only a marker at the
    start of the string.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    pattern = r'\w[.)]\s*'
    return re.sub(pattern, '', text)

def removeVietNameAccent(s):
    """Replace accented Vietnamese letters in ``s`` with their plain ASCII letter.

    Upper and lower case are handled separately so the case of every letter
    is preserved; characters outside the listed classes are left untouched.
    """
    # (character class of accented variants, ASCII replacement), applied in order.
    substitutions = (
        (r'[àáạảãâầấậẩẫăằắặẳẵ]', 'a'),
        (r'[ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ]', 'A'),
        (r'[èéẹẻẽêềếệểễ]', 'e'),
        (r'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E'),
        (r'[òóọỏõôồốộổỗơờớợởỡ]', 'o'),
        (r'[ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ]', 'O'),
        (r'[ìíịỉĩ]', 'i'),
        (r'[ÌÍỊỈĨ]', 'I'),
        (r'[ùúụủũưừứựửữ]', 'u'),
        (r'[ƯỪỨỰỬỮÙÚỤỦŨ]', 'U'),
        (r'[ỳýỵỷỹ]', 'y'),
        (r'[ỲÝỴỶỸ]', 'Y'),
        (r'[Đ]', 'D'),
        (r'[đ]', 'd'),
    )
    for accented_class, plain_letter in substitutions:
        s = re.sub(accented_class, plain_letter, s)
    return s

def removeSpace(text):
    """Return ``text`` with every whitespace character removed."""
    whitespace_run = re.compile(r'\s*')
    return whitespace_run.sub('', text)

Lấy các chỉ số báo cáo tài chính

In [6]:
import datetime

def get_balance_sheet(symbol = "VNM", fromYear = 2021, toYear = 2022):
    """Download quarterly financial-report figures for one ticker from FireAnt.

    For every quarter of every year in ``fromYear..toYear`` (both inclusive)
    the four FireAnt statement types (1-4) are requested, and the line items
    listed in ``field_to_get`` are copied into one record. A quarter is
    skipped while the first day of its last month is still in the future.

    Args:
        symbol: Ticker inserted into the FireAnt URL.
        fromYear: First year to fetch (inclusive).
        toYear: Last year to fetch (inclusive).

    Returns:
        list[dict]: one dict per fetched quarter. Keys are the English names
        from ``field_to_get`` followed by ``"year"`` and ``"quarter"``. A line
        item that was found is stored as ``str(value)``; one that was not
        found keeps the integer default ``0``.

    Raises:
        Exception: any error raised while reading a response is re-raised
            after the symbol, period and response have been printed.
            Network errors (including the request timeout) propagate from
            ``requests``.
    """
    import os  # local import: the notebook only imports os in a later cell

    companyBalanceSheet = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }
    # SECURITY: a personal bearer token is committed in this notebook. Supply
    # your own through the FIREANT_BEARER_TOKEN environment variable. The
    # embedded value is kept only as a fallback so existing runs keep working;
    # it should be revoked and removed from the source.
    fireant_bearer_token = os.environ.get("FIREANT_BEARER_TOKEN") or "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6IkdYdExONzViZlZQakdvNERWdjV4QkRITHpnSSIsImtpZCI6IkdYdExONzViZlZQakdvNERWdjV4QkRITHpnSSJ9.eyJpc3MiOiJodHRwczovL2FjY291bnRzLmZpcmVhbnQudm4iLCJhdWQiOiJodHRwczovL2FjY291bnRzLmZpcmVhbnQudm4vcmVzb3VyY2VzIiwiZXhwIjoxOTM5NDc0NDY3LCJuYmYiOjE2Mzk0NzQ0NjcsImNsaWVudF9pZCI6ImZpcmVhbnQudHJhZGVzdGF0aW9uIiwic2NvcGUiOlsib3BlbmlkIiwicHJvZmlsZSIsInJvbGVzIiwiZW1haWwiLCJhY2NvdW50cy1yZWFkIiwiYWNjb3VudHMtd3JpdGUiLCJvcmRlcnMtcmVhZCIsIm9yZGVycy13cml0ZSIsImNvbXBhbmllcy1yZWFkIiwiaW5kaXZpZHVhbHMtcmVhZCIsImZpbmFuY2UtcmVhZCIsInBvc3RzLXdyaXRlIiwicG9zdHMtcmVhZCIsInN5bWJvbHMtcmVhZCIsInVzZXItZGF0YS1yZWFkIiwidXNlci1kYXRhLXdyaXRlIiwidXNlcnMtcmVhZCIsInNlYXJjaCIsImFjYWRlbXktcmVhZCIsImFjYWRlbXktd3JpdGUiLCJibG9nLXJlYWQiLCJpbnZlc3RvcGVkaWEtcmVhZCJdLCJzdWIiOiJkM2UxY2I4MC0xMDc0LTRhMjItYWY4Ny0yNjlhOGM3Mzc2NmMiLCJhdXRoX3RpbWUiOjE2Mzk0NzQ0NjcsImlkcCI6Ikdvb2dsZSIsIm5hbWUiOiJtaW5odHJpLm1pbmh6enh6eEBnbWFpbC5jb20iLCJzZWN1cml0eV9zdGFtcCI6ImIzNDM3MmFkLTgxZjktNGUyYy04NTc4LTBmYWE3NmIxYmMzOSIsInByZWZlcnJlZF91c2VybmFtZSI6Im1pbmh0cmkubWluaHp6eHp4QGdtYWlsLmNvbSIsInVzZXJuYW1lIjoibWluaHRyaS5taW5oenp4enhAZ21haWwuY29tIiwiZnVsbF9uYW1lIjoiTWluaCBUcmkgTmd1eWVuIiwiZW1haWwiOiJtaW5odHJpLm1pbmh6enh6eEBnbWFpbC5jb20iLCJlbWFpbF92ZXJpZmllZCI6InRydWUiLCJqdGkiOiIzY2FjMTQwZGIxMTRkNGMwOWI2MWJjNTA1NmQ0MDg0OCIsImFtciI6WyJleHRlcm5hbCJdfQ.X9deVcDttd06BxdZC7uOBXeObi3qOYqIsWK190UXRBSbVw-03W4KlsQ5PwKyoAc5beog9zYTtZzoE63cnbJ4o14aq4ljsM4bcFEfP2wLl3taVjuKbJOKaFMLiUFyQGiPc5_iE7b-7Z3cVWyEWtDl9xeqg57vVrBLXvcyzquWTFVKgaumR7PA3EwM5UHQWL8f2nx_zwAW06Y-x6soQItu8byN4Brm6VZK6YawUikZqsNehRxHmd_Q52rd4WJ5cTnLUHSlHNoKzEVOobfvOStE2bkoEceBuwgnjEIgqvFsdEX26lvi7ytkkUad9_Mm4LIs_-MxAnsoop3K0IFMzgq-IQ"
    headers.update({'Authorization': f"Bearer {fireant_bearer_token}"})

    FIREANT_API = f"https://restv2.fireant.vn/symbols/{symbol}/full-financial-reports?"

    # Normalised Vietnamese line-item name -> English key of the output record.
    # The insertion order defines the column order of the records.
    field_to_get = {
        'tongcongtaisan': 'totalAssets',
        'taisancodinhhuuhinh': 'tangibleAssets',
        'taisancodinhvohinh': 'intangibleAssets',
        'doanhthuthuan': 'netRevenue',
        'loinhuantruocthue': 'profitBeforeTaxes',
        'loinhuansauthuecuacodongcuacongtyme': 'profitAfterTaxes',
        'tonghangtonkho': 'inventory',
        'nophaitra': 'liabilities',
        'tienvatuongduongtiencuoiky': 'cashAndCashEquivalents',
        'vonchusohuu': 'equity',
        'nonganhan': 'shorttermLiabilities',
        'nodaihan': 'longtermLiabilities',
        'giavonhangban': 'costPrice',
        'khauhaotscd': 'fixedAssetsDepreciation',
        'trongdochiphilaivay': 'lendingCost',
        'vayvanothuetaichinhnganhan': 'shorttermBorrowingsFinancialLeases',
        'vayvanothuetaichinhdaihan': 'longtermBorrowingsFinancialLeases',
    }

    now = datetime.datetime.now()

    for year in range(fromYear, toYear + 1):
        for quarter in range(1, 4 + 1):
            # quarter * 3 is the last month of the quarter (3, 6, 9, 12).
            if datetime.datetime(year, quarter * 3, 1) > now:
                continue

            print(f"Getting data for {symbol} - {year} - {quarter}")

            # Every requested item starts at 0 and is overwritten when found.
            quarterBalanceSheet = dict.fromkeys(field_to_get.values(), 0)
            quarterBalanceSheet["year"] = year
            quarterBalanceSheet["quarter"] = quarter

            # Statement types, as named in the original notes:
            # 1: "candoiketoan"       (balance sheet)
            # 2: "ketquakinhdoanh"    (income statement)
            # 3: "luuchuyentientett"  (cash flow, direct method)
            # 4: "luuchuyentientegt"  (cash flow, indirect method)
            for statementType in range(1, 5):
                params = {
                    "type": statementType,
                    "year": year,
                    "quarter": quarter,
                    "limit": 1,
                }

                # Timeout so one stalled request cannot hang a crawl over many tickers.
                fireant_response = requests.get(
                    FIREANT_API, headers=headers, params=params, timeout=30
                ).json()

                try:
                    if fireant_response is not None:
                        for field_respone in fireant_response:
                            if 'name' not in field_respone:
                                continue

                            # Strip numbering, bullets, accents, punctuation and
                            # spaces so the name can be matched against the keys above.
                            field_name = removeSpace(cleanText(removeVietNameAccent(cleanBullet(cleanRoman(field_respone['name']))))).lower()
                            if field_name not in field_to_get:
                                continue

                            # Read the value only for items we keep, so an
                            # irrelevant row without values cannot abort the crawl.
                            field_value = field_respone['values'][0]['value'] or 0
                            quarterBalanceSheet[field_to_get[field_name]] = str(field_value)

                except Exception:
                    # Show which request broke the parser, then propagate.
                    print(f"{symbol} - {year} - {quarter} - {statementType} - {fireant_response}")
                    raise

            companyBalanceSheet.append(quarterBalanceSheet)
    return companyBalanceSheet

Lấy chỉ số báo cáo tài chính của công ty

In [7]:
# Smoke test with the defaults (symbol "VNM", years 2021-2022): prints the list of quarterly records.
print(get_balance_sheet())

Getting data for VNM - 2021 - 1
Getting data for VNM - 2021 - 2
Getting data for VNM - 2021 - 3
Getting data for VNM - 2021 - 4
Getting data for VNM - 2022 - 1
Getting data for VNM - 2022 - 2
Getting data for VNM - 2022 - 3
Getting data for VNM - 2022 - 4
[{'totalAssets': '51051210880651.0', 'tangibleAssets': '12450177265747.0', 'intangibleAssets': '1131385809125.0', 'netRevenue': '13190270122852.0', 'profitBeforeTaxes': '3153910222805.0', 'profitAfterTaxes': '2575916578653.0', 'inventory': '6465943104329.0', 'liabilities': '17087270259224.0', 'cashAndCashEquivalents': '1197956795582.0', 'equity': '33963940621427.0', 'shorttermLiabilities': '16532816332955.0', 'longtermLiabilities': '554453926269.0', 'costPrice': '7435389686438.0', 'fixedAssetsDepreciation': '561074222639.0', 'lendingCost': '12905990201.0', 'shorttermBorrowingsFinancialLeases': '9247097278828.0', 'longtermBorrowingsFinancialLeases': '162103516784.0', 'year': 2021, 'quarter': 1}, {'totalAssets': '53046669029192.0', 'tan

In [8]:
# Keep only the rows whose status is 'listed' and whose type is 'STOCK'.
is_listed = stock_data['status'].eq('listed')
is_stock = stock_data['type'].eq('STOCK')
mask_active_stock_comapny = is_listed & is_stock
active_company = stock_data.loc[mask_active_stock_comapny]
active_company

Unnamed: 0,code,type,status,companyName,listedDate,delistedDate
13,AMC,STOCK,listed,Công ty Cổ phần Khoáng sản Á Châu,2012-02-15,
14,TTH,STOCK,listed,Công ty Cổ phần Thương mại và Dịch vụ Tiến Thành,2016-10-26,
15,ART,STOCK,listed,Công ty cổ phần Chứng khoán BOS,2018-09-28,
16,DVG,STOCK,listed,CTCP TẬP ĐOÀN SƠN ĐẠI VIỆT,2021-01-14,
17,TKG,STOCK,listed,Công ty cổ phần Sản xuất và Thương mại Tùng Khánh,2022-08-29,
...,...,...,...,...,...,...
1888,MIG,STOCK,listed,Tổng Công ty Cổ phần Bảo Hiểm Quân Đội,2021-01-21,
1889,HNG,STOCK,listed,Công ty Cổ phần Nông nghiệp Quốc tế Hoàng Anh ...,2015-07-20,
1890,ITA,STOCK,listed,Công ty Cổ phần Đầu tư và Công nghiệp Tân Tạo,2006-11-15,
1891,DGC,STOCK,listed,Công ty cổ phần Tập đoàn Hóa chất Đức Giang,2020-07-28,


In [9]:
# Fetch the reports of every active company and write one CSV per ticker
# ("<code>.csv") into the current working directory.
for symbol in active_company['code']:
    company_balance_sheet = get_balance_sheet(symbol)
    pd.DataFrame(company_balance_sheet).to_csv(f"{symbol}.csv", index=False)

Getting data for VCC - 2021 - 1
Getting data for VCC - 2021 - 2
Getting data for VCC - 2021 - 3
Getting data for VCC - 2021 - 4
Getting data for VCC - 2022 - 1
Getting data for VCC - 2022 - 2
Getting data for VCC - 2022 - 3
Getting data for CTB - 2021 - 1
Getting data for CTB - 2021 - 2
Getting data for CTB - 2021 - 3
Getting data for CTB - 2021 - 4
Getting data for CTB - 2022 - 1
Getting data for CTB - 2022 - 2
Getting data for CTB - 2022 - 3
Getting data for TTT - 2021 - 1
Getting data for TTT - 2021 - 2
Getting data for TTT - 2021 - 3
Getting data for TTT - 2021 - 4
Getting data for TTT - 2022 - 1
Getting data for TTT - 2022 - 2
Getting data for TTT - 2022 - 3
Getting data for GDW - 2021 - 1
Getting data for GDW - 2021 - 2
Getting data for GDW - 2021 - 3
Getting data for GDW - 2021 - 4
Getting data for GDW - 2022 - 1
Getting data for GDW - 2022 - 2
Getting data for GDW - 2022 - 3
Getting data for SD9 - 2021 - 1
Getting data for SD9 - 2021 - 2
Getting data for SD9 - 2021 - 3
Getting 

### 2. Khám phá dữ liệu 

#### Đọc dữ liệu từ file csv vào data frame

In [1]:
import glob
import os
from pathlib import Path
import pandas as pd

In [3]:
# Folder holding the per-company CSV files written by the crawl step.
# NOTE: machine-specific absolute path - point it at your own folder.
folder_path = r'C:\Learning Resouces\NM KHDL\NMKHDL_Project\crawl'

file_type = 'csv'
seperator = ','

# Sorted so the row order of the combined frame is reproducible; the order in
# which glob yields matches is not guaranteed.
files = sorted(Path(folder_path).glob(f'*.{file_type}'))
if not files:
    # Clearer than the "No objects to concatenate" error pd.concat would raise.
    raise FileNotFoundError(f"No .{file_type} files found in {folder_path!r}")

dfs = list()
for f in files:
    data = pd.read_csv(f, sep=seperator)
    # .stem is the file name without its extension, i.e. the ticker the CSV was saved under
    data['file'] = f.stem
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

In [4]:
# Display the combined frame; the `file` column is the stem of the CSV each row was read from.
df

Unnamed: 0,totalAssets,tangibleAssets,intangibleAssets,netRevenue,profitBeforeTaxes,profitAfterTaxes,inventory,liabilities,cashAndCashEquivalents,equity,shorttermLiabilities,longtermLiabilities,costPrice,fixedAssetsDepreciation,lendingCost,shorttermBorrowingsFinancialLeases,longtermBorrowingsFinancialLeases,year,quarter,file
0,4.882955e+11,1.195395e+11,9.922500e+07,0.0,0.0,0.0,1.904507e+11,2.460785e+11,0.0,2.422229e+11,2.460785e+11,0.0,0.0,0.0,0.0,5.526196e+08,0.0,2021,1,A32
1,4.882955e+11,1.195395e+11,9.922500e+07,0.0,0.0,0.0,1.904507e+11,2.460785e+11,0.0,2.422229e+11,2.460785e+11,0.0,0.0,0.0,0.0,5.526196e+08,0.0,2021,2,A32
2,4.882955e+11,1.195395e+11,9.922500e+07,0.0,0.0,0.0,1.904507e+11,2.460785e+11,0.0,2.422229e+11,2.460785e+11,0.0,0.0,0.0,0.0,5.526196e+08,0.0,2021,3,A32
3,5.281309e+11,1.139765e+11,2.770250e+08,0.0,0.0,0.0,1.922260e+11,2.897457e+11,0.0,2.380572e+11,2.897457e+11,0.0,0.0,0.0,0.0,0.000000e+00,0.0,2021,4,A32
4,5.281309e+11,1.139765e+11,2.770250e+08,0.0,0.0,0.0,1.922260e+11,2.897457e+11,0.0,2.380572e+11,2.897457e+11,0.0,0.0,0.0,0.0,0.000000e+00,0.0,2022,1,A32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12088,7.808035e+11,4.702121e+10,4.776832e+09,0.0,0.0,0.0,1.218183e+11,7.590569e+11,0.0,2.174660e+10,7.582749e+11,782000000.0,0.0,0.0,0.0,2.195408e+11,782000000.0,2021,4,YTC
12089,7.808035e+11,4.702121e+10,4.776832e+09,0.0,0.0,0.0,1.218183e+11,7.590569e+11,0.0,2.174660e+10,7.582749e+11,782000000.0,0.0,0.0,0.0,2.195408e+11,782000000.0,2022,1,YTC
12090,7.808035e+11,4.702121e+10,4.776832e+09,0.0,0.0,0.0,1.218183e+11,7.590569e+11,0.0,2.174660e+10,7.582749e+11,782000000.0,0.0,0.0,0.0,2.195408e+11,782000000.0,2022,2,YTC
12091,7.808035e+11,4.702121e+10,4.776832e+09,0.0,0.0,0.0,1.218183e+11,7.590569e+11,0.0,2.174660e+10,7.582749e+11,782000000.0,0.0,0.0,0.0,2.195408e+11,782000000.0,2022,3,YTC


#### Dữ liệu gồm có bao nhiêu dòng và cột?

In [5]:
# DataFrame.shape is (number of rows, number of columns).
num_rows, num_cols = df.shape
print('Row: ', num_rows)
print("Cols: ", num_cols)

Row:  12093
Cols:  20


#### Mỗi dòng có ý nghĩa gì?

 - Một dòng cho biết báo cáo tài chính của 1 công ty trong 1 quý

#### Dữ liệu có các dòng bị lặp không?

In [6]:
# True when at least one row is an exact copy of an earlier row.
# Vectorised check instead of a Python loop over every (index, flag) pair;
# bool() keeps the result a plain Python bool.
have_duplicated_rows = bool(df.duplicated().any())

In [7]:
# The analysis below relies on every row being unique.
assert not have_duplicated_rows

- Như vậy không có dòng nào bị lặp

#### Mỗi cột có ý nghĩa gì?

- totalAssets: tổng giá trị tài sản 
- tangibleAssets: tài sản cố định hữu hình
- intangibleAssets: tài sản cố định vô hình
- netRevenue: doanh thu thuần
- profitBeforeTaxes: lợi nhuận trước thuế
- profitAfterTaxes: lợi nhuận sau thuế của cổ đông công ty mẹ
- inventory: tổng hàng tồn kho
- liabilities: nợ phải trả
- cashAndCashEquivalents: tiền mặt và các khoản tương đương tiền (cuối kỳ)
- equity: vốn chủ sở hữu
- shorttermLiabilities: nợ ngắn hạn
- longtermLiabilities: nợ dài hạn
- costPrice: giá vốn hàng bán
- fixedAssetsDepreciation: khấu hao tài sản cố định
- lendingCost: chi phí lãi vay
- shorttermBorrowingsFinancialLeases: vay và nợ thuê tài chính ngắn hạn
- longtermBorrowingsFinancialLeases: vay và nợ thuê tài chính dài hạn
- year: năm tài chính
- quarter: quý
- file: mã code của công ty

#### Mỗi cột hiện đang có kiểu dữ liệu gì?

In [8]:
# Series mapping each column name to its pandas dtype.
col_dtypes = df.dtypes
col_dtypes

totalAssets                           float64
tangibleAssets                        float64
intangibleAssets                      float64
netRevenue                            float64
profitBeforeTaxes                     float64
profitAfterTaxes                      float64
inventory                             float64
liabilities                           float64
cashAndCashEquivalents                float64
equity                                float64
shorttermLiabilities                  float64
longtermLiabilities                   float64
costPrice                             float64
fixedAssetsDepreciation               float64
lendingCost                           float64
shorttermBorrowingsFinancialLeases    float64
longtermBorrowingsFinancialLeases     float64
year                                    int64
quarter                                 int64
file                                   object
dtype: object

#### Với mỗi cột có kiểu dữ liệu dạng numeric, các giá trị được phân bố như thế nào?

In [9]:
# Profile of the numeric columns: share of missing values (in percent),
# minimum and maximum of each column.
nume_col_df = df.drop(columns='file')
missing_ratio = (nume_col_df.isnull().sum() * 100 / len(nume_col_df))
# Named min_vals / max_vals so the builtins min() and max() are not
# overwritten for the rest of the notebook session.
min_vals = nume_col_df.min()
max_vals = nume_col_df.max()
row_name = ['missing_ratio','min','max']
nume_col_profiles_df = pd.DataFrame([missing_ratio, min_vals, max_vals], index=row_name)
nume_col_profiles_df


Unnamed: 0,totalAssets,tangibleAssets,intangibleAssets,netRevenue,profitBeforeTaxes,profitAfterTaxes,inventory,liabilities,cashAndCashEquivalents,equity,shorttermLiabilities,longtermLiabilities,costPrice,fixedAssetsDepreciation,lendingCost,shorttermBorrowingsFinancialLeases,longtermBorrowingsFinancialLeases,year,quarter
missing_ratio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,-50308510.0,-30135410000.0,-6368911000000.0,-5964033000000.0,0.0,-378529000.0,-50374150000.0,-7510581000000.0,-378529000.0,-14068170000.0,-31632240000.0,-1416283000000.0,-714379100000.0,-5000000.0,-500000000.0,2021.0,1.0
max,2048953000000000.0,106782300000000.0,23672800000000.0,84982540000000.0,18948680000000.0,14493840000000.0,129636400000000.0,418345600000000.0,290363800000000.0,164297100000000.0,286327000000000.0,158357700000000.0,82526280000000.0,6325887000000.0,3036936000000.0,62819710000000.0,110949500000000.0,2022.0,4.0


#### Cột có kiểu dữ liệu dạng object, các giá trị được phân bố như thế nào?

In [10]:
# Profile of the only object column, "file" (the file name is the company's name/ticker):
# share of missing values in percent, number of distinct values, and the distinct values.
object_col_df = df.loc[:, ["file"]]
n_rows = len(object_col_df)
missing_ratio = object_col_df.isnull().sum() * 100 / n_rows
num_diff_vals = object_col_df.nunique()

diff_vals = []
for col_name in object_col_df.columns:
    diff_vals.append(object_col_df[col_name].dropna().unique())

r_name = ["missing_ratio", "num_diff_vals", "diff_vals"]
profile_rows = [missing_ratio, num_diff_vals, pd.Series(diff_vals, index=["file"])]

object_col_profiles_df = pd.DataFrame(profile_rows, index=r_name)
object_col_profiles_df


Unnamed: 0,file
missing_ratio,0.0
num_diff_vals,1602
diff_vals,"[A32, AAA, AAM, AAS, AAT, AAV, ABB, ABC, ABI, ..."
