In [1]:
import re
import pandas as pd
import json
import requests
import html
from collections import defaultdict

In [2]:
# Import previously downloaded HTML file
file=open("MAS.html", 'r', encoding ='utf-8').read()

In [3]:
def textCleaner (text):

    return html.unescape(text).encode('ascii','ignore').decode().strip()

In [4]:
# Empty list to be filled with a list of lists: [recource id, resource name]
resources = []

# Regular Expressions used to detect resource ids, and resource names
patterns = [r"(?:https://secure.mas.gov.sg/api/APIDescPage.aspx\?resource_id=)(.+?)(?:\"\sid.+?\>)(?:\s+?)(.+?)(?:\<)",
            r"(?:https://secure.mas.gov.sg/api/APIDESCPAGE.ASPX\?RESOURCE_ID=)(.+?)(?:\"\sid.+?\>)(?:\s+?)(.+?)(?:\<)"]

# Use both patterns to scrape out the resource ids, and resource names
for pattern in patterns:
    resources+=(re.findall(pattern,file, re.DOTALL))

# Reduce this to unique pairs only; a precautionary step as there are no duplicates at this time
resources = set(resources)

print ('There are', len(resources),'resources:')

# Clean up the text using the textCleaner() function; removes special characters etc
resources = [[textCleaner(resource[0]),textCleaner(resource[1])] for resource in resources]

There are 43 resources:


In [5]:
# Display 3 resources to verify it is a list of lists: [resource id, resource name]
resources[:3]

[['2bc97ad7-4c0c-4013-8020-e95311db212f',
  'Table I.15 Asian Dollar Market: Maturities of Assets and Liabilities of ACUs'],
 ['c6147266-d4a9-48a6-8a11-5a5bc684f6c8',
  'Table II.6 General Insurance Companies: Premiums and Claims'],
 ['b67f7895-aab0-4fa2-8dcc-8587a0a3ea4e',
  'Table I.3C Banks: Liabilities of DBUs']]

In [6]:
def rawColumns(resource = '1c1713de-6b5e-475d-bc1e-b6a45b3e063e'):
    
    url = 'https://eservices.mas.gov.sg/api/action/datastore/search.json?resource_id='+resource+'&limit=1'
    
    result = requests.get(url).json()['result']['records']
            
    if 'preliminary' in result[0].keys():
        
        return True
    
    else:
        
        return False

In [7]:
def Columns(resource = '1c1713de-6b5e-475d-bc1e-b6a45b3e063e'):
    
    rawColumns = []
    finalDict = {}
    
    source = requests.get('https://secure.mas.gov.sg/api/APIDESCPAGE.ASPX?RESOURCE_ID='+resource).text
    
    if resource == '7e181136-d81a-48a8-9350-3f09265db3c7':
    
        source = source.replace('cards_main', 'principal_cardholders')
        source = source.replace('cards_supp', 'supp_cardholders')
        
    if resource == '9a0bf149-308c-4bd2-832d-76c8e6cb47ed':
        
        source = source.replace('rmb_overnight_rate','on_rmb_facility_rate')
        source = source.replace('sora','sor_average',1)

    pattern = r"(?:<td>)(?:\d+?)(?:</td>)(?:<td>)(.+?)(?:</td>)(?:<td>)(.+?)(?:</td>)(?:<td>)(?:.*?)(?:</td>)(?:<td>)(.+?)(?:</td>)(?:<td>)(.+?)(?:</td>)"    
    
    matches = re.findall(pattern, source, re.DOTALL)
    matches = [list(item) for item in matches]
    matches = [[html.unescape(subitem).encode("utf-8", "ignore").decode() for subitem in item] for item in matches]
    
    columnNames = [item[0] for item in matches]
    
    timeVariable = [item for item in columnNames if item in ['end_of_month', 'end_of_year', 'end_of_quarter', 'end_of_day']][0]
    matches = [item for item in matches if item[0] != timeVariable]
    
    return matches, timeVariable

In [8]:
def CSV(resource = ['1c1713de-6b5e-475d-bc1e-b6a45b3e063e']*2):
                
    length = 100
    offset = 0        
    
    rows, timeVariable = Columns(resource[0])
    columns = ['indicator', 'description', 'unit', 'notes']
    
    url = 'https://eservices.mas.gov.sg/api/action/datastore/search.json?resource_id='+resource[0]+'&limit=100&offset='
    
    while length == 100:
        
        call = url+str(offset)
    
        records = requests.get(call).json()['result']['records']
        
        for record in records:
            
            columns.append(record[timeVariable])
            
            for row in rows:
               
                row.append(record[row[0]])
            
        length = len(records)
        offset += 100
    
    return columns, rows

In [9]:
for resource in resources:

    print (resource)
    
    columns, rows = CSV(resource)
    
    df = pd.DataFrame(columns = columns)
    
    for i in range(len(rows)):
    
        df.loc[i] = rows[i]
        
    df.to_csv(resource[1]+'.csv')

['2bc97ad7-4c0c-4013-8020-e95311db212f', 'Table I.15 Asian Dollar Market: Maturities of Assets and Liabilities of ACUs']
['c6147266-d4a9-48a6-8a11-5a5bc684f6c8', 'Table II.6 General Insurance Companies: Premiums and Claims']
['b67f7895-aab0-4fa2-8dcc-8587a0a3ea4e', 'Table I.3C Banks: Liabilities of DBUs']
['15194366-fbda-424f-84c2-603308d6e231', 'Table I.2C Monetary Authority: Assets and Liabilities']
['27e076cd-23c6-4399-8906-f92e8a6327c7', 'Table II.2 Finance Companies: Loans and Advances']
['f9431948-e238-4919-ad46-c1ff952f755c', 'Table I.11 Banks: Combined Assets and Liabilities of DBUs and ACUs']
['3d115d8f-c4aa-4d31-a5c3-c1f4d4ecc4d7', 'Table I.6 Banks: Loan Limits Grantedto Non-bank Customers by Industry']
['3bdfa4ca-297b-45f0-bdf9-25ee0e1db59a', 'Table II.1 Finance Companies: Assets and Liabilities']
['0325c21e-767e-432f-8a77-2af1013792a7', 'Table III.2 Foreign Exchange Market Turnover']
['8e64ee9f-aaec-4313-a6f9-860fd36ea90b', 'Table I.3B Banks: Assets of DBUs']
['7e181136-d81