<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/equities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Equities

## Magic and Import

In [47]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [48]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import logging
import json

from pathlib import Path

## Setup

In [49]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
import platform
platform.python_version()

'3.7.2'

In [50]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

## Global variables

In [51]:
# Component stocks URL
_URL_COMPONENT_STOCKS = 'https://www.investing.com/indices/ftse-malaysia-klci-components'

In [199]:
_DS_PATH = Path('datasets/equities')
_COMPONENT_STOCKS_JSON_FILE = _DS_PATH/'equities.json'
_COMPONENT_STOCKS_CSV_FILE = _DS_PATH/'equities.csv'

os.makedirs(os.path.dirname(_COMPONENT_STOCKS_JSON_FILE), exist_ok=True)

In [124]:
# Import and install Scrapy
try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
    
import scrapy.crawler as crawler
from scrapy.http import *
from scrapy.selector import Selector
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue
from twisted.internet import reactor

## Scrap Component Stocks

### Helper Functions

In [92]:
# the wrapper to make it run more times
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner({'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'})
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

In [204]:
class JsonWriterPipeline(object):
    """JSON output writer"""

    def open_spider(self, spider):
        self.file = open(_COMPONENT_STOCKS_JSON_FILE, 'a', encoding='utf-8') 

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
    
class CsvWriterPipeline(object):
    """CSV output writer"""
    
    def open_spider(self, spider):
        self.file = open(_COMPONENT_STOCKS_CSV_FILE, 'a', encoding='utf-8') 
        writer = csv.writer(self)
        writer.writerow(['name', 'link'])

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
    

In [206]:
class ComponentStocksSpider(scrapy.Spider):
    name = "component_stocks_spider"
    start_urls = [
        _URL_COMPONENT_STOCKS
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CsvWriterPipeline': 1}, # Used for pipeline 1
    }

    def start_requests(self):
        # self.driver = webdriver.Chrome(self._CHROME_DRIVER)
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath('//*[@id="cr1"]')
        rows = table.xpath(".//tbody/tr")
        for row in rows:
            print(row.xpath('.//td[2]/a/text()').extract())
            print(row.xpath('.//td[2]/a/@href').extract())
        

### Download Component Stocks

In [197]:
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })

# process.crawl(ComponentStocksSpider)
# process.start()

run_spider(ComponentStocksSpider)

['AMMB']
['/equities/ammb-holdings-bhd']
['Axiata']
['/equities/axiata-group-bhd']
['Bumiputra Commerce']
['/equities/bumiputra---commerce-holdings-bhd']
['Dialog']
['/equities/dialog-group-bhd']
['DiGi.Com']
['/equities/digi.com-bhd']
['Genting']
['/equities/genting-bhd']
['Genting Malaysia']
['/equities/genting-malaysia-bhd']
['Hap Seng Consolidated']
['/equities/hap-seng-consolidated-bhd']
['Hartalega']
['/equities/hartalega-holdings-bhd']
['Hong Leong Bank']
['/equities/hong-leong-bank-bhd']
['Hong Leong Financial']
['/equities/hong-leong-financial-group-bhd']
['IHH Healthcare']
['/equities/ihh-healthcare-bhd']
['IOI Corp']
['/equities/ioi-corporation-bhd']
['Kuala Lumpur Kepong']
['/equities/kuala-lumpur-kepong-bhd']
['Malayan Banking']
['/equities/malayan-banking-bhd']
['Malaysia Airport']
['/equities/malaysia-airport-holdings-bhd']
['Maxis']
['/equities/maxis-bhd']
['MISC']
['/equities/misc-bhd']
['Nestle']
['/equities/nestle-(malaysia)-bhd']
['Petronas Chemicals']
['/equities/p