<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/equities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Equities

## Magics and Imports

In [1]:
# (Re)load the autoreload extension; mode 2 re-imports edited modules
# automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
# import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import logging
import json
import csv

from pathlib import Path

## Setup

In [3]:
# Settings for notebook
# "all" makes IPython display the value of every top-level expression in a
# cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
import platform
platform.python_version()

'3.8.3'

In [4]:
# Seed NumPy's global random generator so random output is stable across runs
np.random.seed(42)

# Default font sizes for axis labels and tick labels
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings("ignore", message="^internal gelsd")

# Pandas display: show up to 50 columns and enable table-schema output
from IPython.display import display
pd.set_option('display.max_columns', 50)
pd.set_option('display.html.table_schema', True)

## Global variables

In [5]:
# Component stocks URL
# Investing.com page listing the FTSE Malaysia KLCI index components; used as
# the start URL of the component stocks spider.
_URL_COMPONENT_STOCKS = 'https://www.investing.com/indices/ftse-malaysia-klci-components'

In [6]:
# Dataset directory and the output files written by the item pipelines
_DS_PATH = Path('datasets') / 'equities'
_COMPONENT_STOCKS_JSON_FILE = _DS_PATH.joinpath('equities.json')
_COMPONENT_STOCKS_CSV_FILE = _DS_PATH.joinpath('equities.csv')

# Make sure the dataset directory exists before anything is written to it
_DS_PATH.mkdir(parents=True, exist_ok=True)

In [7]:
# Import and install Scrapy
try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
    
import scrapy.crawler as crawler
from scrapy.http import *
from scrapy.selector import Selector
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue
from twisted.internet import reactor

## Scrape Component Stocks

### Helper Functions

In [8]:
class JsonWriterPipeline(object):
    """Item pipeline that appends each scraped item to the component stocks
    JSON file, one JSON document per line."""

    def open_spider(self, spider):
        # Append mode: lines written by earlier runs are kept
        self.file = open(_COMPONENT_STOCKS_JSON_FILE, mode='a', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        serialized = json.dumps(dict(item))
        self.file.write("{}\n".format(serialized))
        # Hand the item back unchanged so later pipelines can process it too
        return item
    
class CsvWriterPipeline(object):
    """Item pipeline that appends scraped items to the component stocks CSV file.

    Each item becomes one ``name,link`` row. The header row is written only
    when the file is still empty, so repeated runs appending to the same file
    do not produce duplicate headers.
    """

    # Column order of the CSV file; also the item keys that are exported
    _FIELDS = ['name', 'link']

    def open_spider(self, spider):
        """Open the CSV file for appending and write the header if needed."""
        # newline='' leaves line-ending handling to the csv module
        self.file = open(_COMPONENT_STOCKS_CSV_FILE, 'a', encoding='utf-8', newline='')
        # The writer has to wrap the file object, not the pipeline instance
        self.writer = csv.writer(self.file)
        # A file opened for appending is positioned at its end, so position 0
        # means the file is empty and still needs its header row
        if self.file.tell() == 0:
            self.writer.writerow(self._FIELDS)

    def close_spider(self, spider):
        """Close the CSV file."""
        self.file.close()

    def process_item(self, item, spider):
        """Write the item's fields as one CSV row and return the item unchanged."""
        record = dict(item)
        # Missing keys are written as empty cells so columns stay aligned
        self.writer.writerow([record.get(field, '') for field in self._FIELDS])
        return item
    

In [9]:
class ComponentStocksSpider(scrapy.Spider):
    """Spider that scrapes the table of index component stocks.

    For every row of the components table it prints the extracted stock name
    and yields an item with ``name`` and ``link`` keys, so that an enabled
    item pipeline actually receives the scraped data.
    """
    name = "component_stocks_spider"
    start_urls = [
        _URL_COMPONENT_STOCKS
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # 'ITEM_PIPELINES': {'__main__.CsvWriterPipeline': 1}, # Used for pipeline 1
    }

    def start_requests(self):
        """Request every start URL and route the response to ``parse``."""
        # self.driver = webdriver.Chrome(self._CHROME_DRIVER)
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print each component stock name and yield it with its link.

        Yields:
            dict: ``{'name': <link text>, 'link': <href or ''>}`` for every
            table row that contains a stock name.
        """
        selector = Selector(response)
        table = selector.xpath('//*[@id="cr1"]')
        rows = table.xpath(".//tbody/tr")
        for row in rows:
            names = row.xpath('.//td[2]/a/text()').extract()
            links = row.xpath('.//td[2]/a/@href').extract()
            print(names)
            # Rows without a name link carry no stock and produce no item
            if names:
                yield {'name': names[0], 'link': links[0] if links else ''}
        

### Download Component Stocks

In [10]:
# Crawler-wide settings: send a desktop browser user agent with each request
crawl_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(settings=crawl_settings)

# Schedule the spider, then run the crawl; start() blocks until it finishes
process.crawl(ComponentStocksSpider)
process.start()

2020-10-21 13:49:54 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-10-21 13:49:54 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.3 (default, May 19 2020, 13:54:14) - [Clang 10.0.0 ], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.9.2, Platform macOS-10.15.7-x86_64-i386-64bit
2020-10-21 13:49:54 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-10-21 13:49:54 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x7fb791562bb0>

['AMMB']
['Axiata']
['CIMB Group']
['Dialog']
['DiGi.Com']
['Genting']
['Genting Malaysia']
['Hap Seng Consolidated']
['Hartalega']
['Hong Leong Bank']
['Hong Leong Financial']
['IHH Healthcare']
['IOI Corp']
['Kuala Lumpur Kepong']
['Malayan Banking']
['Malaysia Airport']
['Maxis']
['MISC']
['Nestle']
['Petronas Chemicals']
['Petronas Dagangan']
['Petronas Gas']
['PPB']
['Press Metal Bhd']
['Public Bank']
['RHB Bank']
['Sime Darby']
['Sime Darby Plantation']
['Tenaga Nasional']
['Top Glove Corp']
