<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/equities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Equities

## Magic and Import

In [47]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [48]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import logging
import json

from pathlib import Path

## Setup

In [49]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
import platform
platform.python_version()

'3.7.2'

In [50]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

## Global variables

In [51]:
# Component stocks URL
_URL_COMPONENT_STOCKS = 'https://www.investing.com/indices/ftse-malaysia-klci-components'

In [52]:
_DS_PATH = Path('datasets/equities')
_EQUITIES_JSON_FILE = _DS_PATH/'equities.json'
_EQUITIES_PICKLE_FILE = _DS_PATH/'equities.pickle'

os.makedirs(os.path.dirname(_EQUITIES_JSON_FILE), exist_ok=True)

In [105]:
# Import and install Scrapy
try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
    
import scrapy.crawler as crawler
from scrapy.http import *
from multiprocessing import Process, Queue
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess

## Scrap Component Stocks

### Helper Functions

In [92]:
# the wrapper to make it run more times
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner({'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'})
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

In [93]:
class JsonWriterPipeline(object):
    """JSON output writer"""

    def open_spider(self, spider):
        self.file = open(_EQUITIES_JSON_FILE, 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [114]:
class ComponentStocksSpider(scrapy.Spider):
    name = "component_stocks_spider"
    start_urls = [
        _URL_COMPONENT_STOCKS
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }

    def start_requests(self):
        # self.driver = webdriver.Chrome(self._CHROME_DRIVER)
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        #print(response.url)
        #print(response.body)
        rows = response.xpath('//*[@id="cr1"]/tbody/tr/td')
        print(rows)
        for row in rows:
            cols = row.find_elements_by_xpath(".//td")
            print(cols.get_attribute("innerText"))
        

### Download Component Stocks

In [115]:
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })

# process.crawl(ComponentStocksSpider)
# process.start()

run_spider(ComponentStocksSpider)

[<Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="flag"><span title="Malaysia" '>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="bold left noWrap elp plusIcon'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="pid-41603-last">4.350</td>'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="pid-41603-high">4.460</td>'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="pid-41603-low">4.300</td>'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="bold greenFont pid-41603-pc">'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="bold greenFont pid-41603-pcp"'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class=" pid-41603-turnover">2.23M</t'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class=" pid-41603-time" data-value="'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td class="icon"><span class="redClockIc'>, <Selector xpath='//*[@id="cr1"]/tbody/tr/td' data='<td clas