In [93]:
import requests
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep

from helpers import utils

In [73]:
# Constituents listing for the index selected by the ``index=AIM1`` query parameter.
# Further pages are requested by appending ``&page=N`` to this address.
url = (
    "https://www.londonstockexchange.com"
    "/exchange/prices-and-markets/stocks/indices/summary"
    "/summary-indices-constituents.html"
    "?index=AIM1"
)

In [74]:
def fetch_page(content_url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    content_url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    print('Fetching ', content_url)
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole notebook.
    r = requests.get(content_url, timeout=timeout)
    # Fail here on an error response rather than handing an error page to the
    # table parser, where it would surface as an unrelated AttributeError.
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

In [75]:
def parse_first_page():
    """Fetch the page at the module-level ``url`` and parse it.

    Returns
    -------
    tuple
        ``(rows, last_page)`` where ``rows`` is the result of ``parse_table``
        for that page and ``last_page`` is the result of ``get_last_page``
        for the same page.
    """
    first_page = fetch_page(url)
    rows = parse_table(first_page)
    return rows, get_last_page(first_page)

In [76]:
def get_last_page(soup):
    """Read the total number of pages from the paging element.

    Looks up the first ``<p>`` directly inside ``div.paging`` and extracts the
    number that follows a trailing ``of`` in its text.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to inspect.

    Returns
    -------
    int or None
        The page count, or ``None`` when the paging element is missing, has no
        single text string, or its text does not end in ``of <number>``.
    """
    node = soup.select_one('div.paging > p:nth-of-type(1)')
    # select_one returns None when nothing matches, and .string is None when the
    # tag does not hold exactly one text node; both mean "no page count found".
    paging = node.string if node is not None else None
    if not paging:
        return None

    m = re.search(r'of (\d+)$', paging)
    return int(m.group(1)) if m else None

In [77]:
def parse_company(el):
    """Split a company link into its display text and symbol id.

    Parameters
    ----------
    el : bs4 tag
        Element exposing ``get('href')`` and ``string``.

    Returns
    -------
    tuple
        ``(el.string, symbol_id)`` where ``symbol_id`` is the word-character
        run between a ``/`` and ``.html`` in the link target, or ``None`` when
        the link target has no such segment.
    """
    match = re.search(r'\/(\w+)\.html', el.get('href'))
    symbol_id = match.group(1) if match else None
    return (el.string, symbol_id)

In [78]:
def parse_table(soup):
    """Extract one record per body row of the ``table_dati`` table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing a ``<table class="table_dati">`` with a
        ``<tbody>``.

    Returns
    -------
    list of dict
        One dict per ``<tr>`` with the keys ``symbol``, ``company``,
        ``symbolId``, ``currency`` and ``price``. ``price`` is still a string,
        with thousands separators (commas) removed.

    Notes
    -----
    No guard is applied for a missing table/tbody or for rows with fewer than
    four cells; such pages raise from the lookups below.
    """
    data = []
    # find_all is the current name; findAll is a deprecated legacy alias.
    rows = soup.find("table", class_="table_dati").find("tbody").find_all("tr")
    print('Reading Table')
    for tr in rows:
        cols = tr.find_all('td')

        # Cell order: 0 = symbol, 1 = company link, 2 = currency, 3 = price.
        company, symbolId = parse_company(cols[1].find('a'))
        item = {
            'symbol': cols[0].string,
            'company': company,
            'symbolId': symbolId,
            'currency': cols[2].string,
            'price': cols[3].string.replace(',','')
        }
        data.append(item)
    return data

In [88]:
def scrape_content():
    """Scrape every page of the listing and return all rows as one list.

    The first page is fetched through ``parse_first_page``, which also reports
    the number of pages. Pages 2..last are then requested by appending
    ``&page=N`` to the module-level ``url``, pausing between requests.

    Returns
    -------
    list of dict
        Records from ``parse_table`` for all pages, in page order.
    """
    content, last_page = parse_first_page()
    # get_last_page yields None when the paging text cannot be read; treat that
    # as a single-page listing instead of failing in range() with a TypeError.
    if not last_page:
        return content

    sleep(3)
    for page in range(2, last_page + 1):
        content_url = url + "&page={}".format(page)
        content.extend(parse_table(fetch_page(content_url)))
        # Pause between requests to avoid hammering the site.
        sleep(4)
    return content

# Run the full scrape. This performs one HTTP request per page and sleeps
# between requests, so the cell takes a while to finish.
content = scrape_content()
# Show the first record as a quick sanity check of the parsed fields.
content[0]

{'symbol': 'ABC',
 'company': 'ABCAM',
 'symbolId': 'GB00B6774699GBGBXAMSM',
 'currency': 'GBX',
 'price': '1456.00'}

In [91]:
# Build the frame from the scraped records; prices arrive as strings, so
# convert that column to a numeric dtype before inspecting the result.
df = pd.DataFrame(content).assign(
    price=lambda frame: pd.to_numeric(frame["price"])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 5 columns):
company     101 non-null object
currency    101 non-null object
price       101 non-null float64
symbol      101 non-null object
symbolId    101 non-null object
dtypes: float64(1), object(4)
memory usage: 4.0+ KB


In [94]:
# Persist the scraped constituents as CSV, omitting the DataFrame index.
# The file name carries a hard-coded snapshot date (20180808).
# NOTE(review): utils.get_raw_file comes from helpers.utils, which is not shown
# here; presumably it resolves the name to a path in the raw-data folder — confirm.
df.to_csv(utils.get_raw_file('aim100-components-20180808.csv'), index=False)