Scrape Wall Street Journal futures quote pages and fetch the quote, volume and open interest for a given list of futures contracts

In [1]:
import pandas as pd
import numpy as np
import requests
import bs4
import datetime
import pytz

In [7]:
def single_contract(url, timeout=30):
    """Scrape one WSJ futures quote page into a single-row DataFrame.

    Parameters
    ----------
    url : str
        Quote page URL. Its last path segment is used as the contract symbol.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row. Columns are 'FX Contract' and 'Quote', followed by one column
        per label/value pair found in the page's data list. All scraped values
        are returned as (whitespace-stripped) strings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the quote or the data list cannot be found in the returned HTML.
    """
    print(f'Fetching data... {url}')

    # rstrip so that a trailing slash does not produce an empty symbol
    symbol = url.rstrip('/').split('/')[-1]

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here, with the real cause, instead of later on a missing HTML element
    r.raise_for_status()
    # Name the parser explicitly: otherwise bs4 warns and picks whichever
    # parser happens to be installed, which can differ between machines
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Close value. Quote
    quote_raw = soup.find('ul', class_='c_crinfo_main')
    quote_item = quote_raw.find('li', class_='crinfo_quote') if quote_raw is not None else None
    quote_tag = quote_item.select_one("span[id=quote_val]") if quote_item is not None else None
    if quote_tag is None:
        raise ValueError(f'Quote value not found on page: {url}')
    quote = quote_tag.get_text(strip=True)

    # Volume and Open Interest
    table_raw = soup.find('ul', class_='cr_data_collection cr_charts_info')
    if table_raw is None:
        raise ValueError(f'Data list not found on page: {url}')

    vals, lbls = [symbol, quote], ['FX Contract', 'Quote']

    for row in table_raw.find_all('li'):
        lbl_tag = row.find('span', class_='data_lbl')
        val_tag = row.find('span', class_='data_data')
        # Skip list items that do not carry a label/value pair
        if lbl_tag is None or val_tag is None:
            continue

        lbls.append(lbl_tag.get_text(strip=True))
        vals.append(val_tag.get_text(strip=True))

    # One row, one column per scraped label
    return pd.DataFrame([vals], columns=lbls)

In [3]:
def scraper(urls):
    """Scrape every URL in ``urls`` and return one typed, timestamped DataFrame.

    Parameters
    ----------
    urls : iterable of str
        Quote page URLs; each one is passed to ``single_contract``.

    Returns
    -------
    pandas.DataFrame
        One row per URL with a fresh 0..n-1 index. 'Quote' is converted to
        float, 'Volume' and 'Open Interest' to int64 (thousands separators
        removed). Three string columns, 'UTC', 'Local' and 'NY', hold the same
        instant formatted as "%Y-%m-%d %H:%M:%S" in the respective time zone.

    Raises
    ------
    ValueError
        If ``urls`` is empty, or if a numeric column cannot be converted.
    """
    urls = list(urls)
    if not urls:
        raise ValueError('scraper() needs at least one URL, but urls is empty')

    # Fetching data for all contracts, concatenating into one single df
    dfs = [single_contract(url) for url in urls]
    full = pd.concat(dfs, ignore_index=True)

    def _numeric(column, dtype):
        # Drop thousands separators before casting; plain string replace, no regex
        return full[column].astype(str).str.replace(',', '', regex=False).astype(dtype)

    # Fixing data type (int64 spelled out so the width is the same on every platform)
    full['Quote'] = _numeric('Quote', 'float')
    full['Volume'] = _numeric('Volume', 'int64')
    full['Open Interest'] = _numeric('Open Interest', 'int64')

    # Adding Time: read the clock once and convert, so the three columns always
    # describe the same instant even when the call straddles a second boundary
    fmt = "%Y-%m-%d %H:%M:%S"
    now_utc = datetime.datetime.now(datetime.timezone.utc)

    full['UTC'] = now_utc.strftime(fmt)
    full['Local'] = now_utc.astimezone().strftime(fmt)  # no argument -> system local zone
    full['NY'] = now_utc.astimezone(pytz.timezone('US/Eastern')).strftime(fmt)

    return full

In [4]:
def save_to_csv(df):
    """Write ``df`` to a semicolon-separated CSV named after the current UTC time.

    The file is created relative to the current working directory as
    ``futures_wsj_<YYYYMMDD>T<HHMMSS>Z.csv``, without the index column, and
    the file name is printed once the write has finished.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.

    Returns
    -------
    None
    """
    stamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = f'futures_wsj_{stamp}.csv'

    df.to_csv(target, sep=';', index=False)
    print(f'File saved: {target}')

In [5]:
# WSJ symbols of the contracts to scrape; each one is the last path segment
# of its quote page URL
contract_symbols = ['CL00', 'GC00', 'DX00', 'EC00', 'BP00', 'JY00', 'CD00', 'AD00', 'SFC00']
urls = [f'https://www.wsj.com/market-data/quotes/futures/{symbol}'
        for symbol in contract_symbols]

# Scrape every contract, then persist the combined table
df = scraper(urls)
save_to_csv(df)

File saved: futures_wsj_20230921T133102Z.csv


In [6]:
# Show the scraped table as this cell's output
df

Unnamed: 0,FX Contract,Quote,Volume,Open Interest,1 Day Range,52 Week Range,UTC,Local,NY
0,CL00,90.58,136314,345308,88.37 - 90.65,63.57 - 93.74,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
1,GC00,1934.5,128863,375019,1933.10 - 1952.20,1621.10 - 2085.40,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
2,DX00,105.27,9433,39639,105.090 - 105.440,99.220 - 114.745,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
3,EC00,1.0681,138806,673868,1.0656 - 1.0704,0.9592 - 1.1311,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
4,BP00,1.2273,115047,186910,1.2233 - 1.2347,1.0378 - 1.3146,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
5,JY00,0.6852,119045,262828,0.6828 - 0.6865,0.6623 - 0.7906,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
6,CD00,0.7415,47076,171758,0.7394 - 0.7439,0.7156 - 0.7644,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
7,AD00,0.6421,89294,220202,0.6403 - 0.6472,0.6181 - 0.7168,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
8,SFC00,1.1145,27429,45857,1.1116 - 1.1231,0.9901 - 1.1765,2023-09-21 13:31:02,2023-09-21 17:31:02,2023-09-21 09:31:02
