In [5]:
import numpy as np
import pandas as pd
from pandas import DataFrame

In [6]:
from lxml.html import parse
from urllib.request import urlopen

In [4]:
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))

In [5]:
doc = parsed.getroot()

In [6]:
links = doc.findall('.//a')

In [13]:
links[15:20]

[<Element a at 0x7f1e854d2368>,
 <Element a at 0x7f1e854d23b8>,
 <Element a at 0x7f1e854d2408>,
 <Element a at 0x7f1e854d2458>,
 <Element a at 0x7f1e854d24a8>]

In [20]:
lnk = links[0]
lnk

<Element a at 0x7f1e854b8b88>

In [21]:
lnk.get('href')

'https://finance.yahoo.com/'

In [22]:
lnk.text_content()

'Yahoo'

In [23]:
urls = [lnk.get('href') for lnk in doc.findall('.//a')]

In [30]:
urls[180:186]

['http://info.yahoo.com/relevantads/',
 'http://info.yahoo.com/legal/us/yahoo/utos/utos-173.html',
 'https://finance.yahoo.com/sitemap/',
 'http://twitter.com/YahooFinance',
 'http://facebook.com/yahoofinance',
 'http://yahoofinance.tumblr.com']

In [31]:
tables = doc.findall('.//table')

In [34]:
calls = tables[1]

In [35]:
puts = tables[0]

In [36]:
rows = calls.findall('.//tr')

In [38]:
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

In [39]:
_unpack(rows[0], kind='th')

['Contract Name',
 'Last Trade Date',
 'Strike',
 'Last Price',
 'Bid',
 'Ask',
 'Change',
 '% Change',
 'Volume',
 'Open Interest',
 'Implied Volatility']

In [40]:
_unpack(rows[1], kind='td')

['AAPL191025P00150000',
 '2019-10-16 3:30PM EDT',
 '150.00',
 '0.01',
 '0.00',
 '0.01',
 '0.00',
 '-',
 '1',
 '85',
 '103.13%']

In [41]:
from pandas.io.parsers import TextParser

In [42]:
def parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()

In [43]:
call_data = parse_options_data(calls)

In [44]:
put_data = parse_options_data(puts)

In [45]:
call_data[:10]

Unnamed: 0,Contract Name,Last Trade Date,Strike,Last Price,Bid,Ask,Change,% Change,Volume,Open Interest,Implied Volatility
0,AAPL191025P00150000,2019-10-16 3:30PM EDT,150.0,0.01,0.0,0.01,0.0,-,1,85,103.13%
1,AAPL191025P00155000,2019-10-09 3:28PM EDT,155.0,0.01,0.0,0.03,0.0,-,12,229,104.69%
2,AAPL191025P00160000,2019-10-10 11:44AM EDT,160.0,0.01,0.0,0.01,-0.03,-75.00%,100,1181,87.50%
3,AAPL191025P00165000,2019-10-17 1:38PM EDT,165.0,0.01,0.0,0.01,0.0,-,24,999,81.25%
4,AAPL191025P00170000,2019-10-17 1:38PM EDT,170.0,0.04,0.0,0.09,0.0,-,53,146,93.75%
5,AAPL191025P00175000,2019-10-18 2:40PM EDT,175.0,0.01,0.0,0.01,-0.01,-50.00%,1,638,68.75%
6,AAPL191025P00180000,2019-10-18 3:37PM EDT,180.0,0.01,0.01,0.05,0.0,-,10,759,75.39%
7,AAPL191025P00182500,2019-10-18 3:36PM EDT,182.5,0.01,0.01,0.01,0.0,-,1,527,64.06%
8,AAPL191025P00185000,2019-10-18 3:59PM EDT,185.0,0.01,0.0,0.01,0.0,-,20,811,57.81%
9,AAPL191025P00187500,2019-10-18 3:59PM EDT,187.5,0.01,0.03,0.03,-0.01,-50.00%,4,727,64.84%


<h3>Parsing XML with lxml.objectify</h3>

In [2]:
from lxml import objectify

In [16]:
path = 'ch06/Performance_MNR.xml'

'ch06/Performance_MNR.xml'

In [17]:
parsed = objectify.parse(open(path))

<lxml.etree._ElementTree at 0x7fc3c3832cc8>

In [18]:
root = parsed.getroot()

<Element INDICATOR at 0x7fc3c31b4948>

In [20]:
data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [21]:
perf = pd.DataFrame(data)

In [22]:
perf

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,MEtro-North Railroad,Service Indicators,Percent of the time that escalators are operat...,M,Escalator Availability,%,,97.0,12,2011,,97.0


In [24]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [25]:
root

<Element a at 0x7fc3c2c9cac8>

In [26]:
root.get('href')

'http://www.google.com'

In [27]:
root.text

'Google'

<h2>Binary Data Formats</h2>

In [10]:
frame = pd.read_csv('ch06/ex1.csv')

In [30]:
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [34]:
frame.to_csv('ch06/frame_pickle')

In [36]:
pd.read_csv('ch06/frame_pickle')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


<h3>Using HDF5 Format</h3>

In [8]:
store = pd.HDFStore('mydata.h5')

In [11]:
store['obj1'] = frame

In [12]:
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [13]:
store['obj1']

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


<h3>Interacting with HTML and Web APIs</h3>

In [14]:
import requests

In [16]:
url = 'https://twitter.com/search?q=Python%20Pandas&src=typed_query'

In [17]:
resp = requests.get(url)

In [18]:
resp

<Response [200]>

In [19]:
import json

In [20]:
data = json.loads(resp.text)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

<h2>Interacting with Databases</h2>

In [21]:
import sqlite3

In [22]:
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""

In [23]:
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

In [24]:
data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [25]:
cursor = con.execute('select * from test')

In [26]:
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [27]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [29]:
DataFrame(rows, columns=list(zip(*cursor.description))[0])

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [30]:
import pandas.io.sql as sql

In [33]:
sql.read_sql('select * from test', con)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


<h3>Storing and Loading Data in MongoDB</h3>

In [35]:
import pymongo
con = pymongo.MongoClient('localhost', port=27017)

In [36]:
tweets = con.db.tweets

In [37]:
import requests, json
url = 'https://twitter.com/search?q=Python%20Pandas&src=typed_query'
data = json.loads(requests.get(url).text)

for tweet in data['results']:
    tweets.save(tweet)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [38]:
cursor = tweets.find({'from_user': 'wesmckinn'})

In [39]:
tweet_fields = ['created_at', 'from_user', 'id', 'text']
result = DataFrame(list(cursor), columns=tweet_fields)