# CIS600 - Social Media Data Mining 
###  
<img src="https://www.syracuse.edu/wp-content/themes/g6-carbon/img/syracuse-university-seal.svg?ver=6.3.9" style="width: 200px;"/>

# Web Scraping Tools

###  October 9, 2018

**Twitter, cont'd**
Let's complete our look at the Twitter API.

In [1]:
import twitter
from twitter import *

In [2]:
import json
with open('auth_dict','r') as f:
    twtr_auth = json.load(f) # loading from my personal file
    
# To make it more readable, lets store
# the OAuth credentials in strings first.
CONSUMER_KEY = twtr_auth['consumer_key']
CONSUMER_SECRET = twtr_auth['consumer_secret']
OAUTH_TOKEN = twtr_auth['token']
OAUTH_TOKEN_SECRET = twtr_auth['token_secret']
    
# Then, we store the OAuth object in "auth"
auth = OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

**Twitter Functions**

We can automate the API login.

In [3]:
def oauth_login():
    CONSUMER_KEY = twtr_auth['consumer_key']
    CONSUMER_SECRET = twtr_auth['consumer_secret']
    OAUTH_TOKEN = twtr_auth['token']
    OAUTH_TOKEN_SECRET = twtr_auth['token_secret']
    auth = OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
    CONSUMER_KEY, CONSUMER_SECRET)
    twitter_api = Twitter(auth=auth)
    return twitter_api

Next, we can wrap the lines that give us trends in a function.

In [3]:
def twitter_trends(twitter_api, woe_id):
    # Prefix ID with the underscore for query string parameterization.
    # Without the underscore, the twitter package appends the ID value
    # to the URL itself as a special-case keyword argument.
    return twitter_api.trends.place(_id=woe_id)

Likewise, we define a function for the looped twitter search.

In [4]:
def twitter_search(twitter_api, q, max_results=200, **kw):
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)
    statuses = search_results['statuses']
    # Enforce a reasonable limit
    max_results = min(1000, max_results)
    for _ in range(10): # 10*100 = 1000
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError as e: # No more results when next_results doesn't exist
            break
        # Create a dictionary from next_results, which has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict([ kv.split('=') 
                       for kv in next_results[1:].split("&") ])
        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']
        if len(statuses) > max_results:
            break
    
    return statuses

You will likely encounter errors in mining Twitter data. Here is a function to automate the handling of certain errors. See *Mining the Social Web* for more details.

In [6]:
import sys
import time
from twitter.api import TwitterHTTPError
from urllib.error import URLError
from http.client import BadStatusLine

def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
    # A nested helper function that handles common HTTPErrors. Return an updated
    # value for wait_period if the problem is a 500 level error. Block until the
    # rate limit is reset if it's a rate limiting issue (429 error). Returns None
    # for 401 and 404 errors, which requires special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):
        if wait_period > 3600: # Seconds
            print('Too many retries. Quitting.', file=sys.stderr)
            raise e
        if e.e.code == 401:
            return None
        elif e.e.code == 404:
            print('Encountered 404 Error (Not Found)', file=sys.stderr)
            return None
        elif e.e.code == 429:
            print('Encountered 429 Error (Rate Limit Exceeded)', file=sys.stderr)
            if sleep_when_rate_limited:
                print("Retrying in 15 minutes...ZzZ...", file=sys.stderr)
                sys.stderr.flush()
                time.sleep(60*15 + 5) #Handling API rate limit issues.
                print('...ZzZ...Awake now and trying again.', file=sys.stderr)
                return 2
            else:
                raise e # Caller must handle the rate limiting issue
        elif e.e.code in (500, 502, 503, 504):
            print('Encountered %i Error. Retrying in %i seconds' % (e.e.code, wait_period), file=sys.stderr)
            time.sleep(wait_period)
            wait_period *= 1.5
            return wait_period
        else:
            raise e

    # End of nested helper function

    wait_period = 2
    error_count = 0
    while True:
        try:
            return twitter_api_func(*args, **kw)
        except TwitterHTTPError as e:
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except URLError as e:
            error_count += 1
            print("URLError encountered. Continuing.", file=sys.stderr)
            if error_count > max_errors:
                print("Too many consecutive errors...bailing out.", file=sys.stderr)
                raise
        except BadStatusLine as e:
            error_count += 1
            print >> sys.stderr, "BadStatusLine encountered. Continuing."
            if error_count > max_errors:
                print("Too many consecutive errors...bailing out.", file=sys.stderr)
                raise

In [4]:
t = oauth_login()

In [7]:
response = make_twitter_request(t.users.lookup, screen_name="SocialWebMining")

In [8]:
response

[{'id': 132373965,
  'id_str': '132373965',
  'name': 'MiningTheSocialWeb',
  'screen_name': 'SocialWebMining',
  'location': '',
  'description': 'Get the source code at GitHub: http://t.co/U0VmWrXpB9',
  'url': 'http://t.co/CJfJDyM6ki',
  'entities': {'url': {'urls': [{'url': 'http://t.co/CJfJDyM6ki',
      'expanded_url': 'http://miningthesocialweb.com',
      'display_url': 'miningthesocialweb.com',
      'indices': [0, 22]}]},
   'description': {'urls': [{'url': 'http://t.co/U0VmWrXpB9',
      'expanded_url': 'http://bit.ly/MiningTheSocialWeb2E',
      'display_url': 'bit.ly/MiningTheSocia…',
      'indices': [31, 53]}]}},
  'protected': False,
  'followers_count': 4339,
  'friends_count': 0,
  'listed_count': 219,
  'created_at': 'Tue Apr 13 02:10:40 +0000 2010',
  'favourites_count': 35,
  'utc_offset': None,
  'time_zone': None,
  'geo_enabled': False,
  'verified': False,
  'statuses_count': 770,
  'lang': 'en',
  'status': {'created_at': 'Mon Aug 17 14:39:50 +0000 2015',
   '

We will want to write responses to disk, on the fly, so that we can collect many observations for later analysis. In *Mining the Social Web*, the Mongo DB database program is recommended. Here is another way (that also can be adapted so that it writes to a database).

First, let's wrap the extraction of tweet "[entities](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object)" in a function which takes a list of statuses as input.

In [9]:
def extract_tweet_entities(statuses):
    if len(statuses) == 0:
        return [], [], [], [], []
    screen_names = [ user_mention['screen_name'] 
                    for status in statuses
                        for user_mention in status['entities']['user_mentions'] ]
    hashtags = [ hashtag['text']
                    for status in statuses
                        for hashtag in status['entities']['hashtags'] ]
    urls = [ url['expanded_url']
                    for status in statuses
                        for url in status['entities']['urls'] ]
    symbols = [ symbol['text']
                    for status in statuses
                        for symbol in status['entities']['symbols'] ]
    if status['entities'].has_key('media'):
        media = [ media['url']
                for status in statuses
                    for media in status['entities']['media'] ]
    else:
        media = []
    return screen_names, hashtags, urls, media, symbols

In [19]:
def extract_tweet_basics(status):
    screen_name = None
    tweet_ID = None
    text = None
    if 'user' in status:
        screen_name = status['user']['screen_name'] 
        tweet_ID = str(status['id'])
        text = status['text']
    return screen_name, tweet_ID, text

Finally, we stick this in the streaming loop from last time.

In [20]:
def tweet_to_csv(file, status):
    screen_name, tweet_ID, text = extract_tweet_basics(status)
    df = pd.DataFrame([[screen_name,tweet_ID,text]], columns=['screen_name','tweet_ID','text'])
    with open(file, 'a') as f:
        df.to_csv(f,header=False, index=False)

In [21]:
# Create a *streaming* connection (not RESTful, different from Search).
t_stream = TwitterStream(auth=auth)


# Get an *iterator* object from the twitter wrapper

tweeterator = t_stream.statuses.sample()

# Create a CSV file with column names
# but no data (yet).
import pandas as pd
df = pd.DataFrame(columns=['screen_name','tweet_ID','text'])
df.to_csv('my_csv.csv', index=False)


# The loop below will grab a new tweet,
# extract some basic info, put that info
# in a dataframe object, then use that
# dataframe object to append one row to
# the existing CSV file, 'my_csv.csv'.

tweet_count = 100
for tweet in tweeterator:
    tweet_count -= 1
    tweet_to_csv('my_csv.csv', tweet)  
    if tweet_count <= 0:
        break 

In [22]:
df = pd.read_csv('my_csv.csv')
df.head()

Unnamed: 0,screen_name,tweet_ID,text
0,Rochista9,1.049662e+18,@Pablidzic Pues yo la gente de izquierda que c...
1,fenrirdenpadata,1.049662e+18,RT @phenixsaber: 通常人が想定するエロ(敢えて、エロ画像を張る)、自称フェミ...
2,Sanjiro_ni,1.049662e+18,うわ…潮江先輩の声が聞こえる…うるさいなあ
3,taka0109da,1.049662e+18,RT @inoueyusuke: 品川駅の改札の外で、サラリーマンの彼氏の帰りを待つ彼女。\...
4,UltraDz1,1.049662e+18,RT @DemahomTube: @charlieINTEL Black Ops 4 des...


**Network Structure**

Below we'll develop some tools for crawling the friendship graph of some Twitter followers. This exercise is taken directly from *Mining the Social Web*.

In [24]:
# This will let us create new partial
# functions with arguments set to 
# certain values.
from functools import partial

# This was maxint.
# There is no longer a maxint (in Python 3)
from sys import maxsize


def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                                friends_limit=maxsize, followers_limit=maxsize):
    # Must have either screen_name or user_id (logical xor)
    assert (screen_name != None) != (user_id != None), \
    "Must have screen_name or user_id, but not both"
    
    # You can also do this with a function closure.
    get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids,
                                count=5000)
    get_followers_ids = partial(make_twitter_request, twitter_api.followers.ids,
                                count=5000)
    friends_ids, followers_ids = [], []
    for twitter_api_func, limit, ids, label in [
            [get_friends_ids, friends_limit, friends_ids, "friends"],
            [get_followers_ids, followers_limit, followers_ids, "followers"]
            ]:
        #LOOK HERE! This little line is important.
        if limit == 0: continue
        cursor = -1
        while cursor != 0:
            # Use make_twitter_request via the partially bound callable...
            if screen_name:
                response = twitter_api_func(screen_name=screen_name, cursor=cursor)
            else: # user_id
                response = twitter_api_func(user_id=user_id, cursor=cursor)
            if response is not None:
                ids += response['ids']
                cursor = response['next_cursor']
            print('Fetched {0} total {1} ids for {2}'.format(len(ids),
                    label, (user_id or screen_name), file=sys.stderr))
            if len(ids) >= limit or response is None:
                break
    # Do something useful with the IDs, like store them to disk...
    return friends_ids[:friends_limit], followers_ids[:followers_limit]

In [25]:
friends_ids, followers_ids = get_friends_followers_ids(t,
                                screen_name="ZedShaw",
                                friends_limit=10,
                                followers_limit=10)
print(friends_ids)

print(followers_ids)

Fetched 957 total friends ids for ZedShaw
Fetched 5000 total followers ids for ZedShaw
[193322834, 56582285, 190364901, 52551600, 61391304, 259379883, 9038902, 768486347300626432, 22455722, 402502573]
[3773329635, 41868331, 82216927, 2987896214, 906191357311541248, 417898030, 47502725, 1000090889350516736, 1840251, 596716120]


In [28]:
# Create a mostly empty data frame,
# and write it to a CSV file.
df = pd.DataFrame(columns=['ID','followers'])
df.to_csv('followers.csv', index=False)

# Our function
def save_followers(fid, followers):
    df = pd.DataFrame([[fid, followers]], columns=['ID','followers'])
    with open('followers.csv', 'a') as f:
        df.to_csv(f,header=False, index=False)

Finally, let's implement a (somewhat limited) BFS of the followers graph.

In [27]:
def crawl_followers(twitter_api, screen_name, limit=1000000, depth=2):
    
    # Resolve the ID for screen_name and start working with IDs for consistency
    seed_id = str(twitter_api.users.show(screen_name=screen_name)['id'])
    _, next_queue = get_friends_followers_ids(twitter_api, user_id=seed_id,
                        friends_limit=0, followers_limit=limit)
    
    # Store a seed_id => _follower_ids mapping in MongoDB
    save_followers(seed_id, ','.join([str(x) for x in next_queue]))
    
    d = 1
    # Note that in the example in the next cell,
    # we never enter this loop.
    while d < depth:
        d += 1
        # Reset the next_queue so that we can
        # start building up the next level
        # of followers-of-followers
        (queue, next_queue) = (next_queue, [])
        # Loop through the current
        # level of followers
        for fid in queue:
            _, follower_ids = get_friends_followers_ids(twitter_api, user_id=fid,
                                friends_limit=0, followers_limit=limit)
            # Store an ID with a string recording
            # IDs of followers of the user with ID "fid"
            save_followers(str(fid), ','.join([str(x) for x in follower_ids]))
            # Extending the list
            next_queue += follower_ids


In [29]:
crawl_followers(t,'ZedShaw')

Fetched 5000 total followers ids for 15029296
Fetched 10000 total followers ids for 15029296
Fetched 15000 total followers ids for 15029296
Fetched 17384 total followers ids for 15029296
Fetched 5000 total followers ids for 3773329635
Fetched 10000 total followers ids for 3773329635
Fetched 15000 total followers ids for 3773329635
Fetched 20000 total followers ids for 3773329635
Fetched 25000 total followers ids for 3773329635
Fetched 30000 total followers ids for 3773329635
Fetched 35000 total followers ids for 3773329635
Fetched 40000 total followers ids for 3773329635
Fetched 45000 total followers ids for 3773329635
Fetched 50000 total followers ids for 3773329635


Encountered 429 Error (Rate Limit Exceeded)
Retrying in 15 minutes...ZzZ...


KeyboardInterrupt: 

You can read about all of this in more detail in *Mining the Social Web*. Take care to translate from Python 2 into Python 3 if you are using Python 3 (as I have here). See if you can find the bug in the Python 2 code for `crawl_followers`.

In [2]:
dF = pd.read_csv('followers.csv')

In [3]:
dF.head()

Unnamed: 0,ID,followers
0,15029296,"3773329635,41868331,82216927,2987896214,906191..."


**Scraping**

Here are some versatile tools for pulling down information from the Web. Also, we will have a look at some other data sources that cannot really be counted as *social media* per se.

For starters, we have mature tools for parsing HTML (XML broadly speaking). Here is **lxml**.

In [1]:
# The yoozhe
import pandas as pd
import numpy as np

In [4]:
# Since we are going to grab some html
import lxml.html as lh

# Simply way to make HTTP requests
import requests

URL = "https://en.wikipedia.org/wiki/HTML"
r = requests.get(URL)

Next, we take the response and turn it into an *element tree*.

In [5]:
tree   = lh.fromstring(r.text)

We can now get information by using the structure of the HTML

In [6]:
tree.findtext('head/title')

'HTML - Wikipedia'

Furthermore, we can explore the structure.

In [8]:
childs = tree.xpath('child::*')
childs

[<Element head at 0x7f039b921b38>, <Element body at 0x7f039b921cc8>]

The second child is the body of the document.

In [9]:

body = childs[1]

body.attrib['class']

'mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-HTML rootpage-HTML skin-vector action-view'

Let's look for all div elements.

In [10]:
divList = body.xpath('div')

I wonder what is in them.

In [11]:
divList[0].values()

['mw-page-base', 'noprint']

And how many?

In [12]:
len(divList)

5

Not that many, we must have meant *find all of them*:

In [13]:
allDivs = body.xpath('//div')

In [14]:
len(allDivs)

174

I wonder whether there are any interesting tables.

In [15]:
allTables = body.xpath('//table')

In [16]:
len(allTables)

18

<img src="https://i.imgur.com/QQeZcZL.gif" style="width: 200px;"/>



Let's see the first.

In [17]:
table = allTables[0]
table.values()

['infobox', 'width:22em']

That is not much information. What else is in there?

In [21]:
len(table[1].xpath('tr'))

13

So there is a table with many rows

There aren't any structured datasets available in this page (just inspect). OK. Let's try another page from which we can pull some data.

In [22]:
URL = "https://en.wikipedia.org/wiki/Python_(programming_language)"
p = requests.get(URL)

ptree = lh.fromstring(p.text)

Looking at the source, we know what we are looking for. We can select by attribute values.

In [23]:
pTables = ptree.xpath("//table[@class='wikitable']")
len(pTables)

1

Let's see what we've got here.

In [24]:
chart = pTables[0]
chart.xpath('child::*')

[<Element caption at 0x7f039b92def8>, <Element tbody at 0x7f039b92df98>]

In [25]:
lastBody = chart[1]

In [26]:
len(lastBody.xpath('tr'))

14

**Exercise** Let's build a `DataFrame` that represents the data in the table stored (and parsed as a tree) in `lastBody`.

In [27]:
toNP = [[x.text_content().strip() for x in r.xpath('th')] for r in lastBody.xpath('tr') ]
toNP

[['Type', 'mutable', 'Description', 'Syntax example'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

OK, that's not exactly what we expected, but it got the first row, so we can go ahead and use that for columns

In [29]:
tableCols = toNP[0]
tableCols

['Type', 'mutable', 'Description', 'Syntax example']

But how to get the remaining data? Let's redo that list comprehension. There is no way around it; we need a helper function.

In [31]:
def tableHelper(node):
    if len(node.xpath('code')) > 0:
        codes = node.xpath('code')
        txtCodes = ' '.join([x.text_content().strip() for x in codes])
        return txtCodes
    else:
        return node.text_content().strip()

Note the changes below. We are lookin in different tags (`td`) because that is where the data are. We have inserted our helper function in order to process the nodes properly.

In [32]:
toNP = [[tableHelper(x) for x in r.xpath('td')] for r in lastBody.xpath('tr')[1:] ]
toNP

[['bool', 'immutable', 'Boolean value', 'True False'],
 ['bytearray',
  'mutable',
  'Sequence of bytes',
  'bytearray(b\'Some ASCII\') bytearray(b"Some ASCII") bytearray([119, 105, 107, 105])'],
 ['bytes',
  'immutable',
  'Sequence of bytes',
  'b\'Some ASCII\' b"Some ASCII" bytes([119, 105, 107, 105])'],
 ['complex',
  'immutable',
  'Complex number with real and imaginary parts',
  '3+2.7j'],
 ['dict',
  'mutable',
  'Associative array (or dictionary) of key and value pairs; can contain mixed types (keys and values), keys must be a hashable type',
  "{'key1': 1.0, 3: False}"],
 ['ellipsis',
  '',
  'An ellipsis placeholder to be used as an index in NumPy arrays',
  '...'],
 ['float',
  'immutable',
  'Floating point number, system-defined precision',
  '3.1415927'],
 ['frozenset',
  'immutable',
  'Unordered set, contains no duplicates; can contain mixed types, if hashable',
  "frozenset([4.0, 'string', True])"],
 ['int', 'immutable', 'Integer of unlimited magnitude[80]', '42'],
 [

Note that this is imperfect, but did we at least get the basic structure right?

In [33]:
len(toNP), [len(x) for x in toNP], len([len(x) for x in toNP])

(13, [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], 13)

Yes, everything has the right shape. Now let's build an `array` of our data.

In [34]:
tableData = np.array(toNP)
tableData.shape

(13, 4)

Good, now let's build `DataFrame`.

In [35]:
tableFrame = pd.DataFrame(data=tableData, columns=tableCols)
tableFrame

Unnamed: 0,Type,mutable,Description,Syntax example
0,bool,immutable,Boolean value,True False
1,bytearray,mutable,Sequence of bytes,"bytearray(b'Some ASCII') bytearray(b""Some ASCI..."
2,bytes,immutable,Sequence of bytes,"b'Some ASCII' b""Some ASCII"" bytes([119, 105, 1..."
3,complex,immutable,Complex number with real and imaginary parts,3+2.7j
4,dict,mutable,Associative array (or dictionary) of key and v...,"{'key1': 1.0, 3: False}"
5,ellipsis,,An ellipsis placeholder to be used as an index...,...
6,float,immutable,"Floating point number, system-defined precision",3.1415927
7,frozenset,immutable,"Unordered set, contains no duplicates; can con...","frozenset([4.0, 'string', True])"
8,int,immutable,Integer of unlimited magnitude[80],42
9,list,mutable,"List, can contain mixed types","[4.0, 'string', True]"


There is of course, room for improvement. Generally, when engaged in a scraping endeavor you will be interested in a specific kind of element in a specific kind of web page and therefore you will specialize your processing steps in order to clean things up. This is necessary; no amount of 'wrapping' or elegance will free us from having to express what we mean. That said, let's do some wrapping and also make use of some pre-packaged wrapping!

Below, I am going to put everything into a single function `getFrame`. My function *takes a string* and it *returns a `DataFrame`*.

In [36]:
# Copied from above for completeness of this cell.
def tableHelper(node):
    if len(node.xpath('code')) > 0:
        codes = node.xpath('code')
        txtCodes = ' '.join([x.text_content().strip() for x in codes])
        return txtCodes
    else:
        return node.text_content().strip()

def getFrame(name):
    """ Give me your thing
    that you want me to 
    Wikipedia and I will
    try to extract the first
    table from it."""
    # Request and results
    base = "https://en.wikipedia.org/wiki/"
    URL = base+name
    r = requests.get(URL)
    # Prepare my element tree structure
    tree   = lh.fromstring(r.text)
    
    # Finding the table
    pTables = tree.xpath("//table[@class='wikitable']")
    Body = pTables[0].xpath('tbody')[0]

    # Extracting the data
    toNP = [[tableHelper(x) for x in r.xpath('td')] for r in Body.xpath('tr')[1:] ]
    tableData = np.array(toNP)
    
    # Extracting the column names
    header = Body.xpath('tr')[0]
    tableCols = [x.text_content().strip() for x in header.xpath('th')]
    
    # Building the DataFrame
    tableFrame = pd.DataFrame(data=tableData, columns=tableCols)
    
    return tableFrame

Let's try it out.

In [37]:
newFrame = getFrame('porsche')
newFrame

Unnamed: 0,Year ending,Revenue,Pre-tax profit,Production,Sales
0,31 July 2002,"€4,857m",€829m,55050.0,54234
1,31 July 2003,"€5,583m",€933m,73284.0,66803
2,31 July 2004,"€6,148m","€1,137m",81531.0,76827
3,31 July 2005,"€6,574m","€1,238m",90954.0,88379
4,31 July 2006,"€7,273m","€2,110m",102602.0,96794
5,31 July 2007,"€7,368m","€5,857m",101844.0,97515
6,31 July 2008,"€7,466m","€8,569m",105162.0,98652
7,31 July 2009,€?m,"€-2,559m",76739.0,75238
8,31 July 2010,€7.79b,,89123.0,81850
9,31 December 2010,€9.23b,€1.67b[39],,97273


**Exercise** (for the reader)
1. Make this function more robust against errors/incompatible pages
2. Make this function more fully-featured (w.r.t input)
3. Make this function more refined (w.r.t output)

There are existing tools for parsing web pages. Let's have a look at *Beautiful Soup*, named for *tag soup* - a term of endearment for messy HTML or XML. The `lxml` [documentation](https://lxml.de/) mentions Beautiful Soup as an alternative for handling *really broken* HTML documents.

In [38]:
# Example from the Wikipedia page on BS4
# Note the use of the native urllib
from bs4 import BeautifulSoup
import urllib.request

with urllib.request.urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:
    webpage = response.read()
    soup = BeautifulSoup(webpage, 'html.parser')
    for anchor in soup.find_all('a'):
        print(anchor.get('href', '/'))

/
#mw-head
#p-search
/wiki/Wikipedia
/wiki/Free_content
/wiki/Encyclopedia
/wiki/Wikipedia:Introduction
/wiki/Special:Statistics
/wiki/English_language
/wiki/Portal:Arts
/wiki/Portal:Biography
/wiki/Portal:Geography
/wiki/Portal:History
/wiki/Portal:Mathematics
/wiki/Portal:Science
/wiki/Portal:Society
/wiki/Portal:Technology
/wiki/Portal:Contents/Portals
/wiki/File:Camille_Saint-Sa%C3%ABns_in_1900_by_Pierre_Petit.jpg
/wiki/Camille_Saint-Sa%C3%ABns
/wiki/Romantic_music
/wiki/Danse_macabre_(Saint-Sa%C3%ABns)
/wiki/Samson_and_Delilah_(opera)
/wiki/Symphony_No._3_(Saint-Sa%C3%ABns)
/wiki/The_Carnival_of_the_Animals
/wiki/Paris_Conservatoire
/wiki/Saint-Merri
/wiki/La_Madeleine,_Paris
/wiki/Second_French_Empire
/wiki/Robert_Schumann
/wiki/Franz_Liszt
/wiki/Richard_Wagner
/wiki/Impressionism_in_music
/wiki/Dodecaphonic
/wiki/Camille_Saint-Sa%C3%ABns
/wiki/Mark_Oliphant
/wiki/Francis_Nash
/wiki/Tree_swallow
/wiki/Wikipedia:Today%27s_featured_article/October_2018
https://lists.wikimedia.org/m

Let's try to replicate what we did above, this time using `bs4`. The setup is the same.

In [39]:
base = "https://en.wikipedia.org/wiki/" 
URL = base+"Python_(programming_language)"
r = requests.get(URL)

Next, we parse the source.

In [40]:
soup = BeautifulSoup(r.text)
table = soup.findAll('table',{'class':'wikitable'})[0]

Actually, the rest of this goes pretty much as before.

In [41]:
# Illustration of the methods we're using
header = table.findAll('th')[0]
header.text.strip()

'Type'

Let's recreate that list comprehension from before.

In [42]:
toNP = [[x.text.strip() for x in r.findAll('th')] for r in table.findAll('tr') ]
tableCols = toNP[0]

We should probably rewrite `tableHelper`

In [43]:
def tableHelper(node):
    if len(node.findAll('code')) > 0:
        codes = node.findAll('code')
        txtCodes = ' '.join([x.text.strip() for x in codes])
        return txtCodes
    else:
        return node.text.strip()

In [44]:
toNP = [[tableHelper(x) for x in r.findAll('td')] for r in table.findAll('tr')[1:] ]

The rest is as before.

In [45]:
tableData = np.array(toNP)
tableFrame = pd.DataFrame(data=tableData, columns=tableCols)
tableFrame.head()

Unnamed: 0,Type,mutable,Description,Syntax example
0,bool,immutable,Boolean value,True False
1,bytearray,mutable,Sequence of bytes,"bytearray(b'Some ASCII') bytearray(b""Some ASCI..."
2,bytes,immutable,Sequence of bytes,"b'Some ASCII' b""Some ASCII"" bytes([119, 105, 1..."
3,complex,immutable,Complex number with real and imaginary parts,3+2.7j
4,dict,mutable,Associative array (or dictionary) of key and v...,"{'key1': 1.0, 3: False}"


Here is a brief illustration of the options in parsing HTML responses

In [46]:
doc1 = "<a><b /></a>"
doc2 = "<a></p>"

We can use `bs4` in a few different ways. Here we parse `doc1` as HTML

In [47]:
BeautifulSoup(doc1)

<html><body><a><b></b></a></body></html>

Here we parse the same document as XML.

In [48]:
BeautifulSoup(doc1,'xml')

<?xml version="1.0" encoding="utf-8"?>
<a><b/></a>

Here we use the `lxml` library to parse the second, invalid document.

In [49]:
BeautifulSoup(doc2, 'lxml')

<html><body><a></a></body></html>

The `lxml` library decided to simple drop the dangling end tag. What will the `html5lib` library do?

In [50]:
BeautifulSoup(doc2,'html5lib')

<html><head></head><body><a><p></p></a></body></html>

Instead, this one filled in the missing start tag.

**REGEX**

Let's have a brief look at *regular expressions*. This is another tool that will serve you well in wrangling real-world data, particularly text data. This discussion of scraping is as good a place as any for a review of regexes.

In [51]:
tedURL = "https://www.washingtonpost.com/wp-srv/national/longterm/unabomber/manifesto.text.htm?noredirect=on"
tedMan = requests.get(tedURL)
tedParags =  [x.text_content().strip() for x in lh.fromstring(tedMan.text).xpath('//p')]
tedText = 'PARAG'.join(tedParags)

We didn't really need that last part, it was just for fun. Now we have a really long string. Let's look for patterns in it.

>A regular expression (or RE) specifies a set of strings that matches it

See the rest of the documentation [here](https://docs.python.org/3/library/re.html). Let's look at some examples. We inserted those "PARAG" strings. What follows them?

This example below uses a *lookahead* pattern - we don't extract the "PARAG", but the thing that follows it.

In [52]:
import re
parags = re.findall('(?<=PARAG).*',tedText)
parags

["Editor's Note: This is the text of a 35,000-word manifesto as submitted to The ",
 'Return to our special report.PARAGPARAGPARAGIntroductionPARAG1. The Industrial Revolution and its consequences have been a disaster for the human race. They have ',
 '2. The industrial-technological system may survive or it may break down. If it survives, it MAY eventually ',
 '3. If the system breaks down the consequences will still be very painful. But the bigger the system grows the ',
 '4. We therefore advocate a revolution against the industrial system. This revolution may or may not make ',
 '5. In this article we give attention to only some of the negative developments that have grown out of the ',
 'THE PSYCHOLOGY OF MODERN LEFTISMPARAG6. Almost everyone will agree that we live in a deeply troubled society. One of the most widespread ',
 '7. But what is leftism? During the first half of the 20th century leftism could have been practically identified ',
 '8. Even so, our conception of leftism w

That's pretty close to what I wanted. Note that most of these are paragraphs with the author's original numbering.

In [None]:
parags[15]

Why does it just stop right there? From the documentation:

> `.` matches any character except a newline.

That explains it. So what if we wanted just the first word following a "PARAG"? We could do, for instance

In [53]:
import re
parags = re.findall('(?<=PARAG)\S*',tedText)
parags

["Editor's",
 'Return',
 'PARAGPARAGIntroductionPARAG1.',
 '2.',
 '3.',
 '4.',
 '5.',
 'THE',
 '6.',
 '7.',
 '8.',
 '9.',
 'FEELINGS',
 '10.',
 '11.',
 '12.',
 '13.',
 '14.',
 '15.',
 '16.',
 '17.',
 '18.',
 '19.',
 '20.',
 '21.',
 '22.',
 '23.',
 'OVERSOCIALIZATIONPARAG24.',
 '25.',
 '26.',
 '27.',
 '28.',
 '29.',
 '30.',
 '31.',
 '32.',
 'THE',
 '33.',
 '34.',
 '35.',
 '36.',
 '37,',
 'SURROGATE',
 '38.',
 '39.',
 '40.',
 '41.',
 'AUTONOMYPARAG42.',
 '43.',
 '44.',
 'SOURCES',
 '45.',
 '46.',
 '47.',
 '48.',
 '49.',
 '50.',
 '51.',
 '52.',
 '53.',
 '54.',
 '55.',
 '56.',
 '57.',
 '58.',
 'DISRUPTION',
 '59.',
 '60.',
 '61.',
 '62.',
 '63.',
 '64.',
 '65.',
 '66.',
 '67.',
 '68.',
 '69.',
 '70.',
 '71.',
 '72.',
 '73.',
 '74.',
 '75.',
 '76.',
 'HOW',
 '77.',
 '78.',
 '79.',
 '80.',
 '81.',
 '82.',
 '83.',
 '84.',
 '85.',
 '86.',
 'THE',
 '87.',
 '88.',
 '89.',
 '90.',
 '91.',
 '92.',
 'THE',
 '93.',
 '94.',
 '95.',
 '96.',
 '97.',
 '98.',
 'SOME',
 '99.',
 '100.',
 '101.',
 '102.',
 

Not bad. What about the next couple of words? (**exercise**). There is (almost) no limit to what you can express with REs.

**Scrapy**

Let's look at a tool for automating web scraping.
>Scrapy is an application framework for crawling web sites and extracting structured data which can be used for a wide range of useful applications, like data mining, information processing or historical archival.

Let's step through the tutorial.

1. Create a new terminal in Jupyter (or otherwise open a new terminal)
2. Enter `scrapy startproject tutorial` (or another name for your tutorial project)
3. Copy or type the below code into a text file that we'll call `quotes_spider.py`. Create that file in `tutorial/tutorial/spiders`

In [51]:
# Class definition for your first scrapy spider
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename) # old-fashioned string parsing

Remarks on these definitions:
* `name` identifies the Spider. It must be unique within a project, that is, you can’t set the same name for different Spiders.

* `start_requests()` must return an iterable of Requests (you can return a list of requests or write a generator function) which the Spider will begin to crawl from. Subsequent requests will be generated successively from these initial requests.

* `parse()` a method that will be called to handle the response downloaded for each of the requests made. The response parameter is an instance of TextResponse that holds the page content and has further helpful methods to handle it.
The `parse()` method usually parses the response, extracting the scraped data as dicts and also finding new URLs to follow and creating new requests (`Request`) from them.

4. Go to the top-level directory (`tutorial`) and run `scrapy crawl quotes`.
5. Look around (`ls`) the directory. Examine the new files.

What you'll notice about this example is that we really didn't do any parsing. Let's fix that by updating our spider. Replace the parse method definition in `quotes_spider.py` with the new one below:

(*Remark* you can use the scrapy shell to play with the methods used below: `scrapy shell "some_url.com"`)

In [None]:
def parse(self, response):
    for quote in response.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').extract_first(),
            'author': quote.css('small.author::text').extract_first(),
            'tags': quote.css('div.tags a.tag::text').extract(),
        }


Finally, run the command `scrapy crawl quotes -o quotes.json` to extract data from these pages and store the parsed results in the file `quotes.json`.

Then take a look (e.g. `cat quotes.json`).

We are sort of scraping, but there is no *crawling* to speak of. What does that mean, crawling? Let's see how to follow links.

Replace the contents of `quotes_spider.py` with the code below. Note that this one also uses the `start_urls` shortcut (which you can read about on the tutorial page).

In [53]:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

Notice what has changed: we have extracted links and included logic to move to the next one in the list. More precisely...
>Now, after extracting the data, the `parse()` method looks for the link to the next page, builds a full absolute URL using the `urljoin()` method (since the links can be relative) and yields a new request to the next page, registering itself as callback to handle the data extraction for the next page and to keep the crawling going through all the pages.

What else can you do? Here is one example of a slightly more advanced spider that scrapes author information:

In [None]:
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'author'

    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # follow links to author pages
        for href in response.css('.author + a::attr(href)'):
            yield response.follow(href, self.parse_author)

        # follow pagination links
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'bio': extract_with_css('.author-description::text'),
        }

From the tutorial:
>This spider will start from the main page, it will follow all the links to the authors pages calling the `parse_author` callback for each of them, and also the pagination links with the `parse` callback as we saw before.