# Mining The Social Web 

## http://chimera.labs.oreilly.com/books/1234000001583/ch01.html

### Extract data from pdf

* tabula: extract data from pdf http://tabula.technology/
* pdftotext: use on linux


### Extract data from website

In [40]:
import urllib.request, time, re, random, hashlib
from bs4 import BeautifulSoup 
import string
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
from itertools import combinations
from scipy.cluster import hierarchy
from scipy.spatial import distance
%matplotlib osx


In [41]:
# IMPORTANT:  PLEASE USE THE FOLLOWING fetch(url) TO LOAD ALL YOUR 
# WEBPAGES. PLEASE DO NOT DIRECTLY LOAD PAGES. THIS WILL ENSURE THAT
# PAGES ARE CACHED AS FILES IN YOUR DIRECTORY, AND AVOID UNNECESSARY
# LOAD ON WEBSITES.  ALSO WHEN PAGES ARE ACTUALLY LOADED, THE REQUESTS
# ARE STAGGERED AS EXPECTED OF HUMAN BROWSING.

# Compassionate Caching inspired by 
# http://lethain.com/an-introduction-to-compassionate-screenscraping/

last_fetched_at = None

def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
        return result

In [42]:
class Entity(object):
    def __init__(self, name):
        self.name = name
        self.words = None
        self.vector = None

In [43]:
def get_search_results(entity):
    """Return an html with search results for given entity."""  
    name = entity.name
    url_encoded_name = name.replace(' ', '%20')
    result = fetch('http://www.usatoday.com'+ '/search/' + url_encoded_name + '/')
    return result

In [44]:
def get_articles(results_html, n=3):
    """Return a list of article htmls for given search results html."""
    soupify_result = BeautifulSoup(results_html, 'html.parser')
    link_results = soupify_result.find_all('a', attrs={'class':'search-result-item-link'})
    domain = 'http://www.usatoday.com'
    articles = []
    num_result = 0
    for link in link_results:
        # this excludes any video, audio results
        if (link['href'].startswith('/story') and num_result < n):
            # the links are relevant links, convert them to absolute links
            article = fetch(domain+link['href'])
            articles.append(article)
            num_result += 1
    return articles

In [45]:
from nltk.corpus import stopwords
from nltk import word_tokenize

def get_words(articles):
    """Return list of representative words from a list of article htmls."""
    bag_of_words = []
    for article in articles:
        soupify_article = BeautifulSoup(article,'html.parser')
        paragraphs = soupify_article.find_all('p',attrs={'class':None})
        for p in paragraphs:
            if p.parent.name != 'a':
                words = word_tokenize(p.text)
                filtered_words = [w.lower() for w in words if w.isalpha()]
                filtered_stop_words = [w for w in filtered_words if w not in stopwords.words('english')]
                bag_of_words += filtered_stop_words
        
    bag_of_words = set(bag_of_words)
    return bag_of_words

In [46]:
def get_bag_of_words(entity):
    results = get_search_results(entity)
    articles = get_articles(results,3)
    return get_words(articles)

In [47]:
entity = Entity('hillary clinton')
entity.words = get_bag_of_words(entity)
print(entity.words)

Loading: http://www.usatoday.com/search/hillary%20clinton/
Loading: http://www.usatoday.com/story/news/politics/elections/2016/2016/02/02/cruz-topples-trump-clinton-sanders-await-final-tally/79685690/
Loading: http://www.usatoday.com/story/news/politics/onpolitics/2016/02/02/hillary-clinton-new-hampshire-iowa-democrats/79700614/
Loading: http://www.usatoday.com/story/news/politics/elections/2016/2016/02/03/young-supporters-drive-sanders-virtual-tie-clinton/79739492/
{'jeffrey', 'folks', 'laughter', 'loss', 'respective', 'registering', 'since', 'showed', 'city', 'helped', 'won', 'theme', 'declined', 'worked', 'focus', 'ames', 'reminded', 'economic', 'slimmest', 'embarrassment', 'volunteers', 'nashua', 'photo', 'juggernaut', 'questions', 'revolution', 'age', 'separating', 'number', 'polling', 'cedar', 'maryland', 'move', 'strong', 'error', 'described', 'ago', 'thus', 'willing', 'vote', 'jubilant', 'reflection', 'big', 'total', 'together', 'introduced', 'sound', 'worth', 'political', 'res

### Use local file

### Twitter Data Example

In [4]:
import twitter

CONSUMER_KEY = 
CONSUMER_SECRET = 
OAUTH_TOKEN = 
OAUTH_TOKEN_SECRET = 

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

twitter_api = twitter.Twitter(auth=auth)

print(twitter_api)

<twitter.api.Twitter object at 0x10432a208>


In [10]:
q = '#humanservices'
search_results = twitter_api.search.tweets(q=q, count=100)
statuses = search_results['statuses']

In [33]:
from nltk.corpus import stopwords

status_texts = [ status['text'] 
                 for status in statuses ]

screen_names = [ user_mention['screen_name'] 
                 for status in statuses
                     for user_mention in status['entities']['user_mentions'] ]

hashtags = [ hashtag['text'] 
             for status in statuses
                 for hashtag in status['entities']['hashtags'] ]

# Compute a collection of all words from all tweets
words = [ w 
          for t in status_texts 
              for w in t.split() if w.lower() not in stopwords.words('english') and w != "RT" and w != '&amp']
         

In [36]:
from collections import Counter

for item in [words, screen_names, hashtags]:
    c = Counter(item)
    print(c.most_common()[:20]) # top 10

[('#humanservices', 37), ('#HumanServices', 33), ('@AccenturePubSvc:', 12), ('&amp;', 10), ('families', 8), ('leadership', 7), ('outcomes?', 5), ('digital', 5), ('leaders', 5), ('turn', 5), ('#childsupport', 5), ('Read', 5), ('https:…', 5), ('Check', 5), ('ideas', 5), ('questions', 4), ('@unitedwaychi', 4), ('w/', 4), ('State', 4), ('new', 4)]
[('AccenturePubSvc', 12), ('unitedwaychi', 4), ('MsToya1913', 3), ('SDHumanServices', 2), ('UrbanCollegeBos', 1), ('Ostendorff', 1), ('DebMatheny', 1), ('MetroFamChicago', 1), ('SalArmyTampa', 1), ('UNG_News', 1), ('wiscjobs', 1), ('StevensonU', 1), ('SantaMonicaPD', 1), ('santamonicacity', 1), ('Bevhillsyeg', 1), ('johnkeypm', 1), ('RANDCorporation', 1), ('LETUHistory', 1), ('Jamie_Post', 1), ('amazon', 1)]
[('humanservices', 41), ('HumanServices', 35), ('analytics', 6), ('childsupport', 5), ('Illinois', 4), ('health', 4), ('career', 3), ('1711FNDN', 3), ('1711humanservices', 3), ('director', 3), ('Analytics', 3), ('DevelopmentalDisabilities', 3

In [37]:
from prettytable import PrettyTable

for label, data in (('Word', words), 
                    ('Screen Name', screen_names), 
                    ('Hashtag', hashtags)):
    pt = PrettyTable(field_names=[label, 'Count']) 
    c = Counter(data)
    [ pt.add_row(kv) for kv in c.most_common()[:20]]
    pt.align[label], pt.align['Count'] = 'l', 'r' # Set column alignment
    print(pt)

+-------------------+-------+
| Word              | Count |
+-------------------+-------+
| #humanservices    |    37 |
| #HumanServices    |    33 |
| @AccenturePubSvc: |    12 |
| &amp;             |    10 |
| families          |     8 |
| leadership        |     7 |
| outcomes?         |     5 |
| digital           |     5 |
| leaders           |     5 |
| turn              |     5 |
| #childsupport     |     5 |
| Read              |     5 |
| https:…           |     5 |
| Check             |     5 |
| ideas             |     5 |
| questions         |     4 |
| @unitedwaychi     |     4 |
| w/                |     4 |
| State             |     4 |
| new               |     4 |
+-------------------+-------+
+-----------------+-------+
| Screen Name     | Count |
+-----------------+-------+
| AccenturePubSvc |    12 |
| unitedwaychi    |     4 |
| MsToya1913      |     3 |
| SDHumanServices |     2 |
| UrbanCollegeBos |     1 |
| Ostendorff      |     1 |
| DebMatheny      |     1 |


In [39]:
retweets = [
            # Store out a tuple of these three values ...
            (status['retweet_count'], 
             status['retweeted_status']['user']['screen_name'],
             status['text']) 
            
            # ... for each status ...
            for status in statuses 
            
            # ... so long as the status meets this condition.
                if 'retweeted_status' in status
           ]

# Slice off the first 5 from the sorted results and display each item in the tuple

pt = PrettyTable(field_names=['Count', 'Screen Name', 'Text'])
[ pt.add_row(row) for row in sorted(retweets, reverse=True)[:5] ]
pt.max_width['Text'] = 50
pt.align= 'l'
print(pt)

+-------+-----------------+----------------------------------------------------+
| Count | Screen Name     | Text                                               |
+-------+-----------------+----------------------------------------------------+
| 4     | DebMatheny      | RT @DebMatheny: #Minnesota #ChildSexTrafficking    |
|       |                 | #PedophiliaRing #CPS #HumanServices                |
|       |                 | #CharitableFoundations! https://t.co/Fu1FfVrbvz    |
|       |                 | https:/…                                           |
| 3     | MsToya1913      | RT @MsToya1913: In #Illinois, real damage is being |
|       |                 | done to our neediest children and families &amp;   |
|       |                 | to our #humanservices @unitedwaychi                |
|       |                 |                                                    |
|       |                 | https:…                                            |
| 3     | MsToya1913      | 