## Advanced Twitter mining + MongoDB

This notebook is a collection of examples reproduced from the book *Mining the Social Web*. It also includes a helper class that renders JSON as collapsible HTML, and a short introduction to MongoDB.

In [1]:
import twitter
import json
import sys
import time
from urllib.error import URLError
from http.client import BadStatusLine
from collections import Counter

In [32]:
# Authentication
def oauth_login():
    """Build an authenticated client for the Twitter API.

    The four OAuth credentials are read from the environment variables
    ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_OAUTH_TOKEN`` and ``TWITTER_OAUTH_TOKEN_SECRET`` so that secrets
    never have to be typed into (and saved with) the notebook. A variable
    that is not set falls back to the empty string, which is the placeholder
    value this function used before.

    Returns:
        The ``twitter.Twitter`` object created from those credentials.
    """
    import os  # local import: keeps this cell runnable on its own

    CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
    CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
    OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '')
    OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '')

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Log in once and end the cell with the client object so that its repr is
# shown as the cell output.
twitter_api = oauth_login()    

twitter_api

<twitter.api.Twitter at 0xb04de79208>

In [3]:
# Explore trends
def twitter_trends(twitter_api, woe_id):
    """Fetch the trends for one place.

    Args:
        twitter_api: client object exposing ``trends.place``.
        woe_id: place identifier, forwarded to the call as ``_id``.

    Returns:
        Whatever ``twitter_api.trends.place`` returns for that identifier.
    """
    place_trends = twitter_api.trends.place(_id=woe_id)
    return place_trends

# Identifiers passed to twitter_trends: one for the whole world, one for the US.
WORLD_WOE_ID, US_WOE_ID = 1, 23424977

twitter_api = oauth_login()

# Each result is fetched and then immediately pretty-printed as indented JSON.
world_trends = twitter_trends(twitter_api, WORLD_WOE_ID)
print (json.dumps(world_trends, indent=1))

us_trends = twitter_trends(twitter_api, US_WOE_ID)
print (json.dumps(us_trends, indent=1))

[
 {
  "created_at": "2018-01-23T03:08:44Z",
  "as_of": "2018-01-23T03:10:16Z",
  "trends": [
   {
    "name": "#RAW25",
    "url": "http://twitter.com/search?q=%23RAW25",
    "promoted_content": null,
    "tweet_volume": 394866,
    "query": "%23RAW25"
   },
   {
    "name": "#BBB18",
    "url": "http://twitter.com/search?q=%23BBB18",
    "promoted_content": null,
    "tweet_volume": 212047,
    "query": "%23BBB18"
   },
   {
    "name": "#OTGala12",
    "url": "http://twitter.com/search?q=%23OTGala12",
    "promoted_content": null,
    "tweet_volume": 474635,
    "query": "%23OTGala12"
   },
   {
    "name": "#TheBachelor",
    "url": "http://twitter.com/search?q=%23TheBachelor",
    "promoted_content": null,
    "tweet_volume": 48952,
    "query": "%23TheBachelor"
   },
   {
    "name": "Estrelas",
    "url": "http://twitter.com/search?q=Estrelas",
    "promoted_content": null,
    "tweet_volume": 140233,
    "query": "Estrelas"
   },
   {
    "name": "#LHHMIA",
    "url": "http://t

[
 {
  "created_at": "2018-01-23T03:08:44Z",
  "as_of": "2018-01-23T03:10:16Z",
  "trends": [
   {
    "name": "#RAW25",
    "url": "http://twitter.com/search?q=%23RAW25",
    "promoted_content": null,
    "tweet_volume": 394866,
    "query": "%23RAW25"
   },
   {
    "name": "#TheBachelor",
    "url": "http://twitter.com/search?q=%23TheBachelor",
    "promoted_content": null,
    "tweet_volume": 48952,
    "query": "%23TheBachelor"
   },
   {
    "name": "#LHHMIA",
    "url": "http://twitter.com/search?q=%23LHHMIA",
    "promoted_content": null,
    "tweet_volume": 17412,
    "query": "%23LHHMIA"
   },
   {
    "name": "#PumpRules",
    "url": "http://twitter.com/search?q=%23PumpRules",
    "promoted_content": null,
    "tweet_volume": null,
    "query": "%23PumpRules"
   },
   {
    "name": "Torrie Wilson",
    "url": "http://twitter.com/search?q=%22Torrie+Wilson%22",
    "promoted_content": null,
    "tweet_volume": null,
    "query": "%22Torrie+Wilson%22"
   },
   {
    "name": "Ja

In [4]:
import uuid
from IPython.display import display_javascript, display_html, display

# Helper class that renders JSON as collapsible HTML; reference: http://caldwell.github.io/renderjson/
class RenderJSON(object):
    """Display a JSON document as a collapsible tree in the notebook.

    Args:
        json_data: either a string that already contains JSON text (used
            as-is) or any object ``json.dumps`` can serialise (dict, list, ...).

    Attributes set by the constructor:
        json_str: the JSON text handed to the JavaScript renderer.
        uuid: random id of the ``<div>`` that receives the rendered tree.
    """

    def __init__(self, json_data):
        if isinstance(json_data, str):
            # Already serialised: keep the text untouched.
            self.json_str = json_data
        else:
            # Serialise dicts, lists and other JSON-compatible values so that
            # None/True/False become null/true/false in the generated script.
            self.json_str = json.dumps(json_data)
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """Emit the placeholder ``<div>`` and the script that fills it."""
        display_html('<div id="{}" style="height: 100px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): the script is loaded from rawgit.com, which is believed
        # to have been retired -- confirm the URL still resolves.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [5]:
# Tweets search
def twitter_search(twitter_api, q, max_results=200, **kw):
    """Search for tweets matching ``q`` and return at most ``max_results`` of them.

    Args:
        twitter_api: client object exposing ``search.tweets``.
        q: query string.
        max_results: upper bound on the number of statuses returned; values
            above 1000 are clamped to 1000.
        **kw: extra keyword arguments for the first ``search.tweets`` call
            only; follow-up pages reuse the parameters from ``next_results``.

    Returns:
        A list of status dicts, never longer than ``max_results``.
    """
    from urllib.parse import parse_qsl  # local import: cell stays self-contained

    # Enforce a reasonable limit
    max_results = max(0, min(1000, max_results))

    search_results = twitter_api.search.tweets(q=q, count=100, **kw)

    # Copy so that extending the list does not modify the API response.
    statuses = list(search_results['statuses'])

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval

    for _ in range(10): # 10*100 = 1000
        # Stop before spending another rate-limited request we do not need.
        if len(statuses) >= max_results:
            break

        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError: # No more results when next_results doesn't exist
            break

        # next_results has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        # Its values are percent-encoded, so decode them before handing them
        # back to the client; otherwise a query such as %23tag would be sent
        # encoded a second time.
        kwargs = dict(parse_qsl(next_results.lstrip('?'), keep_blank_values=True))

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

    return statuses[:max_results]

# Sample usage: run one search and render its first status as a collapsible tree.
q = "Obama"

twitter_api = oauth_login()
results = twitter_search(twitter_api, q=q, max_results=10)

RenderJSON(results[0])

In [6]:
def twitter_trends(twitter_api, woe_id):
    """Return the result of ``twitter_api.trends.place`` for ``woe_id``.

    NOTE: this repeats the definition from the earlier "Explore trends" cell;
    whichever cell ran last is the one in effect.
    """
    trends_for_place = twitter_api.trends.place(_id=woe_id)
    return trends_for_place

WORLD_WOE_ID = 1

twitter_api = oauth_login()
world_trends = twitter_trends(twitter_api, WORLD_WOE_ID)

# Last expression of the cell: shows the first element as a collapsible tree.
RenderJSON(world_trends[0])

In [7]:
# Same as the previous cell but for the US identifier; reuses the
# `twitter_api` client created in an earlier cell.
US_WOE_ID = 23424977
us_trends = twitter_trends(twitter_api, US_WOE_ID)
RenderJSON(us_trends[0])

In [8]:
# use partial for better function call
from functools import partial

# Pre-bind the arguments that never change. Building a partial has no side
# effects, so all of them are declared up front.
pp = partial(json.dumps, indent=1)
twitter_world_trends = partial(twitter_trends, twitter_api, WORLD_WOE_ID)
authenticated_twitter_search = partial(twitter_search, twitter_api)
authenticated_iphone_twitter_search = partial(authenticated_twitter_search, "iPhone")

# World trends, pretty-printed.
print (pp(twitter_world_trends()))

# The same search twice: once passing the query, once with it pre-bound.
results = authenticated_twitter_search("iPhone")
print (pp(results[0]))

results = authenticated_iphone_twitter_search()
print (pp(results[0]))

[
 {
  "created_at": "2018-01-23T03:03:50Z",
  "as_of": "2018-01-23T03:10:25Z",
  "trends": [
   {
    "name": "#RAW25",
    "url": "http://twitter.com/search?q=%23RAW25",
    "promoted_content": null,
    "tweet_volume": 382547,
    "query": "%23RAW25"
   },
   {
    "name": "#BBB18",
    "url": "http://twitter.com/search?q=%23BBB18",
    "promoted_content": null,
    "tweet_volume": 211226,
    "query": "%23BBB18"
   },
   {
    "name": "#OTGala12",
    "url": "http://twitter.com/search?q=%23OTGala12",
    "promoted_content": null,
    "tweet_volume": 474467,
    "query": "%23OTGala12"
   },
   {
    "name": "#TheBachelor",
    "url": "http://twitter.com/search?q=%23TheBachelor",
    "promoted_content": null,
    "tweet_volume": 47803,
    "query": "%23TheBachelor"
   },
   {
    "name": "#LHHMIA",
    "url": "http://twitter.com/search?q=%23LHHMIA",
    "promoted_content": null,
    "tweet_volume": 16898,
    "query": "%23LHHMIA"
   },
   {
    "name": "Sisu",
    "url": "http://twit

{
 "lang": "ja",
 "retweet_count": 0,
 "in_reply_to_status_id_str": null,
 "text": "iPhone\u30fb\u30b9\u30de\u30db\u30fb\u30b2\u30fc\u30e0\u30fb\u30d6\u30eb\u30fc\u30ec\u30a4\u8cb7\u3044\u53d6\u308a\u307e\u3059\uff01\n\u58ca\u308c\u305f iPhone \u3067\u3082\u5927\u6b53\u8fce\uff01\u305f\u3060\u3044\u307e \u8cb7\u53d6 \u30ad\u30e3\u30f3\u30da\u30fc\u30f3\u4e2d\u306b\u3064\u304d\u3001\u8907\u6570\u304a\u6301\u3061\u3044\u305f\u3060\u304f\u3068 \u91d1\u984d\uff35\uff30\uff01\n\u8cb7\u53d6 \u30d6\u30ed\u30b0\u3082\u3084\u3063\u3066\u307e\u3059\uff01\u8a73\u3057\u304f\u306f\u3053\u3061\u3089 \u2192 https://t.co/0iOzwzAqC2",
 "retweeted": false,
 "geo": null,
 "favorited": false,
 "place": null,
 "id": 955638758119948288,
 "source": "<a href=\"http://twirobo.com/\" rel=\"nofollow\">twiroboJP</a>",
 "truncated": false,
 "created_at": "Tue Jan 23 03:10:09 +0000 2018",
 "favorite_count": 0,
 "entities": {
  "symbols": [],
  "hashtags": [],
  "user_mentions": [],
  "urls": [
   {
    "url": "http

In [9]:
# Storing Json file
def json_file(filename, data = None, delay = 0.1):
    """Read or write ``<filename>.json`` as UTF-8 encoded JSON.

    Args:
        filename: path without the ``.json`` extension, which is appended here.
        data: when ``None`` the file is read; otherwise ``data`` is written.
        delay: unused; kept so existing calls keep working.

    Returns:
        The decoded content when reading. ``None`` when writing, or when the
        file could not be read/written or the content could not be
        decoded/encoded (in which case ``'Check file'`` is printed).
    """
    path = '{0}.json'.format(filename)
    try:
        # `is None` rather than `== None`: an identity test cannot be fooled
        # by objects that override equality.
        if data is None:
            with open(path, "r", encoding = "utf-8") as f:
                return json.load(f)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
    # Only file-system errors, invalid JSON/encoding (ValueError) and
    # unserialisable data (TypeError) are reported; KeyboardInterrupt and
    # programming errors are no longer swallowed.
    except (OSError, ValueError, TypeError):
        print('Check file')

    
# Sample usage: save ten search results under "<q>.json", read the file back
# and render the first stored status.
q = 'bitcoin'

twitter_api = oauth_login()
results = twitter_search(twitter_api, q, max_results=10)

# First call writes (data given), second call reads (no data).
json_file(q, data=results)
test_res = json_file(q)

RenderJSON(test_res[0])

In [10]:
# Finding topics of interest by using the filtering capabilities of the streaming API.

# Query terms

q = 'bitcoin' # Comma-separated list of terms

print ('Filtering the public timeline for track="%s"' % (q,))

# Returns an instance of twitter.Twitter
twitter_api = oauth_login()

# Reference the self.auth parameter
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)

stream = twitter_stream.statuses.filter(track=q)

# Print the text of the first 10 tweets delivered by the stream.
# (The previous inner `while` loop printed the very first tweet ten times.)
count = 0
for tweet in stream:
    # Skip stream messages that carry no tweet text.
    if 'text' not in tweet:
        continue
    print (tweet['text'])
    count += 1
    if count >= 10:
        break


Filtering the public timeline for track="bitcoin"
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with the #RealEstate debt market,” said Ca…
RT @caviar0x: “Our solution diversifies outside the asset class by combining #crypto investments with th

In [11]:
# Extract tweet entities
def extract_tweet_entities(statuses):
    """Collect the entities of every status into five flat lists.

    Args:
        statuses: sequence of status dicts, each with an ``entities`` dict
            containing ``user_mentions``, ``hashtags``, ``urls`` and
            ``symbols`` (``media`` is optional).

    Returns:
        A tuple ``(screen_names, hashtags, urls, media, symbols)`` of lists;
        five empty lists when ``statuses`` is empty.
    """
    if len(statuses) == 0:
        return [], [], [], [], []

    screen_names = [ user_mention['screen_name']
                         for status in statuses
                            for user_mention in status['entities']['user_mentions'] ]

    hashtags = [ hashtag['text']
                     for status in statuses
                        for hashtag in status['entities']['hashtags'] ]

    urls = [ url['expanded_url']
                     for status in statuses
                        for url in status['entities']['urls'] ]

    symbols = [ symbol['text']
                   for status in statuses
                       for symbol in status['entities']['symbols'] ]

    # In some circumstances (such as search results), the media entity
    # may not appear. Default to an empty list per status, so that one status
    # without media no longer discards the media of all the others.
    media = [ item['url']
                     for status in statuses
                        for item in status['entities'].get('media', []) ]

    return screen_names, hashtags, urls, media, symbols

# Sample usage (reuses the `twitter_api` client created in an earlier cell)

q = 'Bitcoin'
statuses = twitter_search(twitter_api, q)

screen_names, hashtags, urls, media, symbols = extract_tweet_entities(statuses)

# Explore the first five items of each list, in the same order as before.
for entity_list in (screen_names, hashtags, urls, media, symbols):
    print (json.dumps(entity_list[0:5], indent=1))

[
 "Thetubbygoat",
 "dogmouthbear",
 "ErikVoorhees",
 "dougvk",
 "ReutersTech"
]
[
 "Cindicator",
 "bitcoin",
 "Security",
 "Bitcoin",
 "satoshi"
]
[
 "https://medium.com/p/how-to-cost-effectively-buy-cnd-cindicator-with-usd-d771b4918c42",
 "https://medium.com/@dougvk/run-your-own-mainnet-lightning-node-2d2eab628a8b",
 "http://feeds.reuters.com/~r/reuters/technologyNews/~3/ljFoE4qHrhE/south-korea-to-ban-cryptocurrency-traders-from-using-anonymous-bank-accounts-idUSKBN1FC069",
 "http://dlvr.it/QCLPDy",
 "https://forum.bitcoin.com/"
]
[]
[
 "CND",
 "BSTY",
 "BTC",
 "BTC",
 "LCD"
]


In [12]:
# Find the most popular tweets in a collection of tweets
def find_popular_tweets(twitter_api, statuses, retweet_threshold=100):
    """Keep the statuses retweeted more than ``retweet_threshold`` times.

    Args:
        twitter_api: not used by this function.
        statuses: iterable of status dicts with a ``retweet_count`` key.
        retweet_threshold: exclusive lower bound on ``retweet_count``.

    Returns:
        A new list with the qualifying statuses, in their original order.
    """
    popular = []
    for status in statuses:
        if status['retweet_count'] > retweet_threshold:
            popular.append(status)
    return popular
    
# Sample usage: search for the query term, keep the statuses above the default
# retweet threshold and print each one's text with its retweet count.

q = "bitcoin"

twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=200)

popular_tweets = find_popular_tweets(twitter_api, search_results)

for tweet in popular_tweets:
    print (tweet['text'], tweet['retweet_count'])


RT @LucydLtd: Great day at The North American Bitcoin Conference. Excellent presentation by our CEO and very warm reception of Lucyd by att… 109
RT @dogmouthbear: นั่งกินข้าว อยู่ดีๆโต๊ะข้างๆก็เปิดประเด็นเรื่อง LGBTQ ละอีผช.แบบ "ผมว่ามันผิด มันจะทำให้ประชากรสูญพันธุ์" แล้ว ผญ คือดูอา… 9467
RT @dogmouthbear: นั่งกินข้าว อยู่ดีๆโต๊ะข้างๆก็เปิดประเด็นเรื่อง LGBTQ ละอีผช.แบบ "ผมว่ามันผิด มันจะทำให้ประชากรสูญพันธุ์" แล้ว ผญ คือดูอา… 9467
RT @ErikVoorhees: “Run your own mainnet Lightning Node” by @dougvk https://t.co/ojq13cnlz8 #bitcoin 152
RT @JonErlichman: Things that didn't exist when the Eagles &amp; Patriots played in the 2005 Super Bowl:

iPhone
Instagram
YouTube
Snapchat
Twi… 1858
RT @SudanGoldCoin: ABC News is now with us!! To find our more - visit: https://t.co/BTNO2OJpLz 
#SudanGC #Sudan #Gold #Coin #Money #blockch… 887
RT @Crypticsup: #cryptics #crowdsale #bitcoin  #ico How to start cryptocurrency trading
https://t.co/ETH2QCREde https://t.co/9WMz8jPhYU 105
RT @nytimes: Creating a 

In [13]:
# Finding the most popular tweet entities in a collection of tweets
def get_common_tweet_entities(statuses, entity_threshold=3):
    """Count every entity found in ``statuses`` and keep the frequent ones.

    All entity kinds returned by ``extract_tweet_entities`` are counted
    together in a single tally.

    Args:
        statuses: iterable of status dicts.
        entity_threshold: minimum number of occurrences for an entity to be kept.

    Returns:
        A list of ``(entity, count)`` tuples with ``count >= entity_threshold``,
        ordered from most to least common.
    """
    tally = Counter()

    # Feed the entities in the same order a flat list would hold them:
    # status by status, entity kind by entity kind.
    for status in statuses:
        for entity_type in extract_tweet_entities([status]):
            tally.update(entity_type)

    frequent = []
    for entity, occurrences in tally.most_common():
        if occurrences >= entity_threshold:
            frequent.append((entity, occurrences))
    return frequent



# Sample usage: search for the query term and print the entities that reach
# the default frequency threshold.
q = 'bitcoin'

twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=100)
common_entities = get_common_tweet_entities(search_results)

print ("Most common tweet entities")
print (common_entities)

Most common tweet entities
[('bitcoin', 17), ('Bitcoin', 9), ('dogmouthbear', 9), ('cryptocurrency', 6), ('btc', 5), ('BTC', 4), ('Ethereum', 3), ('Crypto', 3), ('cryptocurrencies', 3), ('crypto', 3)]


In [14]:
from prettytable import PrettyTable

# Get some frequency data (`q` is the query left over from the previous cell)

twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q, max_results=100)
common_entities = get_common_tweet_entities(search_results)

# Use PrettyTable to create a nice tabular display

pt = PrettyTable(field_names=['Entity', 'Count'])

# One table row per (entity, count) pair.
for kv in common_entities:
    pt.add_row(kv)

# Set column alignment
pt.align['Entity'] = 'l'
pt.align['Count'] = 'r'

print (pt)


+------------------+-------+
| Entity           | Count |
+------------------+-------+
| bitcoin          |    15 |
| dogmouthbear     |    10 |
| Bitcoin          |     8 |
| cryptocurrency   |     6 |
| btc              |     4 |
| cryptocurrencies |     3 |
| crypto           |     3 |
| BTC              |     3 |
+------------------+-------+


In [15]:
# Finding users who have retweeted a status

twitter_api = oauth_login()

# (heading, status id) pairs, looked up in this order.
retweet_lookups = [
    ("""User IDs for retweeters of a tweet by @fperez_org
that was retweeted by @SocialWebMining and that @jyeee then retweeted
from @SocialWebMining's timeline\n""", 334188056905129984),
    ("@SocialWeb's retweet of @fperez_org's tweet\n", 345723917798866944),
    ("@jyeee's retweet of @fperez_org's tweet\n", 338835939172417537),
]

# For each status: print the heading, the ids of its retweeters, then the
# status itself as indented JSON.
for heading, status_id in retweet_lookups:
    print (heading)
    print (twitter_api.statuses.retweeters.ids(_id=status_id)['ids'])
    print (json.dumps(twitter_api.statuses.show(_id=status_id), indent=1))

User IDs for retweeters of a tweet by @fperez_org
that was retweeted by @SocialWebMining and that @jyeee then retweeted
from @SocialWebMining's timeline

[17930287, 132373965, 13085242, 66592853, 25696968]
{
 "retweet_count": 5,
 "geo": null,
 "text": "Thrilled to see more books w/ computational content as IPython notebooks: 2nd Ed. @SocialWebMining by @ptwobrussell http://t.co/56giCSjcqw",
 "retweeted": false,
 "in_reply_to_status_id_str": null,
 "id_str": "334188056905129984",
 "favorited": false,
 "place": null,
 "coordinates": null,
 "id": 334188056905129984,
 "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
 "truncated": false,
 "contributors": null,
 "favorite_count": 9,
 "entities": {
  "symbols": [],
  "hashtags": [],
  "user_mentions": [
   {
    "name": "MiningTheSocialWeb",
    "id_str": "132373965",
    "indices": [
     82,
     98
    ],
    "id": 132373965,
    "screen_name": "SocialWebMining"
   },
   {
    "name": "Matthew Russell",
 

[]
{
 "retweet_count": 5,
 "geo": null,
 "text": "RT @fperez_org: Thrilled to see more books w/ computational content as IPython notebooks: 2nd Ed. @SocialWebMining by @ptwobrussell http://\u2026",
 "retweeted": false,
 "in_reply_to_status_id_str": null,
 "id_str": "338835939172417537",
 "favorited": false,
 "place": null,
 "coordinates": null,
 "id": 338835939172417537,
 "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
 "truncated": false,
 "contributors": null,
 "favorite_count": 0,
 "entities": {
  "symbols": [],
  "hashtags": [],
  "user_mentions": [
   {
    "name": "Fernando Perez",
    "id_str": "244991150",
    "indices": [
     3,
     14
    ],
    "id": 244991150,
    "screen_name": "fperez_org"
   },
   {
    "name": "MiningTheSocialWeb",
    "id_str": "132373965",
    "indices": [
     98,
     114
    ],
    "id": 132373965,
    "screen_name": "SocialWebMining"
   },
   {
    "name": "Matthew Russell",
    "id_str": "13085242",
    "indi

In [16]:
# Extracting a retweet's attribution

import re

def get_rt_attributions(tweet):
    """Return the screen names a tweet attributes its content to.

    Two sources are combined:

    * the author of ``tweet['retweeted_status']``, when that key is present;
    * the ``@mentions`` that directly follow the first "RT" or "via" marker
      in ``tweet['text']``.

    Args:
        tweet: status dict with a ``text`` key.

    Returns:
        A list of unique, lower-cased screen names without the leading "@"
        (in no particular order); empty when nothing is found.
    """

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J)
    # The leading \b keeps "RT"/"via" from matching inside another word
    # (e.g. "start @home").
    rt_patterns = re.compile(r"\b(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_attributions = []

    # Inspect the tweet to see if it was produced with /statuses/retweet/:id.
    # See https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.

    if 'retweeted_status' in tweet:
        attribution = tweet['retweeted_status']['user']['screen_name'].lower()
        rt_attributions.append(attribution)

    match = rt_patterns.search(tweet['text'])
    if match is not None:
        # Take the handles themselves rather than splitting on whitespace,
        # so separators such as "," or ":" never end up inside a name.
        rt_attributions += re.findall(r"@(\w+)", match.group(2))

    # Filter out any duplicates

    return list(set([rta.strip("@").lower() for rta in rt_attributions]))

# Sample usage: fetch two statuses by id and print the attributions of each.
twitter_api = oauth_login()

for status_id in (214746575765913602, 345723917798866944):
    tweet = twitter_api.statuses.show(_id=status_id)
    print (get_rt_attributions(tweet))

['jyeee']
['fperez_org']


In [17]:
# Making robust Twitter requests
def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
    """Call ``twitter_api_func(*args, **kw)`` and retry on recoverable errors.

    ``twitter.api.TwitterHTTPError`` is dispatched on its HTTP status code:

    * 401 / 404: a message is printed and ``None`` is returned, so callers
      must be prepared for a missing response.
    * 429: sleeps 15 minutes plus 5 seconds, then retries with the back-off
      reset to 2 seconds.
    * 500, 502, 503, 504: sleeps for the current back-off, multiplies it by
      1.5 and retries.
    * any other code: the error is re-raised.

    If the back-off has grown beyond 3600 seconds, any HTTP error is re-raised.

    ``URLError`` and ``BadStatusLine`` are retried with the same back-off; the
    exception is re-raised after more than ``max_errors`` such failures
    without an HTTP error in between.

    Note that ``max_errors`` is the second positional parameter, so positional
    arguments meant for ``twitter_api_func`` have to come after it.
    """

    # Messages for the status codes that are reported to the caller as None.
    unrecoverable = {
        401: 'Encountered 401 Error (Not Authorized)',
        404: 'Encountered 404 Error (Not Found)',
    }

    # Nested helper: returns the next back-off in seconds, returns None for
    # 401/404, or re-raises the error.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600: # Seconds
            print ('Too many retries. Quitting.')
            raise e

        status = e.e.code

        if status in unrecoverable:
            print (unrecoverable[status])
            return None

        if status == 429:
            print ('Encountered 429 Error (Rate Limit Exceeded)')
            if not sleep_when_rate_limited:
                raise e # Caller must handle the rate limiting issue
            print ("Retrying in 15 minutes...ZzZ...")
            sys.stderr.flush()
            time.sleep(60*15 + 5)
            print ('...ZzZ...Awake now and trying again.')
            return 2

        if status in (500, 502, 503, 504):
            print ('Encountered %i Error. Retrying in %i seconds' % \
                (status, wait_period))
            time.sleep(wait_period)
            return wait_period * 1.5

        raise e

    # End of nested helper function

    wait_period, error_count = 2, 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError as e:
            # An HTTP-level answer resets the count of consecutive
            # connection errors.
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except (URLError, BadStatusLine) as e:
            # Both connection-level failures share the same back-off; only
            # the printed message differs.
            error_count += 1
            time.sleep(wait_period)
            wait_period *= 1.5
            if isinstance(e, URLError):
                print("URLError encountered. Continuing.")
            else:
                print("BadStatusLine encountered. Continuing.")
            if error_count > max_errors:
                print("Too many consecutive errors...bailing out.")
                raise


# Sample usage: look up one screen name through the retrying wrapper and
# pretty-print the response as indented JSON.
twitter_api = oauth_login()

response = make_twitter_request(twitter_api.users.lookup, 
                                screen_name="deanabb")

print (json.dumps(response, indent=1))

[
 {
  "has_extended_profile": false,
  "translator_type": "none",
  "location": "San Diego",
  "profile_background_tile": true,
  "protected": false,
  "favourites_count": 522,
  "is_translation_enabled": false,
  "profile_use_background_image": true,
  "utc_offset": -28800,
  "default_profile": false,
  "profile_link_color": "0084B4",
  "profile_image_url_https": "https://pbs.twimg.com/profile_images/573000010591166464/-GEuAmBe_normal.jpeg",
  "status": {
   "lang": "en",
   "retweet_count": 2,
   "in_reply_to_status_id_str": null,
   "text": "Businesses Need Realistic Digital Marketing Expectations https://t.co/Px81TYsjyD",
   "retweeted": false,
   "geo": null,
   "favorited": false,
   "place": null,
   "id": 954790010695266304,
   "source": "<a href=\"http://www.linkedin.com/\" rel=\"nofollow\">LinkedIn</a>",
   "truncated": false,
   "created_at": "Sat Jan 20 18:57:32 +0000 2018",
   "favorite_count": 0,
   "entities": {
    "symbols": [],
    "hashtags": [],
    "user_mentions"

In [18]:
# Resolving user profile information
def get_user_profile(twitter_api, screen_names=None, user_ids=None):
    """Look up user profiles by screen name or by user id.

    Args:
        twitter_api: client object exposing ``users.lookup``.
        screen_names: sequence of screen names, or ``None``.
        user_ids: sequence of user ids, or ``None``.

    Exactly one of ``screen_names`` / ``user_ids`` must be given.

    Returns:
        A dict mapping each returned user's ``screen_name`` (or ``id``, when
        looking up by id) to its profile dict. Batches for which the request
        yields no response are skipped.

    Raises:
        AssertionError: if both or neither of the two arguments are given.
    """

    # Must have either screen_name or user_id (logical xor)
    assert (screen_names is not None) != (user_ids is not None), \
    "Must have screen_names or user_ids, but not both"

    # Decide the lookup mode from which argument was supplied, not from its
    # truthiness, so that an empty list is handled as "nothing to look up".
    by_screen_name = screen_names is not None
    items = list(screen_names if by_screen_name else user_ids)

    items_to_info = {}

    # Process 100 items at a time per the API specifications for /users/lookup.
    # See https://dev.twitter.com/docs/api/1.1/get/users/lookup for details.
    for start in range(0, len(items), 100):

        items_str = ','.join([str(item) for item in items[start:start + 100]])

        if by_screen_name:
            response = make_twitter_request(twitter_api.users.lookup,
                                            screen_name=items_str)
        else: # user_ids
            response = make_twitter_request(twitter_api.users.lookup,
                                            user_id=items_str)

        # make_twitter_request returns None for 401/404 responses.
        if response is None:
            continue

        for user_info in response:
            if by_screen_name:
                items_to_info[user_info['screen_name']] = user_info
            else: # user_ids
                items_to_info[user_info['id']] = user_info

    return items_to_info

# Sample usage: resolve two screen names and print the resulting
# {screen_name: profile} dict.

twitter_api = oauth_login()

print (get_user_profile(twitter_api, screen_names=["deanabb", "kncukier"]))

{'deanabb': {'has_extended_profile': False, 'translator_type': 'none', 'location': 'San Diego', 'profile_background_tile': True, 'protected': False, 'favourites_count': 522, 'is_translation_enabled': False, 'profile_use_background_image': True, 'utc_offset': -28800, 'default_profile': False, 'profile_link_color': '0084B4', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/573000010591166464/-GEuAmBe_normal.jpeg', 'status': {'lang': 'en', 'retweet_count': 2, 'in_reply_to_status_id_str': None, 'text': 'Businesses Need Realistic Digital Marketing Expectations https://t.co/Px81TYsjyD', 'retweeted': False, 'geo': None, 'favorited': False, 'place': None, 'id': 954790010695266304, 'source': '<a href="http://www.linkedin.com/" rel="nofollow">LinkedIn</a>', 'truncated': False, 'created_at': 'Sat Jan 20 18:57:32 +0000 2018', 'favorite_count': 0, 'entities': {'symbols': [], 'hashtags': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/Px81TYsjyD', 'expanded_url': 'https://lnk

In [19]:
# Extracting tweet entities from arbitrary text
import twitter_text

# Free text containing a mention, a URL and two hashtags to extract.
txt = "Looking forward to giving a new workshop on March 6 at the 2018 @KNIME Summit in Berlin: 'The Power of Random - Course: Using Perturbation Experiments to Improve Model Accuracy and Interpretation' https://www.knime.com/about/events/the-power-of-random-course-using-perturbation-experiments-to-improve-model-accuracy … #KNIMESummit2018 #OpenSource"

ex = twitter_text.Extractor(txt)

# Print what each extractor method returns for the text above.
print ("Screen Names:", ex.extract_mentioned_screen_names_with_indices())
print ("URLs:", ex.extract_urls_with_indices())
print ("Hashtags:", ex.extract_hashtags_with_indices())

Screen Names: [{'indices': [64, 70], 'screen_name': 'KNIME'}]
URLs: [{'url': 'https://www.knime.com/about/events/the-power-of-random-course-using-perturbation-experiments-to-improve-model-accuracy', 'indices': [197, 315]}]
Hashtags: [{'hashtag': 'KNIMESummit2018', 'indices': [318, 334]}, {'hashtag': 'OpenSource', 'indices': [335, 346]}]


In [20]:
# Getting all friends or followers for a user
from sys import maxsize

def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                              friends_limit=maxsize, followers_limit=maxsize):
    """Collect the friend ids and follower ids of one user.

    Args:
        twitter_api: client object exposing ``friends.ids`` and ``followers.ids``.
        screen_name: screen name of the user, or ``None``.
        user_id: id of the user, or ``None``.
        friends_limit: maximum number of friend ids to return; 0 skips the
            friends lookup entirely.
        followers_limit: same, for follower ids.

    Exactly one of ``screen_name`` / ``user_id`` must be given.

    Returns:
        A tuple ``(friends_ids, followers_ids)`` of lists, each cut to its limit.

    Raises:
        AssertionError: if both or neither of ``screen_name`` / ``user_id``
            are given.
    """

    # Must have either screen_name or user_id (logical xor)
    assert (screen_name != None) != (user_id != None), \
    "Must have screen_name or user_id, but not both"

    # See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
    # https://dev.twitter.com/docs/api/1.1/get/followers/ids for details
    # on API parameters

    # Both endpoints go through the retrying wrapper, 5000 ids per request.
    get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids,
                              count=5000)
    get_followers_ids = partial(make_twitter_request, twitter_api.followers.ids,
                                count=5000)

    friends_ids = []
    followers_ids = []

    lookups = (
        (get_friends_ids, friends_limit, friends_ids, "friends"),
        (get_followers_ids, followers_limit, followers_ids, "followers"),
    )

    # The user is addressed by screen name when one is given, else by id.
    if screen_name:
        who = {'screen_name': screen_name}
    else:
        who = {'user_id': user_id}

    for twitter_api_func, limit, ids, label in lookups:

        if limit == 0:
            continue

        # Follow the cursor page by page; a next_cursor of 0 ends the loop.
        cursor = -1
        while cursor != 0:

            response = twitter_api_func(**who, cursor=cursor)

            if response is not None:
                ids.extend(response['ids'])
                cursor = response['next_cursor']

            print('Fetched {0} total {1} ids for {2}'.format(len(ids),
                                                    label, (user_id or screen_name)))

            # Stop once enough ids were collected or the request gave nothing.
            if len(ids) >= limit or response is None:
                break

    return friends_ids[:friends_limit], followers_ids[:followers_limit]


# Sample usage: fetch the ids for one screen name, keeping at most ten of
# each kind, and print both lists.
twitter_api = oauth_login()

friends_ids, followers_ids = get_friends_followers_ids(twitter_api, 
                                                       screen_name="deanabb", 
                                                       friends_limit=10, 
                                                       followers_limit=10)

print (friends_ids)
print (followers_ids)

Fetched 249 total friends ids for deanabb
Fetched 4705 total followers ids for deanabb
[1449955262, 5518682, 829108462244093952, 59481779, 2479516946, 4855085037, 913079610887294976, 763866327849238528, 722214536460173312, 4361928854]
[506089354, 3228024243, 826399829093924864, 955384264442466304, 350335365, 157526667, 857586472718880768, 955297146512838656, 955271968940490752, 4466770816]


In [21]:
# Analyzing a user's friends and followers
def setwise_friends_followers_analysis(screen_name, friends_ids, followers_ids):
    """Print a five-line set comparison of a user's friends and followers.

    Args:
        screen_name: name used in the printed sentences.
        friends_ids: iterable of ids the user follows.
        followers_ids: iterable of ids following the user.

    Duplicates are ignored because both inputs are converted to sets.
    Nothing is returned.
    """
    following = set(friends_ids)
    followers = set(followers_ids)

    not_following_back = following - followers
    not_followed_back = followers - following
    mutual = following & followers

    report = [
        '{0} is following {1}'.format(screen_name, len(following)),
        '{0} is being followed by {1}'.format(screen_name, len(followers)),
        '{0} of {1} are not following {2} back'.format(
            len(not_following_back), len(following), screen_name),
        '{0} of {1} are not being followed back by {2}'.format(
            len(not_followed_back), len(followers), screen_name),
        '{0} has {1} mutual friends'.format(screen_name, len(mutual)),
    ]

    for line in report:
        print (line)


# Sample usage: fetch the ids for one screen name with the default limits and
# print the set comparison.
screen_name = "deanabb"

twitter_api = oauth_login()

friends_ids, followers_ids = get_friends_followers_ids(twitter_api, 
                                                       screen_name=screen_name)
setwise_friends_followers_analysis(screen_name, friends_ids, followers_ids)

Fetched 249 total friends ids for deanabb
Fetched 4705 total followers ids for deanabb
deanabb is following 249
deanabb is being followed by 4705
76 of 249 are not following deanabb back
4532 of 4705 are not being followed back by deanabb
deanabb has 173 mutual friends


In [22]:
#Analyzing tweet content
def analyze_tweet_content(statuses):
    """Print simple lexical statistics for a collection of tweets.

    Prints the lexical diversity (unique tokens / total tokens) of the words,
    screen names and hashtags found in ``statuses``, followed by the average
    number of whitespace-separated words per tweet. Nothing is returned.

    Args:
        statuses: Sequence of tweet dicts; each must have a 'text' key.
            Screen names and hashtags are obtained from
            extract_tweet_entities(statuses).
    """

    if not statuses:
        print ("No statuses to analyze")
        return

    # A nested helper function for computing lexical diversity
    def lexical_diversity(tokens):
        # Tweets may contain no mentions or no hashtags at all; report 0.0
        # for an empty token list instead of raising ZeroDivisionError.
        if not tokens:
            return 0.0
        return 1.0*len(set(tokens))/len(tokens)

    # A nested helper function for computing the average number of words per tweet
    def average_words(texts):
        total_words = sum(len(s.split()) for s in texts)
        return 1.0*total_words/len(texts)

    status_texts = [ status['text'] for status in statuses ]
    # Only the first two of the five returned collections are used here.
    screen_names, hashtags, _, _, _ = extract_tweet_entities(statuses)

    # Compute a collection of all words from all tweets
    words = [ w
          for t in status_texts
              for w in t.split() ]

    print ("Lexical diversity (words):", lexical_diversity(words))
    print ("Lexical diversity (screen names):", lexical_diversity(screen_names))
    print ("Lexical diversity (hashtags):", lexical_diversity(hashtags))
    print ("Averge words per tweet:", average_words(status_texts))

    
# Sample usage: search for a term and print lexical statistics of the results

# Search query string passed to twitter_search.
q = 'datascience'
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q)

# Prints the statistics; the function returns nothing.
analyze_tweet_content(search_results)

Lexical diversity (words): 0.393154486586494
Lexical diversity (screen names): 0.4426229508196721
Lexical diversity (hashtags): 0.27419354838709675
Averge words per tweet: 16.37878787878788


In [23]:
# Analyzing a user's favorite tweets
def analyze_favorites(twitter_api, screen_name, entity_threshold=2):
    """Print a summary of the tweets a user has favorited.

    Requests up to 200 favorites for ``screen_name`` in a single call (no
    paging), prints how many were returned, tabulates the common entities in
    them, and then prints the lexical statistics produced by
    analyze_tweet_content. Nothing is returned.

    Args:
        twitter_api: Authenticated Twitter API client.
        screen_name: Account whose favorites are fetched.
        entity_threshold: Forwarded to get_common_tweet_entities as its
            ``entity_threshold`` argument.
    """

    favs = twitter_api.favorites.list(screen_name=screen_name, count=200)
    print ("Number of favorites:", len(favs))

    # Figure out what some of the common entities are, if any, in the content

    common_entities = get_common_tweet_entities(favs,
                                                entity_threshold=entity_threshold)

    # Use PrettyTable to create a nice tabular display

    pt = PrettyTable(field_names=['Entity', 'Count'])
    # Each item is added as one table row; presumably an (entity, count)
    # pair, matching the two declared columns.
    for kv in common_entities:
        pt.add_row(kv)
    pt.align['Entity'], pt.align['Count'] = 'l', 'r' # Set column alignment

    # In Python 3 a bare `print` is only a name reference and emits nothing;
    # it must be called to produce the intended blank separator line.
    print()
    print ("Common entities in favorites...")
    print (pt)


    # Print out some other stats
    print()
    print ("Some statistics about the content of the favorities...")
    print()
    analyze_tweet_content(favs)

    # Could also start analyzing link content or summarized link content, and more.

# Sample usage: summarize the favorites of one account

twitter_api = oauth_login()
# entity_threshold is left at the function's default of 2.
analyze_favorites(twitter_api, "deanabb")

Number of favorites: 199
Common entities in favorites...
+---------------------------------------------------------------------------+-------+
| Entity                                                                    | Count |
+---------------------------------------------------------------------------+-------+
| deanabb                                                                   |    83 |
| pawcon                                                                    |    23 |
| DataScience                                                               |    23 |
| BigData                                                                   |    14 |
| AI                                                                        |    11 |
| MachineLearning                                                           |    11 |
| datascience                                                               |     8 |
| SmarterHQ                                                                 |     8

In [24]:
#Saving and accessing JSON data with MongoDB
import pymongo 

def save_to_mongo(data, mongo_db, mongo_db_coll):
    """Insert one document or a collection of documents into MongoDB.

    Connects to the MongoDB server on localhost:27017, inserts ``data`` into
    collection ``mongo_db_coll`` of database ``mongo_db``, and closes the
    connection again.

    Args:
        data: A single document (dict) or an iterable of documents.
        mongo_db: Name of the database.
        mongo_db_coll: Name of the collection inside that database.

    Returns:
        The inserted ``_id`` when ``data`` is a single dict, otherwise the
        list of inserted ``_id`` values (an empty list when ``data`` is
        empty, in which case no connection is made).
    """

    single_document = isinstance(data, dict)

    # Materialize iterables so emptiness can be checked up front;
    # insert_many() rejects an empty document list.
    documents = [data] if single_document else list(data)
    if not documents:
        return []

    # The context manager closes the client (and its connection pool) once
    # the insert has completed.
    with pymongo.MongoClient("localhost", 27017) as client:

        # Reference a particular collection in a particular database

        coll = client[mongo_db][mongo_db_coll]

        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one()/insert_many() are the replacements. The
        # return shape of insert() (one id, or a list of ids) is preserved.

        if single_document:
            return coll.insert_one(data).inserted_id
        return coll.insert_many(documents).inserted_ids

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None):
    """Query a collection on the MongoDB server at localhost:27017.

    ``criteria`` and ``projection`` are handed to ``find`` to limit what is
    returned, as documented in
    http://docs.mongodb.org/manual/reference/method/db.collection.find/
    For more sophisticated queries, consider MongoDB's aggregation framework.

    Args:
        mongo_db: Name of the database.
        mongo_db_coll: Name of the collection inside that database.
        return_cursor: When true, return the cursor itself (recommended for
            large amounts of data) instead of a list.
        criteria: Filter document; ``None`` means an empty filter.
        projection: Optional projection; omitted from the call when ``None``.

    Returns:
        The cursor when ``return_cursor`` is true, otherwise a list holding
        every item the cursor yields.
    """
    collection = pymongo.MongoClient("localhost", 27017)[mongo_db][mongo_db_coll]

    query = {} if criteria is None else criteria

    # Only pass a projection argument when the caller supplied one.
    find_args = (query,) if projection is None else (query, projection)
    matches = collection.find(*find_args)

    return matches if return_cursor else list(matches)

# Sample usage: store search results in MongoDB and read a filtered subset back

# The query string doubles as the collection name below.
q = 'bitcoin'

twitter_api = oauth_login()
results = twitter_search(twitter_api, q, max_results=10)

# Each run of this cell inserts the fetched results into
# database 'search_results', collection named after q.
save_to_mongo(results, 'search_results', q)

# Last expression of the cell: displays the stored documents whose
# retweet_count is greater than 10.
load_from_mongo('search_results', q, criteria = {'retweet_count':{'$gt':10}})



[{'_id': ObjectId('5a668665f550052a34e978b3'),
  'contributors': None,
  'coordinates': None,
  'created_at': 'Tue Jan 23 00:48:32 +0000 2018',
  'entities': {'hashtags': [],
   'symbols': [],
   'urls': [],
   'user_mentions': [{'id': 2244340904,
     'id_str': '2244340904',
     'indices': [3, 10],
     'name': 'Recode',
     'screen_name': 'Recode'}]},
  'favorite_count': 0,
  'favorited': False,
  'geo': None,
  'id': 955603120813469696,
  'id_str': '955603120813469696',
  'in_reply_to_screen_name': None,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'is_quote_status': False,
  'lang': 'en',
  'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
  'place': None,
  'retweet_count': 24,
  'retweeted': False,
  'retweeted_status': {'contributors': None,
   'coordinates': None,
   'created_at': 'Mon Jan 22 21:17:54 +0000 2018',
   'entities': {'hashtags': [],
    'symbols': [],
    

In [25]:
# Open a direct connection and bind handles to the database and collection
# used in the following cells. Relies on `q` still being set by an earlier cell.
client = pymongo.MongoClient("localhost", 27017)
dr = client['search_results']
coll = dr[q]

In [26]:
# Display the repr of the database handle
dr

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'search_results')

In [27]:
# Display the repr of the collection handle
coll

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'search_results'), 'bitcoin')

In [28]:
# Build a query for documents whose retweet_count is greater than 10,
# capped at a single result by .limit(1)
currr = coll.find({'retweet_count':{'$gt':10}}).limit(1)

In [29]:
# Iterate over the cursor and print each document it yields
for cur in currr:
    print(cur)

{'_id': ObjectId('5a668665f550052a34e978b3'), 'retweet_count': 24, 'in_reply_to_status_id_str': None, 'text': 'RT @Recode: Bitcoin broker Coinbase booked $1 billion in revenue last year — so the company has told hovering VCs to back off https://t.co/…', 'retweeted': False, 'geo': None, 'favorited': False, 'place': None, 'coordinates': None, 'id': 955603120813469696, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'created_at': 'Tue Jan 23 00:48:32 +0000 2018', 'favorite_count': 0, 'entities': {'symbols': [], 'hashtags': [], 'user_mentions': [{'name': 'Recode', 'id_str': '2244340904', 'indices': [3, 10], 'id': 2244340904, 'screen_name': 'Recode'}], 'urls': []}, 'contributors': None, 'is_quote_status': False, 'in_reply_to_status_id': None, 'id_str': '955603120813469696', 'metadata': {'result_type': 'recent', 'iso_language_code': 'en'}, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name

In [30]:
# Count the documents whose retweet_count is exactly 10.
# Cursor.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4;
# Collection.count_documents() is the supported replacement.
fivestarcount = coll.count_documents({'retweet_count': 10})
print(fivestarcount)

0


In [31]:
print('\nThe sum of retween count accross all data grouped by location ')
# Sum retweet_count per user location with a two-stage aggregation pipeline
stargroup=coll.aggregate(
# The aggregation pipeline is a list of stages applied in order
[
# Stage 1 ($project): keep only the fields needed downstream, renamed to
# 'geo', 'location' and 'retweet' ('geo' is not used by the next stage)
{ 
    '$project':{
    'geo' :'$user.geo_enabled', 'location':'$user.location', 'retweet':'$retweet_count'}
},
# Stage 2 ($group): one output document per distinct 'location', with
# 'total' holding the sum of 'retweet' over that group
{
    '$group':{'_id':'$location','total':{'$sum':'$retweet'}}
}
# End of the pipeline list
] )
# Print one result document per group
for group in stargroup:
    print(group)
    


The sum of retween count accross all data grouped by location 
{'total': 0, '_id': 'Cork, Ireland'}
{'total': 5930, '_id': 'Iraq'}
{'total': 24, '_id': 'Austin, TX'}
{'total': 8267, '_id': ' Thailand'}
{'total': 7, '_id': 'Petro metro'}
{'total': 38, '_id': 'Washington, DC'}
{'total': 77051, '_id': ''}
{'total': 215, '_id': 'Australia'}
{'total': 0, '_id': 'Moldova'}
{'total': 2, '_id': 'Miami, Florida'}
{'total': 1, '_id': 'Worldwide'}
{'total': 67, '_id': 'San Diego, CA'}
{'total': 0, '_id': 'Pune, India'}
{'total': 2, '_id': 'New Orleans, LA'}
{'total': 1, '_id': 'Montclair, NJ'}
{'total': 2, '_id': 'New Jersey, USA'}
{'total': 0, '_id': 'japan'}
{'total': 31, '_id': 'جدة, المملكة العربية السعودية'}
{'total': 0, '_id': 'New York'}
{'total': 28, '_id': 'Front Range, CO'}
{'total': 0, '_id': 'Sunderland'}
{'total': 0, '_id': 'North Chicago, IL/ Washington '}
{'total': 0, '_id': 'California'}
{'total': 0, '_id': 'Nairobi'}
{'total': 0, '_id': 'Jacksonville, FL'}
{'total': 8267, '_id':