forked from ptwobrussell/Recipes-for-Mining-Twitter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
recipe__harvest_timeline.py
120 lines (87 loc) · 3.48 KB
/
recipe__harvest_timeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
import sys
import time
import twitter
import couchdb
from couchdb.design import ViewDefinition
from recipe__oauth_login import oauth_login
from recipe__make_twitter_request import make_twitter_request
def usage():
    """Print command-line help for this script and terminate.

    Called when the arguments in ``sys.argv`` fail validation, so the
    process ends with a non-zero status to signal the error to the caller.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print('Usage: $ %s <timeline name> [max_pages] [user]' % (sys.argv[0], ))
    # print('') emits a bare newline on both Python 2 and Python 3.
    print('')
    print('\t<timeline name> in [home, user] and is required')
    # The page limits mirror the caps applied to MAX_PAGES further down.
    print('\t0 < max_pages <= 4 for the home timeline (larger values are capped)')
    print('\t0 < max_pages <= 16 for the user timeline (larger values are capped)')
    print('\t<user> is required when <timeline name> is user')
    print('Notes:')
    print('\t* ~800 statuses are available from the home timeline.')
    print('\t* ~3200 statuses are available from the user timeline.')
    print('\t* The public timeline must now be accessed with the streaming API.')
    print('\t* See https://dev.twitter.com/docs/api/1.1/get/statuses/sample for details')

    # sys.exit is always available, unlike the exit() helper injected by the
    # site module; status 1 marks the invocation as failed.
    sys.exit(1)
# Command-line handling. usage() prints the help text and exits, so every
# check below aborts the script as soon as an argument is unusable.

# Largest number of pages worth requesting per timeline; each page asks for
# 200 statuses (see 'count' in KW).
MAX_PAGES_BY_TIMELINE = {'home': 4, 'user': 16}

if len(sys.argv) < 2 or sys.argv[1] not in MAX_PAGES_BY_TIMELINE:
    usage()
if len(sys.argv) > 2 and not sys.argv[2].isdigit():
    usage()
if len(sys.argv) > 3 and sys.argv[1] != 'user':
    usage()
# The user timeline needs a screen name as the third argument; show the help
# text instead of failing later with an IndexError on sys.argv[3].
if sys.argv[1] == 'user' and len(sys.argv) < 4:
    usage()

TIMELINE_NAME = sys.argv[1]

if len(sys.argv) > 2:
    MAX_PAGES = int(sys.argv[2])
    # isdigit() accepts "0", but at least one page must be requested.
    if MAX_PAGES < 1:
        usage()
    # Silently clamp oversized requests to what the timeline can serve.
    MAX_PAGES = min(MAX_PAGES, MAX_PAGES_BY_TIMELINE[TIMELINE_NAME])
else:
    # max_pages is optional: default to everything the timeline offers.
    MAX_PAGES = MAX_PAGES_BY_TIMELINE[TIMELINE_NAME]

USER = None

KW = {  # For the Twitter API call
    'count': 200,
    'trim_user': 'true',
    'include_rts': 'true',
    'since_id': 1,
}

if TIMELINE_NAME == 'user':
    USER = sys.argv[3]
    KW['screen_name'] = USER
# Log in through the OAuth recipe; the returned object is what every Twitter
# API call below goes through.
t = oauth_login()

# Establish a connection to a CouchDB database
server = couchdb.Server('http://localhost:5984')
DB = 'tweets-%s-timeline' % (TIMELINE_NAME, )

if USER:
    # CouchDB only accepts lowercase database names, so fold the screen name
    # to lowercase; a mixed-case name would otherwise be rejected.
    DB = '%s-%s' % (DB, USER.lower())

try:
    db = server.create(DB)
except couchdb.http.PreconditionFailed:
    # Already exists, so append to it, keeping in mind that duplicates could occur
    db = server[DB]
# Try to avoid appending duplicate data into the system by only retrieving tweets
# newer than the ones already in the system. A trivial mapper/reducer combination
# allows us to pull out the max tweet id which guards against duplicates for the
# home and user timelines. This is best practice for the Twitter v1.1 API
# See https://dev.twitter.com/docs/working-with-timelines
def id_mapper(doc):
    """Map a stored document to a ``(None, <id>)`` row.

    Every row shares the key ``None`` so that a reducer sees all ids
    together. Documents without an ``'id'`` field yield nothing instead of
    raising ``KeyError``.

    NOTE(review): this function is handed to ViewDefinition with
    language='python'; couchdb-python is understood to ship the function's
    source text to the view server, so keep it self-contained -- confirm.
    """
    if 'id' in doc:
        yield (None, doc['id'])
# Find the maximum tweet id
def max_finding_reducer(keys, values, rereduce):
    """Return the largest entry of ``values``.

    ``keys`` and ``rereduce`` are part of the reducer signature but are not
    read: the maximum is computed the same way regardless of their value.
    """
    largest = max(value for value in values)
    return largest
# Publish the mapper/reducer pair as the view 'index/max_tweet_id'.
view = ViewDefinition('index', 'max_tweet_id', id_mapper, max_finding_reducer,
                      language='python')
view.sync(db)

# Ask the view for the highest stored tweet id and use it as since_id, so only
# newer tweets are requested. An empty result (nothing stored yet) falls back
# to 1.
rows = list(db.view('index/max_tweet_id'))
if rows:
    KW['since_id'] = int(rows[0].value)
else:
    KW['since_id'] = 1
# Resolve the endpoint once: statuses.home_timeline or statuses.user_timeline.
api_call = getattr(t.statuses, TIMELINE_NAME + '_timeline')

# Fetch one page per pass. The first page is always requested; further pages
# are requested until MAX_PAGES is reached or a page comes back empty.
page_num = 0
while True:
    tweets = make_twitter_request(t, api_call, **KW)

    # Actually storing tweets in CouchDB is as simple as passing them
    # into a call to db.update
    db.update(tweets, all_or_nothing=True)

    # Progress goes to stderr for every page, the first one included.
    sys.stderr.write('Fetched %i tweets\n' % (len(tweets), ))
    page_num += 1

    if page_num >= MAX_PAGES or not tweets:
        break

    # Necessary for traversing the timeline in Twitter's v1.1 API:
    # Get the next query's max id parameter to pass in.
    # See https://dev.twitter.com/docs/working-with-timelines
    KW['max_id'] = min(tweet['id'] for tweet in tweets) - 1

sys.stderr.write('Done fetching tweets\n')