#!/usr/bin/env python
#
# This script fetches the message/tweet history of a Twitter feed.
# It is meant to be imported by another script or run in a Python shell.
#
# Remo Giermann <rgiermann@liberejo.de>
# 2010/10/26
#
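# Example session (a sketch: it assumes this file is importable as
# `gettwitterfeed`, that the unauthenticated v1 endpoint below still
# answers, and uses a made-up screen name):
#
#   >>> from gettwitterfeed import fetch_tweets, print_tweets
#   >>> tweets = fetch_tweets('some_user', pages=2, count=50)
#   >>> print_tweets(tweets)
#
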
import time
import json
import urllib
import re

__feedurl = "http://api.twitter.com/1/statuses/user_timeline.json"
__max_pagecount = 16
__debug = True

def debug(s):
    # Print progress messages if __debug is enabled.
    if __debug:
        print s,

def convert_datetime(tstring):
    """Convert twitter's 'created_at' date/time-format to ISO-format."""
    try:
        t = tstring.replace('+0000', '')
        return time.strftime("%Y-%m-%d %H:%M UTC", time.strptime(t))
    except ValueError:
        # Leave unparseable (or already converted) timestamps untouched.
        return tstring

def strip_html(data):
    """Get rid of HTML"""
    p = re.compile(r'<.*?>')
    return p.sub('', data)

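# For instance, on Twitter's 'source' field (which wraps the client name
# in an anchor tag) a call would behave like this:
#
#   >>> strip_html('<a href="http://twitter.com/">web</a>')
#   'web'
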
def fetch_tweets(username, pages=1, count=20, include_rt=True):
    """
    Fetches the JSON feed of a user's twitter history.

    Returns a list containing one dictionary per tweet (resembling the JSON
    structure). With default arguments, the 20 most recent messages will be
    fetched.

    Arguments:
    ----------
    username:   the screen name of the user's feed
    pages:      number of pages to be fetched (default: 1, max: 16)
    count:      number of messages per page (default: 20, max: 200)
    include_rt: whether or not to include retweets in the fetched messages
                (default: True)
    """
    s = 's' if pages > 1 else ''
    debug("Attempting to fetch %i page%s of %s's twitter feed.\n" % (pages, s, username))

    if pages > __max_pagecount:
        pages = __max_pagecount
        debug("Count exceeds maximum (%i).\n" % __max_pagecount)
    elif pages <= 0:
        pages = 1
    if count == 0:
        count = 20

    tweets = []
    for p in range(1, pages + 1):
        url = __feedurl + "?screen_name=%s&include_rts=%i&page=%i&count=%i" % (username, include_rt, p, count)
        debug("Fetching page %i..." % p)
        # read() instead of readline(): the response is a single JSON
        # document, but it need not fit on one line.
        feed = urllib.urlopen(url).read()
        debug("done.\n")
        tweets += json.loads(feed)
    return tweets

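# For example (hypothetical screen name; up to pages*count messages come
# back, so this would request at most 200 tweets in two calls):
#
#   tweets = fetch_tweets('some_user', pages=2, count=100)
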
def fetch_all_tweets(username, include_rt=True):
    """
    Try to get all messages from the feed (twitter limits this to 3200).

    Arguments:
    ----------
    username:   the screen name of the user's feed
    include_rt: whether or not to include retweets in the fetched messages
                (default: True)

    Returns a list of dictionaries.
    """
    return fetch_tweets(username, pages=16, count=200, include_rt=include_rt)

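# Note the arithmetic behind the limit: 16 pages of 200 messages each is
# exactly the 3200 tweets the API exposed.
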
def print_tweets(tweets):
    """
    Prints `tweets` on standard output.

    Arguments:
    ----------
    tweets: list of dictionaries (returned from fetch_tweets, fetch_all_tweets
            or simplify_tweets)
    """
    for t in tweets:
        print t['text']
        print strip_html(t['source'])
        print convert_datetime(t['created_at'])
        print

def simplify_tweets(tweets):
    """
    Generates a simplified JSON structure from `tweets`.

    Arguments:
    ----------
    tweets: list of dictionaries (returned from fetch_tweets or fetch_all_tweets)

    Returns a list of dictionaries resembling a simplified JSON structure
    containing only id, text, time and source of the messages.
    """
    jtweets = []
    for t in tweets:
        d = {'id': t['id'],
             'text': t['text'],
             'source': strip_html(t['source']),
             'created_at': convert_datetime(t['created_at'])}
        jtweets.append(d)
    return jtweets

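# A simplified tweet then looks roughly like this (values made up):
#
#   {'id': 123456789,
#    'text': 'Hello world',
#    'source': 'web',
#    'created_at': '2010-10-26 12:34 UTC'}
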
def tweets_to_file(tweets, fp, indent=4, as_json=True):
    """
    Takes `tweets` and writes either a simplified JSON structure or plain
    text to the file object `fp`.

    Arguments:
    ----------
    tweets:  list of dictionaries (returned from fetch_tweets, fetch_all_tweets
             or simplify_tweets)
    fp:      a writable file object
    indent:  indentation level of the JSON file (default: 4, 0 to only
             include newlines)
    as_json: whether to write JSON or text to `fp` (default: True)
    """
    if as_json:
        # Full tweets from the API carry far more than the four keys of a
        # simplified tweet, so reduce them first.
        if tweets and len(tweets[0]) > 4:
            tweets = simplify_tweets(tweets)
        # Dump the list in one go so the file is a single valid JSON document.
        json.dump(tweets, fp, indent=indent)
    else:
        for t in tweets:
            fp.write(t['text'].encode('utf8') + '\n')
            fp.write(strip_html(t['source']).encode('utf8') + '\n')
            fp.write(convert_datetime(t['created_at']).encode('utf8') + '\n\n')
    fp.flush()
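
# A minimal command-line sketch (illustrative only: the screen name comes
# from argv and the output filename is derived from it):
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        name = sys.argv[1]
        f = open(name + '.json', 'w')
        tweets_to_file(fetch_tweets(name), f)
        f.close()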