Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
  • 16 commits
  • 15 files changed
  • 0 comments
  • 1 contributor
46 app/base/handlers.py
@@ -13,6 +13,9 @@
13 13
14 14 import base.constants
15 15
  16 +def _format_rfc_1123_date(date):
  17 + return wsgiref.handlers.format_date_time(time.mktime(date.timetuple()))
  18 +
16 19 class BaseHandler(webapp.RequestHandler):
17 20 def _render_template(self, template_file_name, template_values={}):
18 21 # Even though we set the DJANGO_SETTINGS_MODULE environment variable in
@@ -23,7 +26,7 @@ def _render_template(self, template_file_name, template_values={}):
23 26 # variable here.
24 27 # TODO(mihaip): figure out why this is happening
25 28 if 'DJANGO_SETTINGS_MODULE' not in os.environ:
26   - logging.error('DJANGO_SETTINGS_MODULE was not in the environment')
  29 + logging.warning('DJANGO_SETTINGS_MODULE was not in the environment')
27 30 os.environ['DJANGO_SETTINGS_MODULE'] = 'django_settings'
28 31
29 32 # Temporarily insert the template's directory into the template path,
@@ -68,31 +71,46 @@ def _write_input_error(self, error_message):
68 71 self._write_error(400)
69 72 self.response.out.write('Input error: %s' % error_message)
70 73
71   - def _write_json(self, obj):
  74 + def _write_json(self, obj, pretty_print=False):
72 75 self.response.headers['Content-Type'] = 'application/json'
73   - self.response.out.write(simplejson.dumps(obj))
  76 + self.response.out.write(
  77 + simplejson.dumps(obj, indent=pretty_print and 2 or None))
74 78
75   - def _handle_not_modified(self, last_modified_date):
  79 + def _get_if_modified_since(self):
76 80 if 'If-Modified-Since' not in self.request.headers:
77   - return False
  81 + return None
78 82
79   - if_modified_since_tuple = email.utils.parsedate(self.request.headers['If-Modified-Since'])
  83 + if_modified_since_tuple = email.utils.parsedate_tz(
  84 + self.request.headers['If-Modified-Since'])
80 85 if not if_modified_since_tuple:
  86 + return None
  87 + return email.utils.mktime_tz(if_modified_since_tuple)
  88 +
  89 + def _handle_not_modified(self, last_modified_date):
  90 + if_modified_since = self._get_if_modified_since()
  91 + if not if_modified_since:
81 92 return False
82   - if_modified_since = datetime.datetime(*if_modified_since_tuple[:6])
  93 +
  94 + if_modified_since = datetime.datetime.utcfromtimestamp(if_modified_since)
83 95 if if_modified_since < last_modified_date:
84 96 return False
85 97
86 98 self.response.set_status(304)
87 99 return True
88 100
89   - def _add_caching_headers(self, last_modified_date, max_age_sec):
90   - def format_date(date):
91   - return wsgiref.handlers.format_date_time(
92   - time.mktime(date.timetuple()))
  101 + def _add_last_modified_header(self, last_modified_date):
  102 + self.response.headers['Last-Modified'] = \
  103 + _format_rfc_1123_date(last_modified_date)
93 104
94   - self.response.headers['Last-Modified'] = format_date(last_modified_date)
95   - self.response.headers['Expires'] = format_date(
  105 + def _add_caching_headers(self, last_modified_date, max_age_sec):
  106 + self._add_last_modified_header(last_modified_date)
  107 + self.response.headers['Expires'] = _format_rfc_1123_date(
96 108 last_modified_date + datetime.timedelta(seconds=max_age_sec))
97   - self.response.headers['Cache-Control'] = 'public, max-age=%d' % max_age_sec
  109 + self.response.headers['Cache-Control'] = \
  110 + 'public, max-age=%d' % max_age_sec
  111 +
  112 + def _user_agent_contains(self, s):
  113 + if 'User-Agent' not in self.request.headers:
  114 + return False
98 115
  116 + return self.request.headers['User-Agent'].find(s) != -1
11 app/base/util.py
@@ -12,6 +12,14 @@
12 12 _TAG_WHITESPACE_RE1 = re.compile('>[\\s]+<([^a])')
13 13 _TAG_WHITESPACE_RE2 = re.compile('>[\\s]+<a')
14 14
  15 +# Per http://www.w3.org/TR/REC-xml/#charsets XML disallows all control
  16 +# characters...
  17 +_CONTROL_CHARACTER_MAP = dict.fromkeys(range(32))
  18 +# ...except for
  19 +del _CONTROL_CHARACTER_MAP[0x9] # tab,
  20 +del _CONTROL_CHARACTER_MAP[0xA] # line feed,
  21 +del _CONTROL_CHARACTER_MAP[0xD] # and carriage return.
  22 +
15 23 def strip_html_whitespace(html):
16 24 html = strip_spaces_between_tags(html)
17 25 html = _CONSECUTIVE_WHITESPACE_RE.sub(' ', html)
@@ -23,6 +31,9 @@ def generate_id(prefix):
23 31 return prefix + base64.urlsafe_b64encode(
24 32 uuid.uuid4().bytes).replace('=', '')
25 33
  34 +def strip_control_characters(s):
  35 + return s.translate(_CONTROL_CHARACTER_MAP)
  36 +
26 37 class JsonProperty(db.Property):
27 38 data_type = db.Blob
28 39
26 app/birdfeeder/TODO
... ... @@ -1,22 +1,23 @@
1 1 todo:
  2 +- trigger a Reader crawlondemand when first generating a feed (or resetting the ID)
  3 +- add timestamps to birdpinger
  4 +- see why Ann's feed doesn't use PSHB in Reader
2 5 - cache user data so that feed fetches involve no twitter RPCs
3   -- If-Modified-Since support in feed handler
4   -- refresh following data both server-side and in birdpinger periodically
5 6 - reduce update cron job frequency
6   -- drop status ID/timestamps older than 24 hours from StreamData
7 7 - catch exceptions when fetching timeline tweets
8 8 - reply and retweet links
9 9 - use user timezone to format timestamps (instead of GMT)
10 10 - "scavenger" to delete old tweets from StatusData?
11 11 - remove URLs (especially if at end?) from title snippet
  12 +- @mentions feed
  13 +- prune null values and useless keys (e.g. profile_background_image_url_https) from persisted statuses
12 14
13 15 thumbnail URLs to support:
14   -- youtube.com/youtu.be
15 16 - vimeo.com
16 17 - 4sq.com
17 18 - ow.ly/i (from https://twitter.com/doriegreenspan/status/153033123968466944)
18   -- lockerz.com (from https://twitter.com/wednesdaychef/status/153099094968123392)
19   -- cl.ly (from http://twitter.com/krave/status/154684703570530304) - thumbs.cl.ly per http://developer.getcloudapp.com/view-item
  19 +- img.ly (from http://twitter.com/mrgan/status/157981871307886592)
  20 +- pic.twitter.com (when manually inserted into tweets, it doesn't show up as a media entity - http://twitter.com/redrockcoffee/status/157687607613071360)
20 21
21 22 done:
22 23 - update oauth_template
@@ -51,3 +52,16 @@ done:
51 52 - tweak instagram URL: http://instagr.am/p/eb-bo/ (from http://twitter.com/jenna/status/154676233131659264)
52 53 - allow feed ID to be revoked
53 54 - Twitter Streaming API/PSHB proxy (in Go)
  55 +- strip control characters in tweets
  56 +- lockerz.com (from https://twitter.com/wednesdaychef/status/153099094968123392)
  57 +- cl.ly (from http://twitter.com/krave/status/154684703570530304)
  58 +- If-Modified-Since support in feed handler
  59 +- kick off feed update when first signing in
  60 +- show username that the feed is being generated for on homepage
  61 +- refresh following data both server-side and in birdpinger periodically
  62 +- youtube.com/youtu.be
  63 +- check if @ replies missing in birdpinger is Twitter's fault or tweetstream's
  64 +- convert newlines to html
  65 +- encoding bug in https://twitter.com/#!/smcbride/status/158634839388590080 - hub dropped the < when giving it to schedule_crawler?
  66 +- drop status ID/timestamps older than 24 hours from StreamData
  67 +- in birdpinger, filter out tweets that are in_reply_to_user_id for users that we don't care about (e.g. https://api.twitter.com/1/statuses/show.json?id=160246443020517376&include_entities=true)
39 app/birdfeeder/data.py
... ... @@ -1,6 +1,9 @@
  1 +import datetime
1 2 import itertools
  3 +import logging
2 4 import os
3 5
  6 +from google.appengine.api import taskqueue
4 7 from google.appengine.ext import db
5 8
6 9 from base.constants import CONSTANTS
@@ -70,6 +73,24 @@ def create_api(self):
70 73 ))
71 74 return api
72 75
  76 + def enqueue_update_task(
  77 + self,
  78 + countdown=None,
  79 + expected_status_id=None,
  80 + update_retry_count=None):
  81 + params = {}
  82 + if expected_status_id is not None:
  83 + params['expected_status_id'] = expected_status_id
  84 + if update_retry_count is not None:
  85 + params['update_retry_count'] = update_retry_count
  86 + params.update(self.as_dict())
  87 +
  88 + taskqueue.add(
  89 + queue_name='birdfeeder-update',
  90 + url='/tasks/bird-feeder/update',
  91 + countdown=countdown,
  92 + params=params)
  93 +
73 94 @staticmethod
74 95 def create(twitter_id, oauth_token, oauth_token_secret):
75 96 return Session(
@@ -178,10 +199,13 @@ def kind(cls):
178 199
179 200 class FollowingData(db.Model):
180 201 following_map = base.util.JsonProperty(indexed=False, required=True)
  202 + last_update_time = db.DateTimeProperty(auto_now=True)
181 203
182 204 _SINGLETON_ID = 'following_data'
  205 + _REFRESH_INTERVAL = datetime.timedelta(hours=1)
183 206
184 207 _following_map = None
  208 + _last_update_time = None
185 209
186 210 @staticmethod
187 211 def get_following_list():
@@ -199,8 +223,12 @@ def get_following_twitter_ids(twitter_id):
199 223
200 224 @staticmethod
201 225 def _is_stale():
202   - # TODO(mihaip): refresh every day or so
203   - return FollowingData._following_map == None
  226 + if (FollowingData._following_map is None or
  227 + FollowingData._last_update_time is None):
  228 + return True
  229 +
  230 + data_age = datetime.datetime.utcnow() - FollowingData._last_update_time
  231 + return data_age > FollowingData._REFRESH_INTERVAL
204 232
205 233 @staticmethod
206 234 def _update():
@@ -213,7 +241,9 @@ def _update():
213 241 for twitter_id, following_twitter_ids in stored_data.following_map.items():
214 242 twitter_id = int(twitter_id)
215 243 FollowingData._following_map[twitter_id] = following_twitter_ids
216   - return
  244 + FollowingData._last_update_time = stored_data.last_update_time
  245 + if not FollowingData._is_stale():
  246 + return
217 247
218 248 following_map = {}
219 249 for session in Session.all():
@@ -231,7 +261,8 @@ def _update():
231 261 stored_data.put()
232 262
233 263 FollowingData._following_map = following_map
  264 + FollowingData._last_update_time = stored_data.last_update_time
234 265
235 266 @classmethod
236 267 def kind(cls):
237   - return 'birdfeeder.FollowingData'
  268 + return 'birdfeeder.FollowingData'
26 app/birdfeeder/handlers/feed.py
... ... @@ -1,3 +1,4 @@
  1 +import calendar
1 2 import datetime
2 3 import logging
3 4 import time
@@ -7,6 +8,7 @@
7 8 import session
8 9
9 10 FEED_STATUS_INTERVAL_SEC = 24 * 60 * 60 # One day
  11 +IF_MODIFIED_SINCE_INTERVAL_SEC = 60 * 60 # One hour
10 12 MIN_FEED_ITEMS = 10
11 13
12 14 # Overrides the session accessors from SessionHandler to key the session
@@ -33,11 +35,28 @@ def _get_signed_in(self):
33 35
34 36 stream = data.StreamData.get_timeline_for_user(twitter_id)
35 37
  38 + threshold_time = time.time() - FEED_STATUS_INTERVAL_SEC
  39 +
  40 + # It's wasteful to serve the hub the full set of items in the feed, so
  41 + # we use a variant of the feed windowing technique described at
  42 + # http://code.google.com/p/pubsubhubbub/wiki/PublisherEfficiency#Feed_windowing
  43 + # to only give it new items. We treat the If-Modified-Since header as
  44 + # an indication of the items that the hub already has, but we allow one
  45 + # hour of overlap, in case of items getting dropped, replication delay,
  46 + # cosmic rays, etc.
  47 + if self._user_agent_contains('appid: pubsubhubbub'):
  48 + if_modified_since = self._get_if_modified_since()
  49 + if if_modified_since:
  50 + logging.info('If-Modified-Since: %d' % if_modified_since)
  51 + threshold_time = if_modified_since - IF_MODIFIED_SINCE_INTERVAL_SEC
  52 + # Since we're serving a partial response, we don't want proxies
  53 + # caching it.
  54 + self.response.headers['Cache-Control'] = 'private'
  55 +
36 56 # We want the feed to have all tweets from the past day, but also at
37 57 # at least 10 items.
38 58 feed_status_ids = []
39 59 if stream:
40   - threshold_time = time.time() - FEED_STATUS_INTERVAL_SEC
41 60 for status_id, status_timestamp_sec in stream.status_pairs():
42 61 if status_timestamp_sec < threshold_time and \
43 62 len(feed_status_ids) >= MIN_FEED_ITEMS:
@@ -59,11 +78,14 @@ def _get_signed_in(self):
59 78 for status in statuses
60 79 ]
61 80
  81 + updated_date = datetime.datetime.utcnow()
  82 +
62 83 self._write_template('birdfeeder/feed.atom', {
63 84 'feed_title': '@%s Twitter Timeline' % user.screen_name,
64   - 'updated_date_iso': datetime.datetime.utcnow().isoformat(),
  85 + 'updated_date_iso': updated_date.isoformat(),
65 86 'feed_url': self.request.url,
66 87 'status_groups': status_groups,
67 88 },
68 89 content_type='application/atom+xml')
69 90
  91 + self._add_last_modified_header(updated_date)
15 app/birdfeeder/handlers/pinger.py
... ... @@ -1,6 +1,5 @@
1 1 import logging
2 2
3   -from google.appengine.api import taskqueue
4 3
5 4 import base.handlers
6 5 from birdfeeder import data
@@ -32,17 +31,11 @@ def post(self):
32 31 for following_twitter_id in following_twitter_ids:
33 32 logging.info('Queueing update for %d' % following_twitter_id)
34 33 session = data.Session.get_by_twitter_id(str(following_twitter_id))
35   - params = {
36   - 'expected_status_id': update_status_id,
37   - 'update_retry_count': 0,
38   - }
39   - params.update(dict(session.as_dict()))
40   -
41   - taskqueue.add(
42   - queue_name='birdfeeder-update',
43   - url='/tasks/bird-feeder/update',
  34 +
  35 + session.enqueue_update_task(
44 36 countdown=birdfeeder.handlers.update.PING_UPDATE_DELAY_SEC,
45   - params=params)
  37 + expected_status_id=update_status_id,
  38 + update_retry_count=0)
46 39
47 40 self.response.out.write(
48 41 'Queued %d updates' % len(following_twitter_ids))
1  app/birdfeeder/handlers/session.py
@@ -164,6 +164,7 @@ def get(self):
164 164 session = data.Session.create(
165 165 twitter_id, access_token, access_token_secret)
166 166 session.put()
  167 + session.enqueue_update_task()
167 168
168 169 self._set_request_session(session)
169 170
33 app/birdfeeder/handlers/tools.py
... ... @@ -0,0 +1,33 @@
  1 +from django.utils import simplejson
  2 +
  3 +import base.handlers
  4 +from birdfeeder import data
  5 +import birdfeeder.handlers.feed
  6 +import birdfeeder.handlers.update
  7 +
  8 +# Helper handler (for development) that updates a single user's timeline and
  9 +# refreshes their feed within a single request.
  10 +class UpdateFeedHandler(base.handlers.BaseHandler):
  11 + def get(self):
  12 + session = data.Session.get_by_twitter_id(self.request.get('twitter_id'))
  13 +
  14 + birdfeeder.handlers.update.update_timeline(session)
  15 +
  16 + # We render the feed handler inline instead of redirecting to it, so
  17 + # that a browser reload will allow this handler (which also updates)
  18 + # to be triggered
  19 + feed_handler = birdfeeder.handlers.feed.TimelineFeedHandler()
  20 + feed_handler._session = session
  21 + feed_handler._api = session.create_api()
  22 + feed_handler.initialize(self.request, self.response)
  23 + feed_handler._get_signed_in()
  24 +
  25 +class StatusHandler(base.handlers.BaseHandler):
  26 + def get(self, status_id):
  27 + statuses = data.StatusData.get_by_status_ids([status_id])
  28 + if not statuses:
  29 + self._write_not_found()
  30 + return
  31 +
  32 + status = statuses[0]
  33 + self._write_json(status.original_json_dict, pretty_print=True)
60 app/birdfeeder/handlers/update.py
... ... @@ -1,3 +1,4 @@
  1 +import itertools
1 2 import logging
2 3 import time
3 4 import urllib
@@ -13,6 +14,11 @@
13 14
14 15 RECENT_STATUS_INTERVAL_SEC = 10 * 60
15 16
  17 +# Statuses older than this interval will be removed from the ID list that is
  18 +# persisted, to avoid it growing uncontrollably. This value should be at least
  18 +# as big as FEED_STATUS_INTERVAL_SEC from feed.py
  20 +OLD_STATUS_INTERVAL_SEC = 24 * 60 * 60 # One day
  21 +
16 22 HUB_URL_BATCH_SIZE = 100
17 23
18 24 # When we get a ping, we don't start updates right away, since the timeline REST
@@ -29,10 +35,7 @@ def get(self):
29 35 update_task_count = 0
30 36 for session in data.Session.all():
31 37 update_task_count += 1
32   - taskqueue.add(
33   - queue_name='birdfeeder-update',
34   - url='/tasks/bird-feeder/update',
35   - params=session.as_dict())
  38 + session.enqueue_update_task()
36 39
37 40 self.response.out.write('Started %d updates' % update_task_count)
38 41
@@ -72,39 +75,14 @@ def post(self):
72 75 logging.info('...not found, queuing the %d-th retry' %
73 76 update_retry_count)
74 77
75   - params = {
76   - 'expected_status_id': expected_status_id,
77   - 'update_retry_count': update_retry_count,
78   - }
79   - params.update(dict(session.as_dict()))
80   -
81   - taskqueue.add(
82   - queue_name='birdfeeder-update',
83   - url='/tasks/bird-feeder/update',
  78 + session.enqueue_update_task(
84 79 countdown=update_retry_count * PING_UPDATE_DELAY_SEC,
85   - params=params)
  80 + expected_status_id=expected_status_id,
  81 + update_retry_count=update_retry_count)
86 82 except ValueError:
87 83 # Ignore missing/invalid values
88 84 return
89 85
90   -
91   -# Helper handler (for development) that updates a single user's timeline and
92   -# refreshes their feed within a single request.
93   -class UpdateFeedToolHandler(base.handlers.BaseHandler):
94   - def get(self):
95   - session = data.Session.get_by_twitter_id(self.request.get('twitter_id'))
96   -
97   - update_timeline(session)
98   -
99   - # We render the feed handler inline instead of redirecting to it, so
100   - # that a browser reload will allow this handler (which also updates)
101   - # to be triggered
102   - feed_handler = birdfeeder.handlers.feed.TimelineFeedHandler()
103   - feed_handler._session = session
104   - feed_handler._api = session.create_api()
105   - feed_handler.initialize(self.request, self.response)
106   - feed_handler._get_signed_in()
107   -
108 86 def update_timeline(session):
109 87 logging.info('Updating %s' % session.twitter_id)
110 88
@@ -149,9 +127,21 @@ def update_timeline(session):
149 127
150 128 logging.info(' %d new status IDs for this stream' % len(new_status_ids))
151 129
152   - stream.status_ids = new_status_ids + stream.status_ids
153   - stream.status_timestamps_sec = \
154   - new_status_timestamps_sec + stream.status_timestamps_sec
  130 + dropped_status_ids = 0
  131 + combined_status_ids = list(new_status_ids)
  132 + combined_status_timestamps_sec = list(new_status_timestamps_sec)
  133 + threshold_time = time.time() - OLD_STATUS_INTERVAL_SEC
  134 + for status_id, timestamp_sec in itertools.izip(stream.status_ids, stream.status_timestamps_sec):
  135 + if timestamp_sec >= threshold_time:
  136 + combined_status_ids.append(status_id)
  137 + combined_status_timestamps_sec.append(timestamp_sec)
  138 + else:
  139 + dropped_status_ids += 1
  140 +
  141 + logging.info(' Dropped %d old status IDs' % dropped_status_ids)
  142 +
  143 + stream.status_ids = combined_status_ids
  144 + stream.status_timestamps_sec = combined_status_timestamps_sec
155 145
156 146 unknown_status_ids = data.StatusData.get_unknown_status_ids(new_status_ids)
157 147
4 app/cron_tasks.py
@@ -18,6 +18,7 @@
18 18 from google.appengine.ext import webapp
19 19 from google.appengine.ext.webapp import util
20 20
  21 +import birdfeeder.handlers.tools
21 22 import birdfeeder.handlers.update
22 23 import feedplayback.handlers
23 24
@@ -28,7 +29,8 @@ def main():
28 29
29 30 ('/cron/bird-feeder/update', birdfeeder.handlers.update.UpdateCronHandler),
30 31 ('/tasks/bird-feeder/update', birdfeeder.handlers.update.UpdateTaskHandler),
31   - ('/tools/bird-feeder/update-feed', birdfeeder.handlers.update.UpdateFeedToolHandler),
  32 + ('/tools/bird-feeder/update-feed', birdfeeder.handlers.tools.UpdateFeedHandler),
  33 + ('/tools/bird-feeder/status/(\d+)', birdfeeder.handlers.tools.StatusHandler),
32 34 ],
33 35 debug=True)
34 36 util.run_wsgi_app(application)
59 app/datasources/thumbnails.py
... ... @@ -1,5 +1,14 @@
  1 +import logging
1 2 import re
  3 +import sys
  4 +import urllib
2 5 import urlparse
  6 +# parse_qsl moved to urlparse module in v2.6
  7 +try:
  8 + from urlparse import parse_qsl
  9 +except:
  10 + from cgi import parse_qsl
  11 +
3 12
4 13 LARGE_THUMBNAIL = 'large'
5 14 SMALL_THUMBNAIL = 'small'
@@ -14,6 +23,7 @@
14 23 _IMGUR_PATH_RE = re.compile('/(\\w+)(\\....).*')
15 24 _IMGUR_GALLERY_PATH_RE = re.compile('/(gallery/)(\\w+).*')
16 25 _TWITPIC_PATH_RE = re.compile('/(\\w+).*')
  26 +_LOCKERZ_PATH_RE = re.compile('/s/\\w+.*')
17 27
18 28 def _get_short_flickr_photo_id(photo_id):
19 29 result = ''
@@ -29,6 +39,11 @@ def get_thumb_url_for_short_photo_id(short_photo_id):
29 39 return 'http://flic.kr/p/img/%s_%s.jpg' % (
30 40 short_photo_id, size == SMALL_THUMBNAIL and 't' or 'm')
31 41
  42 + def get_youtube_thumb_url(video_id):
  43 + # See http://stackoverflow.com/questions/2068344
  44 + return 'http://img.youtube.com/vi/%s/%s.jpg' % (
  45 + video_id, need_small and 'default' or 'hqdefault')
  46 +
32 47 thumb_url = None
33 48 thumb_width = None
34 49 thumb_height = None
@@ -37,6 +52,8 @@ def get_thumb_url_for_short_photo_id(short_photo_id):
37 52 parsed_url = urlparse.urlparse(url)
38 53 hostname = parsed_url.netloc
39 54 path = parsed_url.path
  55 + query = dict(parse_qsl(parsed_url.query))
  56 +
40 57 if hostname == 'yfrog.com':
41 58 # See http://yfrog.com/page/api
42 59 match = _YFROG_PATH_RE.match(path)
@@ -118,5 +135,45 @@ def get_thumb_url_for_short_photo_id(short_photo_id):
118 135 if need_small:
119 136 thumb_width = 150
120 137 thumb_height = 150
  138 + elif hostname == 'lockerz.com':
  139 + # See http://support.lockerz.com/entries/350297-image-from-url
  140 + match = _LOCKERZ_PATH_RE.match(path)
  141 + if match:
  142 + thumb_url = 'http://api.plixi.com/api/tpapi.svc/imagefromurl?url=%s&size=%s' % (
  143 + urllib.quote(url),
  144 + need_small and 'small' or 'medium')
  145 + if need_small:
  146 + thumb_width = 150
  147 + thumb_height = 150
  148 + elif hostname == 'cl.ly':
  149 + # See http://developer.getcloudapp.com/view-item
  150 + thumb_url = 'http://thumbs.cl.ly%s' % path
  151 + elif hostname == 'youtube.com' or hostname == 'www.youtube.com':
  152 + if path == '/watch' and 'v' in query:
  153 + thumb_url = get_youtube_thumb_url(query['v'])
  154 + elif hostname == 'youtu.be':
  155 + thumb_url = get_youtube_thumb_url(path[1:])
  156 +
  157 + return thumb_url, thumb_width, thumb_height
  158 +
  159 +def get_iframe_info(url):
  160 + iframe_url = None
  161 + iframe_width = None
  162 + iframe_height = None
  163 +
  164 + parsed_url = urlparse.urlparse(url)
  165 + hostname = parsed_url.netloc
  166 + path = parsed_url.path
  167 + query = dict(parse_qsl(parsed_url.query))
  168 +
  169 + if hostname == 'youtube.com' or hostname == 'www.youtube.com':
  170 + if path == '/watch' and 'v' in query:
  171 + iframe_url = 'http://www.youtube.com/embed/%s' % query['v']
  172 + iframe_width = 560
  173 + iframe_height = 315
  174 + elif hostname == 'youtu.be':
  175 + iframe_url = 'http://www.youtube.com/embed/%s' % path[1:]
  176 + iframe_width = 560
  177 + iframe_height = 315
121 178
122   - return thumb_url, thumb_width, thumb_height
  179 + return iframe_url, iframe_width, iframe_height
37 app/datasources/twitterdisplay.py
@@ -4,6 +4,7 @@
4 4 import xml.sax.saxutils
5 5
6 6 from base.constants import CONSTANTS
  7 +import base.util
7 8 from datasources import thumbnails, twitter
8 9
9 10 _BASE_TWITTER_URL = 'https://twitter.com'
@@ -38,6 +39,7 @@ def permalink_no_base(self):
38 39 def title_as_text(self):
39 40 title_text = _unescape_tweet_chunk(self._status.text)
40 41 title_text = _WHITESPACE_RE.sub(' ', title_text)
  42 + title_text = base.util.strip_control_characters(title_text)
41 43 return '%s: %s' % (self._status.user.screen_name, title_text)
42 44
43 45 def created_at_formatted_gmt(self):
@@ -59,7 +61,21 @@ def add_raw_chunk(chunk):
59 61 def add_tweet_chunk(chunk):
60 62 # Unescape then and re-escape everything so that we can have a
61 63 # consistent level of escaping.
62   - add_escaped_chunk(_unescape_tweet_chunk(chunk))
  64 + chunk = _unescape_tweet_chunk(chunk)
  65 +
  66 + # We also remove control characters (which are not allowed in XML)
  67 + # now, instead of earlier, since otherwise all of the entity offsets
  68 + # would be wrong.
  69 + chunk = base.util.strip_control_characters(chunk)
  70 +
  71 + # HTML-escape
  72 + chunk = xml.sax.saxutils.escape(chunk)
  73 +
  74 + # Convert newlines to HTML (Twitter seems to normalize all line
  75 + # endings to \n).
  76 + chunk = chunk.replace('\n', '<br/>')
  77 +
  78 + add_raw_chunk(chunk)
63 79
64 80 def add_escaped_chunk(chunk):
65 81 add_raw_chunk(xml.sax.saxutils.escape(chunk))
@@ -79,7 +95,26 @@ def add_footer_thumbnail_chunk(
79 95 '</a>' %
80 96 (link_url , thumb_url, img_attributes))
81 97
  98 + def add_footer_iframe_chunk(iframe_url, iframe_width, iframe_height):
  99 + iframe_attributes = ''
  100 + if iframe_width and iframe_height:
  101 + iframe_attributes = ' width="%d" height="%d"' % (
  102 + iframe_width, iframe_height)
  103 + add_footer_raw_chunk(
  104 + '<iframe src="%s" frameborder="0"%s allowfullscreen></iframe>'
  105 + % (iframe_url, iframe_attributes))
  106 +
82 107 def maybe_add_thumbnail_chunk(url):
  108 + # If the caller is OK with large thumbnails, chances are they're
  109 + # OK with actual embedded content too.
  110 + if self._thumbnail_size == thumbnails.LARGE_THUMBNAIL:
  111 + iframe_url, iframe_width, iframe_height = \
  112 + thumbnails.get_iframe_info(url)
  113 + if iframe_url:
  114 + add_footer_iframe_chunk(
  115 + iframe_url, iframe_width, iframe_height)
  116 + return
  117 +
83 118 thumb_url, thumb_width, thumb_height = \
84 119 thumbnails.get_thumbnail_info(url, self._thumbnail_size)
85 120 if thumb_url:
2  app/templates/birdfeeder/index-signed-in.html
@@ -13,7 +13,7 @@
13 13 {% block body %}
14 14
15 15 <div id="birdfeeder-feed-container">
16   - <a href="{{ timeline_feed_url }}" class="feed-link">Timeline feed</a>
  16 + <a href="{{ timeline_feed_url }}" class="feed-link">@{{ twitter_user.screen_name }} timeline feed</a>
17 17 -
18 18 <a href="{{ timeline_reader_url }}">View in Google Reader</a>
19 19 </div>
2  birdpinger/README
@@ -4,7 +4,7 @@ To build and run
4 4 2. git clone https://github.com/hoisie/httplib.go.git
5 5 3. cd httplib.go
6 6 4. gomake install
7   -5. git clone https://github.com/hoisie/twitterstream.git
  7 +5. git clone https://github.com/mihaip/twitterstream.git
8 8 6. cd twitterstream
9 9 7. gomake install
10 10 8. cd birdpinger
74 birdpinger/main.go
@@ -7,11 +7,14 @@ import (
7 7 "io/ioutil"
8 8 "json"
9 9 "os"
  10 + "time"
10 11 "url"
11 12
12 13 "twitterstream"
13 14 )
14 15
  16 +const followingListUpdateIntervalNanosec = 1 * 60 * 60 * 1e9 // 1 hour
  17 +
15 18 var twitterUsername *string = flag.String("twitter_username", "", "Twitter account username to use to connect to the Streaming API")
16 19 var twitterPassword *string = flag.String("twitter_password", "", "Password for the Twitter account")
17 20 var streamSpigotHostname *string = flag.String("stream_spigot_hostname", "", "Host where Stream Spigot is running")
@@ -29,6 +32,55 @@ func main() {
29 32 followingUrl := fmt.Sprintf("%sfollowing?secret=%s", baseUrl, url.QueryEscape(*streamSpigotSecret))
30 33 pingUrl := baseUrl + "ping"
31 34
  35 + var followingUserIds []int64
  36 + var followingUserIdMap map[int64]bool
  37 +
  38 + stream := make(chan *twitterstream.Tweet)
  39 + updateFollowingListTick := time.Tick(followingListUpdateIntervalNanosec)
  40 +
  41 + client := twitterstream.NewClient(*twitterUsername, *twitterPassword)
  42 +
  43 + updateFollowingList := func() {
  44 + followingUserIds, followingUserIdMap = getFollowingList(followingUrl)
  45 +
  46 + fmt.Printf("Tracking updates for %d users...\n", len(followingUserIds))
  47 +
  48 + client.Close()
  49 + err := client.Follow(followingUserIds, stream)
  50 + if err != nil {
  51 + fmt.Println(err.String())
  52 + }
  53 + }
  54 +
  55 + updateFollowingList()
  56 +
  57 + for {
  58 + select {
  59 + case <-updateFollowingListTick:
  60 + updateFollowingList()
  61 + case tweet := <-stream:
  62 + // We ignore tweets that come from users that we're not following (the
  63 + // Streaming API will also notify when tweets of theirs are retweeted or
  64 + // replied to).
  65 + if _, inMap := followingUserIdMap[tweet.User.Id]; inMap {
  66 + // Similarly, we ignore tweets that are in reply to users that aren't
  67 + // being followed. This will have false negatives: if user A follows X
  68 + // and user B follows X and Z, a reply by X to Z will cause both A and
  69 + // B's streams to get pinged, even though A won't actually see that
  70 + // status. However, that should be rare.
  71 + if in_reply_to_user_id := tweet.In_reply_to_user_id; in_reply_to_user_id != 0 {
  72 + if _, inMap := followingUserIdMap[in_reply_to_user_id]; !inMap {
  73 + continue
  74 + }
  75 + }
  76 +
  77 + go pingUser(tweet.User.Id, tweet.Id, pingUrl)
  78 + }
  79 + }
  80 + }
  81 +}
  82 +
  83 +func getFollowingList(followingUrl string) (followingUserIds []int64, followingUserIdMap map[int64]bool) {
32 84 resp, getErr := http.Get(followingUrl)
33 85 if getErr != nil {
34 86 fmt.Printf("Got error %s when trying to fetch following list\n", getErr)
@@ -48,35 +100,17 @@ func main() {
48 100 }
49 101 resp.Body.Close()
50 102
51   - var followingUserIds []int64
52 103 jsonErr := json.Unmarshal(contents, &followingUserIds)
53 104 if jsonErr != nil {
54 105 fmt.Printf("Got error %s when trying to decode JSON\n", jsonErr)
55 106 os.Exit(1)
56 107 }
57 108
58   - followingUserIdMap := make(map[int64]bool)
  109 + followingUserIdMap = make(map[int64]bool)
59 110 for _, v := range followingUserIds {
60 111 followingUserIdMap[v] = true
61 112 }
62   -
63   - fmt.Printf("Tracking updates for %d users...\n", len(followingUserIds))
64   -
65   - stream := make(chan *twitterstream.Tweet)
66   - client := twitterstream.NewClient(*twitterUsername, *twitterPassword)
67   - err := client.Follow(followingUserIds, stream)
68   - if err != nil {
69   - fmt.Println(err.String())
70   - }
71   - for {
72   - tweet := <-stream
73   - // We ignore tweets that come from users that we're not following (the
74   - // Streaming API will also notify when tweets of theirs are retweeted or
75   - // replied to).
76   - if _, inMap := followingUserIdMap[tweet.User.Id]; inMap {
77   - go pingUser(tweet.User.Id, tweet.Id, pingUrl)
78   - }
79   - }
  113 + return
80 114 }
81 115
82 116 func pingUser(twitterId int64, statusId int64, pingUrl string) {

No commit comments for this range

Something went wrong with that request. Please try again.