Permalink
Browse files

Remove old status ID when adding new ones.

  • Loading branch information...
1 parent 38b777f commit 96bf460c3451cd061e2f03ae5cee65bc4a07edd9 @mihaip committed Jan 20, 2012
Showing with 23 additions and 5 deletions.
  1. +2 −2 app/birdfeeder/TODO
  2. +21 −3 app/birdfeeder/handlers/update.py
View
@@ -1,11 +1,9 @@
todo:
-- encoding bug in https://twitter.com/#!/smcbride/status/158634839388590080
- trigger a Reader crawlondemand when first generating a feed (or resetting the ID)
- add timestamps to birdpinger
- see why Ann's feed doesn't use PSHB in Reader
- cache user data so that feed fetches involve no twitter RPCs
- reduce update cron job frequency
-- drop status ID/timestamps older than 24 hours from StreamData
- catch exceptions when fetching timeline tweets
- reply and retweet links
- use user timezone to format timestamps (instead of GMT)
@@ -64,3 +62,5 @@ done:
- youtube.com/youtu.be
- check if @ replies missing in birdpinger is Twitter's fault or tweetstream's
- convert newlines to html
+- encoding bug in https://twitter.com/#!/smcbride/status/158634839388590080 - hub dropped the < when giving it to schedule_crawler?
+- drop status ID/timestamps older than 24 hours from StreamData
@@ -1,3 +1,4 @@
+import itertools
import logging
import time
import urllib
@@ -13,6 +14,11 @@
RECENT_STATUS_INTERVAL_SEC = 10 * 60
+# Statuses older than this interval will be removed from the ID list that is
+# persisted, to avoid it growing uncontrollably. This value should be at least
+# as big as FEED_STATUS_INTERVAL_SEC from feeds.py.
+OLD_STATUS_INTERVAL_SEC = 24 * 60 * 60 # One day
+
HUB_URL_BATCH_SIZE = 100
# When we get a ping, we don't start updates right away, since the timeline REST
@@ -121,9 +127,21 @@ def update_timeline(session):
logging.info(' %d new status IDs for this stream' % len(new_status_ids))
- stream.status_ids = new_status_ids + stream.status_ids
- stream.status_timestamps_sec = \
- new_status_timestamps_sec + stream.status_timestamps_sec
+ dropped_status_ids = 0
+ combined_status_ids = list(new_status_ids)
+ combined_status_timestamps_sec = list(new_status_timestamps_sec)
+ threshold_time = time.time() - OLD_STATUS_INTERVAL_SEC
+ for status_id, timestamp_sec in itertools.izip(stream.status_ids, stream.status_timestamps_sec):
+ if timestamp_sec >= threshold_time:
+ combined_status_ids.append(status_id)
+ combined_status_timestamps_sec.append(timestamp_sec)
+ else:
+ dropped_status_ids += 1
+
+ logging.info(' Dropped %d old status IDs' % dropped_status_ids)
+
+ stream.status_ids = combined_status_ids
+ stream.status_timestamps_sec = combined_status_timestamps_sec
unknown_status_ids = data.StatusData.get_unknown_status_ids(new_status_ids)

0 comments on commit 96bf460

Please sign in to comment.