Test Twitter Pull
==========

We need the MTA's (`@NYCTSubway`) tweets to see where we're getting delays, and when they resolve.

In [2]:
%matplotlib inline

import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tweepy

In [3]:
with open('credentials.json', 'r') as f:
    creds = json.load(f)

auth = tweepy.OAuthHandler(creds['consumer_key'],
                           creds['consumer_secret'])
auth.set_access_token(creds['access_token_key'],
                      creds['access_token_secret'])
api = tweepy.API(auth)

In [17]:
mta_tweets = api.user_timeline(screen_name='NYCTSubway', count=200)
mta_tweets[0].text, mta_tweets[0].id, mta_tweets[0].created_at

('#ServiceAlert: Main St bound 7 trains running with delays, due to signal problems at Hunters Point. Allow additional travel time.',
 870061323232702464,
 datetime.datetime(2017, 5, 31, 23, 35, 59))

In [19]:
test = mta_tweets[0]
test._json['created_at']

'Wed May 31 23:35:59 +0000 2017'

In [21]:
from datetime import datetime

datestr = test._json['created_at']
dt = datetime.strptime(datestr, '%a %b %d %H:%M:%S %z %Y')
dt

datetime.datetime(2017, 5, 31, 23, 35, 59, tzinfo=datetime.timezone.utc)

Pull a bunch of tweets
---------------------------

Adapted from https://gist.github.com/yanofsky/5436496

In [28]:
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name='NYCTSubway', count=200)
tweets = new_tweets.copy()
oldest = tweets[-1].id - 1

while len(new_tweets) > 0:
    print(f'getting tweets before {oldest}')

    #all subsiquent requests use the max_id param to prevent duplicates
    new_tweets = api.user_timeline(screen_name='NYCTSubway', count=200, max_id=oldest)
    tweets.extend(new_tweets)
    oldest = tweets[-1].id - 1

#transform the tweepy tweets into a 2D array that will populate the csv
outtweets = [[t.id_str, t.created_at, t.text] for t in tweets
             if t.text.startswith('#ServiceAlert')]

import csv

#write the csv
with open('MTA_tweets.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'created_at', 'text'])
    writer.writerows(outtweets)

getting tweets before 869898260248494079
getting tweets before 869571143837126656
getting tweets before 869090461587210239
getting tweets before 868600831574237184
getting tweets before 868226378218582015
getting tweets before 867952590180409344
getting tweets before 867732443608035327
getting tweets before 867462984335073280
getting tweets before 867182900269248511
getting tweets before 866809474912268289
getting tweets before 866569185593315328
getting tweets before 865958061311238145
getting tweets before 865675763944570880
getting tweets before 865413297780137983
getting tweets before 865290134375325696
getting tweets before 864959304423657471
getting tweets before 864947194452901887
getting tweets before 864936284380430335


In [29]:
!wc MTA_tweets.csv

   898  18467 144419 MTA_tweets.csv


In [30]:
outtweets[0], outtweets[-1]

(['870068006403100672',
  datetime.datetime(2017, 6, 1, 0, 2, 32),
  '#ServiceAlert: n/b, 2 &amp; 5 trains are running with delays due to signal problems at Freeman St. Allow additional travel time.'],
 ['864936284380430336',
  datetime.datetime(2017, 5, 17, 20, 10, 54),
  '#ServiceAlert:s/b A and D trains are running local 125 St to 59 St, due to a sick passenger at 125 St. Allow additional travel time.'])

Pull Trains from Tweet
-------------------------

In [41]:
def filter_trains(text):
    replacements = ['&AMP;', '.', ',', ':']
    new_text = text.upper()
    for r in replacements:
        new_text = new_text.replace(r, '')
    
    # pull single characters
    new_text = new_text.split()
    trains = list(filter(lambda x: len(x) == 1, new_text))
    return trains

filter_trains(outtweets[0][2])

['2', '5']

Determine most delay tweets
---------------------------------

To guide this MVP, we can see which lines had the most delay notices, and work on just those lines.

In [43]:
from collections import Counter

trains = [filter_trains(t[2]) for t in outtweets]
flatten = lambda l: [item for sublist in l for item in sublist]
trains_flat = flatten(trains)

train_freq = Counter(trains_flat)
train_freq.most_common(10)

[('A', 253),
 ('F', 210),
 ('4', 153),
 ('E', 141),
 ('R', 107),
 ('W', 102),
 ('2', 97),
 ('5', 94),
 ('N', 91),
 ('6', 79)]