## Extract Twitter Handles (@-strings) from training data
We want to use them to select tweets.

Prepare a test file for measuring the accuracy of the Sentiment Analyzer (in the Scala code). The output is a file of JSON objects, one per line.

In [59]:
import sqlite3, codecs
from json import dumps

# Open the SQLite database under ../data/airline-twitter-sentiment; the export
# query further down this cell reads its `tweets` table through this connection.
conn = sqlite3.connect('../data/airline-twitter-sentiment/database.sqlite')

# Turn positive -> +, negative -> -, neutral -> =.
def to_symbol(s):
    """Map a sentiment label to a one-character polarity symbol.

    The label is matched case-insensitively by prefix: a label starting with
    'pos' gives '+', one starting with 'neg' gives '-', and anything else
    gives '='.
    """
    label = s.lower()
    if label.startswith('pos'):
        return '+'
    if label.startswith('neg'):
        return '-'
    return '='

# Sentiment label and raw text of every row in the `tweets` table.
sql = 'select airline_sentiment, text from tweets'

# Lazily serialise each row as a JSON object with a one-character polarity
# symbol and the tweet text; nothing is read from the database until the
# generator is consumed by the join below.
js_docs = (dumps({'polarity': to_symbol(polarity), 'text': text})
           for (polarity, text) in conn.execute(sql))

# Plain 'w' is enough (the file is only written, never read back), and the
# handle is named `out_file` so it does not shadow the stdlib `os` module in
# the notebook's global namespace.
with codecs.open('../data/airline-twitter-sentiment/tweets.json', mode='w', encoding='utf-8') as out_file:
    # One JSON document per line, newline-separated (no trailing newline).
    out_file.write(u'\n'.join(js_docs))

In [66]:
!head ../data/airline-twitter-sentiment/tweets.json

{"polarity": "=", "text": "@JetBlue's new CEO seeks the right balance to please passengers and Wall ... - Greenfield Daily Reporter http://t.co/LM3opxkxch"}
{"polarity": "-", "text": "@JetBlue is REALLY getting on my nerves !! \ud83d\ude21\ud83d\ude21 #nothappy"}
{"polarity": "-", "text": "@united yes. We waited in line for almost an hour to do so. Some passengers just left not wanting to wait past 1am."}
{"polarity": "-", "text": "@united the we got into the gate at IAH on time and have given our seats and closed the flight. If you know people is arriving, have to wait"}
{"polarity": "-", "text": "@SouthwestAir its cool that my bags take a bit longer, dont give me baggage blue balls-turn the carousel on, tell me it's coming, then not."}
{"polarity": "-", "text": "@united and don't hope for me having a nicer flight some other time, try to do things right. You sold me those tickets with that connetion"}
{"polarity": "-", "text": "@united I like delays less than you because I'm the

In [None]:
import re
from itertools import chain
from operator import concat
from string import split

def flatmap(f, sequence):
    """Apply `f` to every item of `sequence` and concatenate the results.

    `f` must return an iterable for each item. The results are chained
    together in order and returned as a single list.

    Unlike folding with `reduce(concat, ...)`, this returns an empty list for
    an empty `sequence` instead of raising TypeError, and runs in time linear
    in the total number of produced elements.
    """
    return list(chain.from_iterable(f(item) for item in sequence))


# NOTE(review): `messages` was not defined in any visible cell, so this cell
# failed with NameError on a fresh kernel. The tweet texts of the training
# database are assumed to be the intended input -- confirm.
messages = [text for (text,) in
            conn.execute('select text from tweets where text is not null')]

# A handle is an '@' followed by one or more word characters.
handle_pat = re.compile(r'@\w+')

# Distinct handles mentioned anywhere in the training messages.
handles = set(flatmap(handle_pat.findall, messages))

In [None]:
handles

In [34]:
import re
from itertools import ifilter

def lines(it):
    """Return an iterator over the right-stripped, non-empty items of `it`.

    Each item has its trailing whitespace removed with `rstrip()`; items that
    are empty after stripping are dropped. Leading whitespace is preserved.

    Written as a generator expression rather than `itertools.ifilter`, which
    only exists on Python 2, so the helper behaves the same on either major
    version.
    """
    stripped = (item.rstrip() for item in it)
    return (line for line in stripped if line)

# Parse ../etc/twitter.conf into a dict of `key = value` settings (the keys
# read by the authentication cell are the twitter4j.oauth.* entries).
#
# The file is closed deterministically, '#' comment lines and lines without
# an '=' are skipped instead of breaking the two-way unpack, and only the
# first '=' separates key from value, so values may themselves contain '='.
conf = {}
with open('../etc/twitter.conf') as conf_file:
    for entry in lines(conf_file):
        entry = entry.strip()
        if entry.startswith('#') or '=' not in entry:
            continue
        key, value = re.split(r'\s*=\s*', entry, 1)
        conf[key] = value

In [35]:
import tweepy

# Application (consumer) credentials, read from the parsed configuration.
consumer_key = conf['twitter4j.oauth.consumerKey']
consumer_secret = conf['twitter4j.oauth.consumerSecret']
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# User access credentials for the same handler.
access_token = conf['twitter4j.oauth.accessToken']
access_token_secret = conf['twitter4j.oauth.accessTokenSecret']
auth.set_access_token(access_token, access_token_secret)

# API instance built on the handler configured above.
api = tweepy.API(auth)

In [39]:
import string 
# Load the handles to track from a file with two comma-separated fields per
# line; the second field is taken as the handle and lower-cased.
# NOTE: this rebinds `handles`, which an earlier cell assigns to the set of
# handles extracted from the training data.
# The file is closed once read, and surrounding whitespace (e.g. a space after
# the comma) is stripped so it does not end up in the tracked term.
with open('../data/airline-twitter-sentiment/airline-handles') as handles_file:
    handles = [handle.strip().lower()
               for _, handle in (row.split(',') for row in lines(handles_file))]

In [43]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints every status it receives.

    Subclasses tweepy.StreamListener and overrides `on_status` and `on_error`.
    """
    def on_status(self, status):
        """Print the `text` and `entities` attributes of `status`."""
        print (status.text, status.entities)

    def on_error(self, status_code):
        """Return False for status code 420; any other code returns None implicitly."""
        if status_code == 420:
            # Returning False from on_error is meant to disconnect the stream.
            # NOTE(review): 420 is presumably Twitter's rate-limit response --
            # confirm against the streaming API / installed tweepy version.
            return False

In [45]:
# Create a listener instance and attach it to a stream that reuses the
# credentials held by `api`.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(
    auth=api.auth,
    listener=myStreamListener,
)

In [46]:
# Start filtering the stream: track the terms in `handles`, English tweets only.
# async=True presumably runs the stream in the background so the cell returns
# immediately -- confirm against the installed tweepy version.
# NOTE(review): `async` is a reserved word from Python 3.7 onward, so this line
# only parses on older interpreters; newer tweepy releases are believed to name
# the parameter `is_async` -- confirm before porting.
myStream.filter(track=handles, languages=['en'], async=True)

In [55]:
# Stop the stream that the earlier filter(...) cell started.
myStream.disconnect()

In [56]:
myStream.running

False

In [None]:
# NOTE(review): MyStreamListener.on_status reads status.text, so passing None
# raises AttributeError; this looks like a leftover debugging call.
myStream.listener.on_status(None)

In [48]:
# NOTE(review): opens ../var/tweets for reading but never reads or closes the
# handle, and no visible cell writes that file -- confirm whether this cell is
# still needed.
open('../var/tweets')

(u'RT @big_yummy: Hey .@AmericanAir thanks to the gross incompetence of you ODR staff, I just missed my flight!  Thanks a lot! #nexttimeSWA .@\u2026', {u'user_mentions': [{u'id': 382126710, u'indices': [3, 13], u'id_str': u'382126710', u'screen_name': u'big_yummy', u'name': u'Lincoln Lobley'}, {u'id': 22536055, u'indices': [20, 32], u'id_str': u'22536055', u'screen_name': u'AmericanAir', u'name': u'American Airlines'}, {u'id': 7212562, u'indices': [139, 140], u'id_str': u'7212562', u'screen_name': u'SouthwestAir', u'name': u'Southwest Airlines'}], u'symbols': [], u'hashtags': [{u'indices': [124, 136], u'text': u'nexttimeSWA'}], u'urls': []})
(u'@AmericanAir Any reason why customer relations would just...stop responding?  Because this whole process has been a disaster.', {u'user_mentions': [{u'id': 22536055, u'indices': [0, 12], u'id_str': u'22536055', u'screen_name': u'AmericanAir', u'name': u'American Airlines'}], u'symbols': [], u'hashtags': [], u'urls': []})
