# Notes on using python-twitter for direct capstone

First, install python-twitter: `pip install python-twitter`

If you want more detail, see the installation guide on [readthedocs](https://python-twitter.readthedocs.io/en/latest/installation.html).

Then set up a Twitter app. I followed these [instructions](https://iag.me/socialmedia/how-to-create-a-twitter-app-in-8-easy-steps/).

The site seems to have changed a bit since that post, but the steps are more or less the same.

The important thing is to obtain your consumer key/secret and access key/secret.

You can then pass them to `twitter.Api` as follows:

In [9]:
import collections
import configparser
import json
import os
import time

import gensim
import numpy as np
import pandas as pd
import twitter

In [126]:
# This is just a way to handle our private twitter tokens
#   A skeletal .tweetrc file:
#     [Tweet]
#     consumer_key: *consumer_key*
#     consumer_secret: *consumer_password*
#     access_key: *access_key*
#     access_secret: *access_password*
class TweetRc(object):
    """Lazy reader for Twitter API credentials kept in an INI-style file.

    The file must contain a ``[Tweet]`` section with the options
    ``consumer_key``, ``consumer_secret``, ``access_key`` and
    ``access_secret``. It is parsed on first access and the parsed result is
    cached on the instance.

    Args:
        config_path: Location of the credentials file; a leading ``~`` is
            expanded to the user's home directory. Defaults to ``~/.tweetrc``.
    """

    def __init__(self, config_path='~/.tweetrc'):
        self._config_path = config_path
        self._config = None

    def GetConsumerKey(self):
        """Return the ``consumer_key`` option, or None if unavailable."""
        return self._GetOption('consumer_key')

    def GetConsumerSecret(self):
        """Return the ``consumer_secret`` option, or None if unavailable."""
        return self._GetOption('consumer_secret')

    def GetAccessKey(self):
        """Return the ``access_key`` option, or None if unavailable."""
        return self._GetOption('access_key')

    def GetAccessSecret(self):
        """Return the ``access_secret`` option, or None if unavailable."""
        return self._GetOption('access_secret')

    def _GetOption(self, option):
        """Look up ``option`` in the ``[Tweet]`` section.

        Returns:
            The option value as a string, or None when the file, the section
            or the option is missing, or the file cannot be parsed.
        """
        try:
            return self._GetConfig().get('Tweet', option)
        except (configparser.Error, UnicodeDecodeError):
            # Best-effort lookup: an absent or unreadable entry means "no
            # credential". Only config-related failures are absorbed, so
            # programming errors (e.g. a missing import) still surface.
            return None

    def _GetConfig(self):
        """Parse the credentials file once and return the cached parser."""
        if self._config is None:
            parser = configparser.ConfigParser()
            # ConfigParser.read() silently skips files that do not exist.
            parser.read(os.path.expanduser(self._config_path))
            self._config = parser
        return self._config

In [127]:
# Read the private tokens from ~/.tweetrc and hand them to the API client
tw = TweetRc()
api = twitter.Api(
    consumer_key=tw.GetConsumerKey(),
    consumer_secret=tw.GetConsumerSecret(),
    access_token_key=tw.GetAccessKey(),
    access_token_secret=tw.GetAccessSecret(),
)

Potentially **very** *neat*: we can load up samples of live tweets:

In [62]:
# I'm curious what other filters we should use. Can we screen for tweets that
# also contain long/lat data?
#
# Sample the filtered live stream for roughly 10 seconds and save it to
# tweet_feed.json, one JSON document per line.
# The stream is opened exactly once: wrapping this in `while True` reopened
# (and truncated) the file and started a new stream connection every time the
# inner loop hit its `break`, so the cell never finished.
timeout = time.time() + 10  # 10 seconds from now
with open("tweet_feed.json", 'w+') as f:
    for line in api.GetStreamFilter(track=['global warming', 'climate',
                                           'sustainability', 'pollution'],
                                    languages=['en'], filter_level=['low']):
        f.write(json.dumps(line))
        # Newline-delimited records (JSON Lines)
        f.write("\n")
        # The deadline is only checked when a tweet arrives, so the capture
        # can overrun the 10 seconds while the stream is quiet.
        if time.time() > timeout:
            break

TwitterError: {'message': 'Exceeded connection limit for user'}

In [54]:
# Load the saved tweets (one JSON object per line) into a DataFrame and
# see what data is available for a given tweet
df = pd.read_json("big_tweet.json",  lines=True)
df.columns

Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'extended_tweet', 'favorite_count',
       'favorited', 'filter_level', 'geo', 'id', 'id_str',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place',
       'possibly_sensitive', 'quote_count', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'reply_count',
       'retweet_count', 'retweeted', 'retweeted_status', 'source', 'text',
       'timestamp_ms', 'truncated', 'user'],
      dtype='object')

In [105]:
# Print the text of the tweet at index 3 -- looks like some classic climate change fodder
print(df["text"][3])

RT @StopAdaniCairns: Seems #ClimateChange doesn’t concern #Farmers

Ignore #drought #floods #bushfire #cyclones 

#Agriculture production t…


In [66]:
# Load in Sarah's most recent model (75% accuracy on test set)
# NOTE(review): the relative path presumably assumes the notebook is run from its own directory -- confirm
from keras.models import load_model
model = load_model("../../core/data/climate_sentiment_m1.h5")

In [104]:
####These should be ported into core module
def read_data(data_file):
    """Lazily tokenise an iterable of text lines.

    For every element of ``data_file`` this yields the list of tokens that
    ``gensim.utils.simple_preprocess`` produces for it.
    """
    for raw_line in data_file:
        tokens = gensim.utils.simple_preprocess(raw_line)
        yield tokens

def build_dataset(vocab, n_words):
    """Encode a flat word list as integer ids over its most frequent words.

    Args:
        vocab: Sequence of words (e.g. the flattened output of read_data).
        n_words: Size of the id space, including the reserved 'UNK' slot; the
            ``n_words - 1`` most common words get their own id.

    Returns:
        A tuple ``(token, count, dictionary, reversed_dictionary)``:
        ``token`` is ``vocab`` as a list of ids (0 for unknown words),
        ``count`` is ``[['UNK', n_unknown], (word, frequency), ...]``,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    # Slot 0 is reserved for unknown words; its tally is filled in further down
    count = [['UNK', -1]]
    count += collections.Counter(vocab).most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    token = []
    unk_count = 0
    for word in vocab:
        index = dictionary.get(word)
        if index is None:
            # Not among the kept words: fall back to the 'UNK' id
            index = 0
            unk_count += 1
        token.append(index)
    count[0][1] = unk_count

    # Inverse mapping, for looking a word up by its integer id
    reversed_dictionary = {idx: word for word, idx in dictionary.items()}
    return token, count, dictionary, reversed_dictionary

###I'm being lazy and using 3 million words. need to shorten to whats been seen in our training data
def create_word_vec(path='GoogleNews-vectors-negative300.bin'):
    """Load pretrained word2vec vectors and report the vocabulary size.

    Args:
        path: Binary word2vec file to load. Defaults to Google's pretrained
            ``GoogleNews-vectors-negative300.bin`` in the working directory.

    Returns:
        The loaded ``gensim`` KeyedVectors object, which supports
        ``word in vectors`` and ``vectors[word]`` lookups.
    """
    # Get word vectors using Google's pretrained word2vec -- takes a minute
    google = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

    # The pretrained set:
    #   includes some stop words (i.e. the, also, should, but not a, and, of)
    #   includes misspellings
    #   includes commonly paired words (i.e. New_York)

    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab` (which raises AttributeError on gensim >= 4).
    vocab = getattr(google, 'key_to_index', None)
    if vocab is None:
        vocab = google.vocab
    total_vocab = len(vocab)
    print ("Set includes", total_vocab, "words")

    # Return the KeyedVectors object itself. `.wv` on a KeyedVectors was only
    # a deprecated alias for the same object (removed in gensim 4), so the
    # former `del google` released no memory.
    return google

In [80]:
# Vocabulary cap handed to build_dataset; use a number higher than the expected unique words
top_words = 20000

# Tokenise every tweet, then flatten the per-tweet token lists into one long word list
tweet_vocab = list(read_data(df['text']))
flat_tweet_vocab = []
for words in tweet_vocab:
    flat_tweet_vocab += words
token, count, dictionary, reversed_dictionary = build_dataset(flat_tweet_vocab, top_words)

# Note: len(count) also counts the 'UNK' placeholder entry that build_dataset puts first
unique_words_msg = "Number of unique words: {}".format(len(count))
print(unique_words_msg)
# Show what the gensim simple preprocessor produced for one tweet
print ('using gensim to preprocess:', tweet_vocab[3])

Number of unique words: 1996
using gensim to preprocess: ['rt', 'stopadanicairns', 'seems', 'climatechange', 'doesn', 'concern', 'farmers', 'ignore', 'drought', 'floods', 'bushfire', 'cyclones', 'agriculture', 'production']


In [88]:
# Load the pretrained word vectors. This is very slow.
X_vecs = create_word_vec()

Set includes 3000000 words




In [90]:
# Create train and test sets: every tokenised tweet becomes a
# (max_tweet_length, vector_size) matrix of word vectors, zero-padded.
X = df['text']
# NOTE: despite its name, test_split is used below as the fraction of tweets
# placed in the *train* arrays; 0 sends every tweet to X_test.
test_split = 0
train_size = int(len(X)*test_split)
test_size = len(X) - train_size
vector_size = 300  # presumably the dimensionality of the pretrained vectors -- confirm
window_size = 10  # not used in this cell
max_tweet_length=512

# Walk the tweets in their original order so that row i of X_test is tweet
# (train_size + i) of df. The earlier set(np.random.choice(...)) threw the
# random order away and left the row order to set iteration order.
indexes = np.arange(train_size + test_size)
X_train = np.zeros((train_size, max_tweet_length, vector_size))
Y_train = np.zeros((train_size, 3), dtype=np.int32)  # label placeholders; never filled in here
X_test = np.zeros((test_size, max_tweet_length, vector_size))
Y_test = np.zeros((test_size, 3), dtype=np.int32)  # label placeholders; never filled in here
for i, index in enumerate(indexes):
    # Tweets longer than max_tweet_length are truncated. The loop variable is
    # named `word` so it does not overwrite the `token` list from build_dataset.
    for t, word in enumerate(tweet_vocab[index][:max_tweet_length]):
        if word not in X_vecs:
            # Words without a pretrained vector keep their all-zero row
            continue

        if i < train_size:
            X_train[i, t, :] = X_vecs[word]
        else:
            X_test[i - train_size, t, :] = X_vecs[word]


In [93]:
# Run the loaded sentiment model over the vectorised tweets in X_test
Y = model.predict(X_test)

Here's an example of a really strong "Yes" classification from our model:

In [103]:
# Show tweet number i next to the model's output row for it
i = 3
tweet_text = df["text"][i]
class_scores = Y[i]
print("tweet: {} \n\nsentiment analysis: {}".format(tweet_text, class_scores))

tweet: RT @StopAdaniCairns: Seems #ClimateChange doesn’t concern #Farmers

Ignore #drought #floods #bushfire #cyclones 

#Agriculture production t… 

sentiment analysis: [0.01564297 0.98653907 0.005147  ]


**COOL!**

My next questions:

1. can we filter for only geo-tagged tweets? how many hits will we get a day then?
2. what other word filters should we use (up to 400)?
3. where do we want to store our data/handle calls to the api
 1. api calls every half hour? 
 2. CNN predictions once a day? 
 3. throw away all but sentiment class and location after running CNN?