In [1]:
import twitter
consumer_key = "Your consumer key"
consumer_secret = "Your consumer secret"
access_token = "Your access token"
access_token_secret = "Your access token secret"

# token, token_key, con_secret, con_secret_key
authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)
t = twitter.Twitter(auth=authorization)

In [2]:
import os
import json
data_folder = os.path.join(os.path.expanduser("~"), "Data", "research")
output_filename = os.path.join(data_folder, "python_tweets_07-10-2015.json")
tweets_output_filename = os.path.join(data_folder, "tweets_07-10-2015.json")

# Consider creating a new notebook to gather control group tweets
control_group_tweets_filename = os.path.join(data_folder, "control_tweets_07-10-2015.json")

In [3]:
original_users = []
tweets = []
user_ids = {}

search_results = t.search.tweets(q="python-filter:retweets", lang="en", count=100)['statuses']
for tweet in search_results:
    if 'text' in tweet:
        # record screen name, tweet's text and mapping of tweet to user
        original_users.append(tweet['user']['screen_name'])
        user_ids[tweet['user']['screen_name']] = tweet['user']['id']
        tweets.append(tweet['text'])

In [4]:
import os
model_filename = os.path.join(os.path.expanduser("~"), "Models", "research", "python_context.pkl")

In [5]:
from sklearn.base import TransformerMixin
from nltk.tokenize import word_tokenize 

class NLTKBOW(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    """ 
    returns a list of dictionaries, where first dict is list of words in first tweet, and so on
    
    Key: a word
    Value: True or false depending if words was discovered
    """
    def transform(self, X):
        return [{word: True for word in word_tokenize(document)} for document in X]

In [6]:
from sklearn.externals import joblib

""" Load our model from Chapter 6"""
context_classifier = joblib.load(model_filename)

In [7]:
# prediction - are our tweets relevant to the Python programming language?
y_pred = context_classifier.predict(tweets)

In [8]:
relevant_tweets = [tweets[i] for i in range(len(tweets)) if y_pred[i] == 1]
relevant_users = [original_users[i] for i in range(len(tweets)) if y_pred[i] == 1]

In [9]:
print("Relevant users: " + str(len(relevant_users)))
relevant_users

Relevant users: 96


['cmyeaton',
 'badlogicgames',
 'Lesism',
 'wcmckeedotcom',
 'DavidBurnsworth',
 'IamIanHitchings',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'almtorta18',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'dreamintentions',
 'BrerTaylor',
 'radd_it',
 'SamSykesSwears',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'victor254news',
 'danielskirk',
 'keithwms',
 'ka11away',
 'shaybaycupcake',
 'BeatlesTube',
 'programmingncr',
 'echosplanet',
 

In [10]:
# todo: from each unique user, retrieve 120 unique (non-RT?) tweets

import numpy as np
unique_users = np.unique(relevant_users)
print("Relevant users: " + str(len(unique_users)))
unique_users

Relevant users: 36


array(['9_A_6', 'AndrewKoldenTV', 'BE8kUGJQ4uhyIVq', 'BeatlesTube',
       'BeyHiveInFrance', 'BrerTaylor', 'CutesyOriginals',
       'DavidBurnsworth', 'IamIanHitchings', 'Lesism', 'OfficialUKNews',
       'Python_Agent', 'SamSykesSwears', 'almtorta18', 'badlogicgames',
       'cmyeaton', 'danielskirk', 'dreamintentions', 'echosplanet',
       'erconger', 'eronim_encabo', 'jordanjphillip1', 'ka11away',
       'keithwms', 'kstrauser', 'programmingncr', 'pypi_updates',
       'python_spameggs', 'radd_it', 'shaybaycupcake', 'simbata3',
       'szescstopni', 'victor254news', 'wcmckeedotcom', 'wd_topics_us',
       'whatta_nerd'], 
      dtype='<U15')

In [11]:
# gets 120 of user's most recent tweets
import time
def get_tweets(user, count):
    try:
        results = [tweet['text'] for tweet in t.statuses.user_timeline(screen_name=user, count=count) if tweet['text']]
    except TypeError as e:
        if results is None:
            print("You probably reached your API limit, waiting for 5 minutes") 
            sys.stdout.flush()
            time.sleep(5*60)
        else:
            raise e
    except twitter.TwitterHTTPError as e:
        pass
    finally:
        print("User: " + user + " has been fetched.\n")
        time.sleep(60) 
    return results

In [12]:
tweets = {}
# only consider unique users
for screen_name in unique_users: 
    tweets[screen_name] = get_tweets(screen_name, 120)

User: 9_A_6 has been fetched.

User: AndrewKoldenTV has been fetched.

User: BE8kUGJQ4uhyIVq has been fetched.

User: BeatlesTube has been fetched.

User: BeyHiveInFrance has been fetched.

User: BrerTaylor has been fetched.

User: CutesyOriginals has been fetched.

User: DavidBurnsworth has been fetched.

User: IamIanHitchings has been fetched.

User: Lesism has been fetched.

User: OfficialUKNews has been fetched.

User: Python_Agent has been fetched.

User: SamSykesSwears has been fetched.

User: almtorta18 has been fetched.

User: badlogicgames has been fetched.

User: cmyeaton has been fetched.

User: danielskirk has been fetched.

User: dreamintentions has been fetched.

User: echosplanet has been fetched.

User: erconger has been fetched.

User: eronim_encabo has been fetched.

User: jordanjphillip1 has been fetched.

User: ka11away has been fetched.

User: keithwms has been fetched.

User: kstrauser has been fetched.

User: programmingncr has been fetched.

User: pypi_updates h

In [13]:
# save dict to file
import json
with open(tweets_output_filename, 'w') as fp:
    json.dump(tweets, fp)

# Consider pruning false positives from twitter disambiguation? (users)
len(tweets)

36