In [5]:
import tweepy
import pandas as pd
import numpy as np

import yaml

# Read the Twitter credentials from the local YAML configuration file.
with open("configurations_twitter.yml", 'r') as stream:
    # safe_load only builds plain Python objects (dicts, lists, strings, ...),
    # so a tampered config file cannot trigger arbitrary object construction.
    # Calling yaml.load without an explicit Loader is also rejected by
    # recent PyYAML releases.
    configs = yaml.safe_load(stream)


def load_api():
    ''' Build and return a tweepy API client authorized with the credentials
        stored under the 'twitter' section of the loaded configuration. '''

    # Read all four credentials first, in the same order as they are used.
    credential_names = ('consumer_key', 'consumer_secret',
                        'access_token', 'access_secret')
    consumer_key, consumer_secret, access_token, access_secret = [
        configs['twitter'][name] for name in credential_names
    ]

    # OAuth handshake: application (consumer) pair, then user (access) pair.
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_secret)

    return tweepy.API(handler)

In [7]:
def ifExists(obj):
    ''' Return 'obj' unchanged, or NaN when 'obj' is None.

        Used to turn missing tweet fields into NaN so they show up as
        missing values once the records are loaded into a DataFrame.

        Note: the argument is evaluated by the caller, so an attribute lookup
        that fails (e.g. tweet.foo) raises before this function is entered;
        the former 'except AttributeError' branch here could never run and
        has been removed. '''
    return np.nan if obj is None else obj

In [8]:
# Create the authorized tweepy client that the search cells below use.
api = load_api()

In [4]:
# Collection settings.

# Runtime limit of the collection loop, in hours.
time_limit = 1.5

# Number of tweets requested per search (will be iterated over);
# the maximum is 100.
max_tweets = 100

# NOTE(review): these two bounds are not referenced by any visible cell --
# presumably an age window (in days) for the tweets to collect; confirm.
min_days_old = 1
max_days_old = 2

In [5]:
def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
    ''' Collect up to 'max_tweets' search results, paging backwards by id.

        Parameters:
            api        -- client object exposing a tweepy-style 'search' method
            query      -- search string, sent as 'q'
            max_tweets -- stop once this many tweets have been collected
            max_id     -- integer id bound; each request is sent with
                          max_id - 1, and the bound is moved to the id of the
                          last tweet of every page that is received
            since_id   -- sent unchanged (as a string) as 'since_id'
            geocode    -- accepted but currently NOT forwarded to the search
                          call (the geo filter is disabled)

        Returns a tuple (searched_tweets, max_id): the list of collected
        status objects and the id of the last tweet received (or the
        'max_id' argument unchanged if nothing was received).

        If the search raises tweepy.TweepError, the function prints a notice,
        sleeps for 15 minutes and then stops, returning what it has so far. '''

    # Imported here so the function works no matter which notebook cells
    # have been executed before it ('time' is not imported anywhere else).
    import datetime as dt
    import time

    searched_tweets = []
    while len(searched_tweets) < max_tweets:
        remaining_tweets = max_tweets - len(searched_tweets)
        try:
            new_tweets = api.search(q=query, count=remaining_tweets,
                                    since_id=str(since_id),
                                    max_id=str(max_id-1))
            print('found',len(new_tweets),'tweets')
            if not new_tweets:
                print('no tweets found')
                break
            searched_tweets.extend(new_tweets)
            # the next page must end just before the last tweet of this page
            max_id = new_tweets[-1].id
        except tweepy.TweepError:
            print('exception raised, waiting 15 minutes')
            print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
            time.sleep(15*60)
            break # stop the loop
    return searched_tweets, max_id

In [6]:
import datetime as dt

def get_tweet_id(api, date='', days_ago=9, query='#metoo'):
    ''' Return the id of a tweet that can serve as a 'starting point'
        from which to search.

        Parameters:
            api      -- client object exposing a tweepy-style 'search' method
            date     -- optional date/datetime; when given, the search is
                        limited with 'until' set to the day after 'date'
            days_ago -- used only when 'date' is empty: 'until' is set to
                        the date this many days before now (default 9, the
                        maximum we are able to search back in time)
            query    -- search string; required by the search call and set
                        to a commonly used term by default

        Returns the id of the first tweet in the search result and prints
        that tweet's creation time.

        Raises IndexError if the search returns no tweets. '''

    if date:
        # search up to the day after the given date
        td = date + dt.timedelta(days=1)
        count = 1
    else:
        # search up to __ days ago; ask for up to 10 tweets
        td = dt.datetime.now() - dt.timedelta(days=days_ago)
        count = 10

    tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
    tweet = api.search(q=query, count=count, until=tweet_date)

    if not tweet:
        raise IndexError('no tweets found for query {0!r} until {1}'
                         .format(query, tweet_date))

    # Both branches report and return the first tweet; previously the
    # 'date' branch fell through and returned None.
    print('search limit (start/stop):',tweet[0].created_at)
    return tweet[0].id


In [143]:
# Look up a reference tweet id using a search limited to two days ago
# (days_ago=2) and keep it in min_id.
min_id = get_tweet_id(api,days_ago=2)

('search limit (start/stop):', datetime.datetime(2017, 10, 15, 23, 59, 59))


In [166]:
# Timestamp exactly one week before the moment this cell runs.
# NOTE(review): not referenced by any visible cell -- confirm it is still needed.
one_week = dt.timedelta(weeks=1)
hopeful_date = dt.datetime.now() - one_week

datetime.datetime(2017, 10, 11, 21, 54, 36, 304400)

In [169]:
# -p: do not fail when the output directory already exists, so the
# notebook can be re-run from a fresh kernel.
!mkdir -p resources_twitter

In [180]:
# Collection loop: page backwards through '#metoo' search results until the
# time limit expires, keeping one record per tweet id in `df` and writing a
# cumulative CSV checkpoint after every page.

def _tweet_to_record(tweet):
    ''' Flatten one search-result status object into a plain dict of the
        fields stored per tweet. Missing (None) values become NaN. '''
    # a missing or empty hashtag list yields '[]'
    hashtags = tweet.entities.get('hashtags') or []
    return {
        'timestamp': ifExists(tweet.created_at),
        'tweet': ifExists(tweet.text),
        'hastags': str([hashtag['text'] for hashtag in hashtags]),
        'language_iso': ifExists(tweet.metadata['iso_language_code']),
        # no str() around these two: str(None) is 'None', which would hide
        # missing values from ifExists
        'user_location': ifExists(tweet.user.location),
        'user_description': ifExists(tweet.user.description),
        'geo_existance': ifExists(tweet.geo),
        'retweet_count': ifExists(tweet.retweet_count),
        'favorited_count': ifExists(tweet.favorite_count),
        'user_followers': ifExists(tweet.user.followers_count),
        'user_id': ifExists(tweet.user.id),
    }


df = {}                      # tweet id -> record dict, accumulated over all pages

start = dt.datetime.now()
end = start + dt.timedelta(hours=time_limit)
count = 0
max_id = None                # upper id bound of the next page; None = newest tweets

while dt.datetime.now() < end:
    count += 1
    print('count =',count)

    # NOTE(review): the original passed `tweet_id` as since_id, a name that is
    # not defined anywhere in this notebook; `min_id` (the reference id looked
    # up earlier) is presumably what was meant -- confirm.
    search_kwargs = {'q': '#metoo', 'since_id': min_id, 'count': 100}
    if max_id is not None:
        search_kwargs['max_id'] = max_id

    try:
        x = api.search(**search_kwargs)
    except tweepy.TweepError as err:
        # e.g. rate limit reached: stop, keeping everything collected so far
        print('search failed, stopping:', err)
        break

    if not x:
        print('no more tweets found')
        break

    # Move the window just below the oldest tweet of this page, so the next
    # request returns older tweets instead of the same page again.
    max_id = min(tweet.id for tweet in x) - 1

    for tweet in x:
        if tweet.id in df:
            continue
        try:
            df[tweet.id] = _tweet_to_record(tweet)
        except (AttributeError, KeyError, TypeError) as err:
            # skip malformed tweets entirely rather than storing a partial record
            print('skipping tweet', tweet.id, ':', err)

    # checkpoint: everything collected so far, named after the current id bound
    dataframe_save = pd.DataFrame(df).T.reset_index().rename(columns = {'index':'id'})
    dataframe_save.to_csv('resources_twitter/files_until_{0}.csv'.format(max_id), index=False,encoding='utf-8')

('count =', 1)
('count =', 2)
('count =', 3)
('count =', 4)
('count =', 5)
('count =', 6)
('count =', 7)
('count =', 8)
('count =', 9)
('count =', 10)
('count =', 11)
('count =', 12)
('count =', 13)
('count =', 14)
('count =', 15)
('count =', 16)
('count =', 17)
('count =', 18)
('count =', 19)
('count =', 20)
('count =', 21)
('count =', 22)
('count =', 23)
('count =', 24)
('count =', 25)
('count =', 26)
('count =', 27)
('count =', 28)
('count =', 29)
('count =', 30)
('count =', 31)
('count =', 32)
('count =', 33)
('count =', 34)
('count =', 35)
('count =', 36)
('count =', 37)
('count =', 38)
('count =', 39)
('count =', 40)
('count =', 41)
('count =', 42)
('count =', 43)
('count =', 44)
('count =', 45)
('count =', 46)
('count =', 47)
('count =', 48)
('count =', 49)
('count =', 50)
('count =', 51)
('count =', 52)
('count =', 53)
('count =', 54)
('count =', 55)
('count =', 56)
('count =', 57)
('count =', 58)
('count =', 59)
('count =', 60)
('count =', 61)
('count =', 62)
('count =', 63)
(

KeyboardInterrupt: 

In [181]:
# Turn the collected records (tweet id -> field dict) into a table with one
# row per tweet and the tweet id exposed as an 'id' column.
records_by_row = pd.DataFrame(df).T
dataframe = records_by_row.reset_index().rename(columns={'index': 'id'})

In [188]:
# Persist the full table of collected tweets (no index column, UTF-8).
dataframe.to_csv(
    "resources_twitter/megafile_tweets_2017_10_18.csv",
    index=False,
    encoding='utf-8'
)

In [8]:
# Reload the saved tweets from disk.
dataframe = pd.read_csv(
    'resources_twitter/megafile_tweets_2017_10_18.csv'
)

In [9]:
# Show every collected tweet written by one specific user id.
dataframe.loc[dataframe['user_id'] == 434441713]

Unnamed: 0,id,favorited_count,geo_existance,hastags,language_iso,retweet_count,timestamp,tweet,user_description,user_followers,user_id,user_location
7786,920709353753251841,0.0,,"[u'JCCC', u'MeToo']",en,0.0,2017-10-18 17:53:10,Did you know that #JCCC protects professors th...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10303,920712526521884672,1.0,,[],en,0.0,2017-10-18 18:05:46,@JCCCStudentLife @JCCCtweet Why do you protect...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10336,920712571325505536,0.0,,[],en,0.0,2017-10-18 18:05:57,@JCCCtweet @museumatPF Why do you protect prof...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10359,920712603835535372,0.0,,[],en,0.0,2017-10-18 18:06:05,@JCCCtweet @JCCCBookstore Why do you protect p...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10378,920712638874750985,0.0,,[],en,0.0,2017-10-18 18:06:13,@JCCCtweet @emporiastate Why do you protect pr...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10413,920712680113045504,0.0,,[u'MeToo'],en,0.0,2017-10-18 18:06:23,@JCCCtweet Why do you protect professors that ...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10440,920712712056983553,0.0,,[],en,0.0,2017-10-18 18:06:31,@JCSID @JCCCtweet Why do you protect professor...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10476,920712746630635521,0.0,,[u'MeToo'],en,0.0,2017-10-18 18:06:39,@JCCCtweet Why do you protect professors that ...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10503,920712785637597186,0.0,,[],en,0.0,2017-10-18 18:06:48,@JCCCtweet @CoLabJCCC Why do you protect profe...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
10530,920712817661050881,0.0,,[u'MeToo'],en,0.0,2017-10-18 18:06:56,@JCCCtweet Why do you protect professors that ...,"Animal Lover & Activist, Gamer, Traveler, #Veg...",121.0,434441713.0,Kansas City
