# Stream tweets

## Import libraries

In [6]:
import tweepy
import pandas as pd
from datetime import datetime
import sys
import time
import os
from os.path import join
# Make the project's local `scraping` and `constants` folders importable.
# NOTE(review): these are absolute, machine-specific Windows paths; the imports
# below will fail on any machine where these folders do not exist.
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\scraping')
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\constants')
from scrape_hashtags import get_hashtag_stats
from constants import get_matteo_twitter_creds, get_michael_twitter_creds

# Twitter API credentials, read at module level by `stream_tweets`.
# NOTE(review): assumes the helper returns the four values in exactly this
# order (access token, access token secret, consumer key, consumer secret) — confirm.
access_token, access_token_secret, consumer_key, consumer_secret = get_michael_twitter_creds()

## Create a wrapper for tweepy.Stream

In [7]:
class FoodStreamer(tweepy.Stream):
    """A `tweepy.Stream` that records matching tweets for a limited time.

    Every received tweet is appended as one row to `self.df`. Once
    `time_limit` seconds have passed since the streamer was constructed, the
    next received tweet triggers a CSV dump into `path` and a disconnect.
    The time limit is only checked when a tweet arrives, so a stream that
    receives no tweets does not stop by itself.

    Parameters:
    -----------
    consumer_key, consumer_secret, access_token, access_token_secret (str): Twitter API credentials, forwarded to `tweepy.Stream`.
    max_retries (int): Maximum number of reconnection attempts, forwarded to `tweepy.Stream`.
    time_limit (int or float): Number of seconds the stream may run, measured from construction.
    path (str): Existing directory that the CSV and the results summary are written to.
    """

    # Column order of the captured-tweet dataframe and of the CSV written from it.
    # NOTE: 'rewteet_count' is a misspelling of 'retweet_count'. It is kept as-is
    # because it is the header written to the CSV; rename it only together with
    # whatever reads those files.
    COLUMNS = [
        'created_at',
        'tweet_id',
        'user_id',
        'user_name',
        'screen_name',
        'verified',
        'text',
        'quote_tweet',
        'rewteet_count',
        'favorite_count',
        'place',
        'quote_status_id',
        'entities',
    ]

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, max_retries, time_limit, path):
        # `max_retries` has to be forwarded: tweepy.Stream.__init__ assigns
        # self.max_retries itself, so a value set only on `self` is overwritten
        # by tweepy's default.
        super().__init__(consumer_key, consumer_secret, access_token, access_token_secret, max_retries=max_retries)
        self.time_limit = time_limit
        self.start_time = time.time()
        self.captured_tweets = 0
        self.missed_tweets = 0
        # Missed-tweet total accumulated over earlier connections (see on_limit).
        self._missed_before_connection = 0
        self.run_path = path
        self.run_begin_time = datetime.today().strftime('%Y%m%d_%H%M%S_')
        # Created here rather than in on_connect so that an automatic reconnect
        # does not throw away the tweets captured so far.
        self.df = pd.DataFrame(columns=self.COLUMNS)

    def on_connect(self):
        """Announce the connection and start a new per-connection missed-tweet count."""
        print('Stream has connected.\n')
        self._missed_before_connection = self.missed_tweets

    def on_status(self, status):
        """Append one tweet to the dataframe, or save and stop once the time limit has passed."""
        # Limit stream runtime
        if time.time() - self.start_time >= self.time_limit:
            self._save_and_disconnect()
            return

        try:
            # Create list of tweet info (same order as COLUMNS)
            self.tweets = [
                status.created_at,
                status.id,
                status.user.id,
                status.user.name,
                status.user.screen_name,
                status.user.verified,
                status.text,
                status.is_quote_status,
                status.retweet_count,
                status.favorite_count,
                # Optional fields: not every status object carries them
                getattr(status, 'place', None),
                getattr(status, 'quoted_status_id', None),
                getattr(status, 'entities', None),
            ]

            # Add tweet info to dataframe
            self.df.loc[len(self.df)] = self.tweets

            # Count tweets
            self.captured_tweets += 1
            print(f'Tweets streamed: {self.captured_tweets}')

        # If an error occurs, report it, write the data to the directory and disconnect the stream
        except Exception as exc:
            print('Could not record tweet; saving captured tweets and disconnecting:', exc)
            self._save_and_disconnect()

    def _save_and_disconnect(self):
        """Write the captured tweets to a CSV file in the run directory, then stop the stream."""
        filename = os.path.join(self.run_path, self.run_begin_time + 'exposed_food_tweets.csv')
        self.df.to_csv(filename, index=False, encoding='utf-8-sig')
        self.disconnect()

    # Exception handling
    def on_limit(self, track):
        """Record how many matching tweets were not delivered because of rate limiting.

        `track` is a running total of undelivered tweets since the current
        connection was opened, not an increment, so it replaces the count for
        this connection instead of being added on every notice.
        """
        print('Limit has been reached...waiting...')
        # max() keeps the count from going backwards if notices arrive out of order.
        self.missed_tweets = max(self.missed_tweets, self._missed_before_connection + track)

    def on_connection_error(self):
        """Report that the stream connection errored or timed out."""
        print('on_connection_error')

    def on_exception(self, exception):
        """Report an unhandled exception raised while streaming."""
        print('An exception occurred:', exception)

    def on_request_error(self, status_code):
        """Report a non-200 HTTP status code returned for the stream request."""
        print('An error occurred:', status_code)

    def on_closed(self, response):
        """Report that the stream was closed by the remote end."""
        print('Response', response)
        print('\nConnection has closed')

    def on_disconnect(self):
        """Write a summary of the run to `results.txt` in the run directory and print it."""
        summary = (
            f'Number of tweets streamed: {self.captured_tweets}\n'
            f'Number of tweets missed: {self.missed_tweets}\n'
            'Percent of tweets streamed that were missed: '
        )
        # Guard the ratio: with zero captured tweets the division would raise
        # and leave the results file empty.
        if self.captured_tweets:
            summary += f'{self.missed_tweets / self.captured_tweets * 100}'
        else:
            summary += 'n/a (no tweets were streamed)'

        results_path = os.path.join(self.run_path, self.run_begin_time + 'results.txt')
        with open(results_path, 'w', encoding='utf-8') as results_file:
            results_file.write(summary)

        if self.captured_tweets:
            print(f'Stream has disconnected.\n{summary}')
        else:
            print('No tweets were found')

## Get hashtag data
Using food words from [Enchanted Learning](https://www.enchantedlearning.com/wordlist/food.shtml), we can collect a large set of food-related words and pass them to [RiteKit's hashtag comparer](https://ritekit.com/developer/login/) to retrieve usage statistics (such as views per hour) for each corresponding hashtag.

In [8]:
def stream_query(hashtag_list, food_list, cutoff=100_000):
    """
    Calls scraping function `get_hashtag_stats` and selects the hashtags worth streaming: those with at least `cutoff` views per hour, plus every hashtag in `hashtag_list` regardless of its views.
    
    Parameters:
    -----------
    hashtag_list (list): A list of strings of hashtag words, without the leading '#' (i.e., 'python' for '#python'), to get statistics on. These are always kept in the result when present in the scraped data.
    food_list: Forwarded unchanged to `get_hashtag_stats`. NOTE(review): presumably controls whether a food word list is added to the query — confirm against `get_hashtag_stats`.
    cutoff (int): Minimum `views_per_hour` for a hashtag that is not in `hashtag_list` to be kept. Defaults to 100,000.
    
    Returns:
    --------
    exposed_hashtags_words (list): The selected words, without '#'.
    exposed_hashtags_hashtags (list): The same words, each prefixed with '#'.
    exposed_hashtags_and_words (list): The words followed by the '#'-prefixed words.
    """
    
    # Scrape for hashtags
    df = get_hashtag_stats(hashtags=hashtag_list, food_list=food_list)
    
    # Keep hashtags that reach the view cutoff OR that were explicitly requested
    is_exposed = (df.views_per_hour >= cutoff) | df.hashtag.isin(hashtag_list)
    exposed_hashtags = df[is_exposed]
    
    exposed_hashtags_words = exposed_hashtags.hashtag.tolist()
    exposed_hashtags_hashtags = ['#' + word for word in exposed_hashtags_words]
    exposed_hashtags_and_words = exposed_hashtags_words + exposed_hashtags_hashtags
    print(f"Out of the {df.shape[0]} hashtags in the raw data, {exposed_hashtags.shape[0]} were kept (>= {cutoff} views per hour or explicitly requested).")
    
    return exposed_hashtags_words, exposed_hashtags_hashtags, exposed_hashtags_and_words

## Define the scraper and run

In [9]:
def stream_tweets(hashtag_list, time_limit, loops, max_retries, cutoff=100_000):
    """
    Runs `loops` consecutive streaming sessions of up to `time_limit` seconds each, saving every session's output into one timestamped run directory under `./data`.
    
    Before each session the hashtag statistics are scraped again with `stream_query`, and the stream tracks the resulting '#'-prefixed hashtags in English-language tweets.
    
    Parameters:
    -----------
    hashtag_list (list): A list of strings of hashtag words (without '#') passed to `stream_query`.
    time_limit (int or float): Number of seconds each streaming session may run.
    loops (int): Number of streaming sessions to run one after another.
    max_retries (int): Maximum number of reconnection attempts, passed to `FoodStreamer`.
    cutoff (int): Minimum views per hour passed to `stream_query`. Defaults to 100,000.
    """
    # Create directory to save the runs to (including `data` itself if it is missing)
    run_begin_time = datetime.today().strftime('%Y%m%d_%H%M%S_')
    run_path = os.path.join(os.getcwd(), 'data', run_begin_time + 'run')
    os.makedirs(run_path, exist_ok=True)

    # Loop to run for however many repeats over time set in time_limit
    for _session in range(loops):
        _words, exposed_hashtags_hashtags, _words_and_hashtags = stream_query(
            hashtag_list=hashtag_list, food_list=False, cutoff=cutoff)

        # Build the streamer only after the scrape: its time limit is measured
        # from construction, so scraping time must not count against it.
        food_streamer = FoodStreamer(consumer_key, consumer_secret, access_token, access_token_secret,
                                     max_retries=max_retries, time_limit=time_limit, path=run_path)
        try:
            food_streamer.filter(track=exposed_hashtags_hashtags, languages=['en'])
        except Exception as exc:
            # Best effort: report the failure and move on to the next session.
            # KeyboardInterrupt is deliberately not caught so the run can be stopped.
            print('Streaming session failed:', exc)

In [10]:
# Hashtag words (no leading '#') handed to the streamer.
hashtag_list = [
    'foodie',
    'foodporn',
    'food',
    'delicious',
    'love',
    'recipes',
    'eating',
    'recipe',
    'cook',
    'cooking',
    'restaurant',
    'vegan',
    'breakfast',
    'lunch',
    'foodgasm',
    'foodies',
    'nomnomnom',
    'dinner',
]

# 14 back-to-back sessions of one hour (60 * 60 seconds) each.
stream_tweets(
    hashtag_list,
    time_limit=60 * 60,
    loops=14,
    max_retries=10 ** 6,
)

KeyboardInterrupt: 