# Stream tweets

## Import libraries

In [2]:
import tweepy
import pandas as pd
from datetime import datetime
import sys
import time
import os
from os.path import join
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\scraping')
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\constants')
from scrape_hashtags import get_hashtag_stats
from constants import get_matteo_twitter_creds, get_michael_twitter_creds

# Unpack the four Twitter API credentials returned by the project helper.
# NOTE(review): the unpacking order must match the order the helper returns them in - confirm.
(
    access_token,
    access_token_secret,
    consumer_key,
    consumer_secret,
) = get_michael_twitter_creds()

## Get hashtag data
Using the food word list from [Enchanted Learning](https://www.enchantedlearning.com/wordlist/food.shtml), we collect a large set of food-related words and pass them to [RiteKit's hashtag comparer](https://ritekit.com/developer/login/) to retrieve each hashtag's hourly statistics: unique tweets, retweets, and views per hour.

In [2]:
# Fetch hashtag statistics for the food words listed at
# https://www.enchantedlearning.com/wordlist/food.shtml
hourly_rate_columns = ['unique_tweets_per_hour', 'retweets_per_hour', 'views_per_hour']
df = get_hashtag_stats()
# Convert the hourly rate columns to numeric dtypes so they can be compared against thresholds
df[hourly_rate_columns] = df[hourly_rate_columns].apply(pd.to_numeric)
df

Unnamed: 0,hashtag,unique_tweets_per_hour,retweets_per_hour,views_per_hour
0,apple,129,96,351000
1,avocado,4,0,9800
2,bake,12,0,20458
3,banana,12,5,2542
4,barbecue,4,0,7462
...,...,...,...,...
460,wok,0,4,0
461,yeast,4,0,333
462,yogurt,4,0,4346
463,yolk,0,4,0


## Prepare data for streaming

In [4]:
# Build two parallel lists: the bare food words and the same words prefixed with '#'
all_words = df['hashtag'].tolist()
all_hashtags = ['#' + word for word in all_words]

In [5]:
# Keep only the hashtags viewed at least MIN_VIEWS_PER_HOUR times in the past hour.
# The threshold is defined once so the filter and the summary message cannot drift apart.
MIN_VIEWS_PER_HOUR = 100000
exposed_hashtags = df[df.views_per_hour >= MIN_VIEWS_PER_HOUR]
exposed_hashtags_words = exposed_hashtags.hashtag.tolist()
exposed_hashtags_hashtags = ['#' + s for s in exposed_hashtags_words]
# Combine the bare words and their '#'-prefixed forms into a single list
exposed_hashtags_and_words = exposed_hashtags_words + exposed_hashtags_hashtags
print(f"Out of the {df.shape[0]} hashtags in the raw data, {exposed_hashtags.shape[0]} received >= {MIN_VIEWS_PER_HOUR} views.")

Out of the 465 hashtags in the raw data, 20 received >= 100000 views.


## Create a wrapper for tweepy.Stream

In [48]:
class FoodScraper(tweepy.Stream):
    """Time-limited ``tweepy.Stream`` that collects incoming tweets into a DataFrame.

    Each received status is flattened into one row of ``self.df``. When the time
    limit is exceeded, or when a status cannot be processed, the collected rows
    are written to a timestamped CSV file and the stream is disconnected.

    Parameters
    ----------
    consumer_key, consumer_secret, access_token, access_token_secret : str
        Twitter API credentials, forwarded to ``tweepy.Stream``.
    max_retries : int
        Maximum number of reconnection attempts, forwarded to ``tweepy.Stream``.
    time_limit : float
        Number of seconds, measured from construction of this object, after
        which the next received status triggers a save and a disconnect.
    output_dir : str, optional
        Directory the CSV files are written to. Defaults to
        ``DEFAULT_OUTPUT_DIR``.

    Attributes
    ----------
    df : pandas.DataFrame
        Rows collected so far (created when the connection is established).
    captured_tweets : int
        Number of statuses successfully stored in ``df``.
    missed_tweets : int
        Number of undelivered tweets reported through limit notices.
    """

    # Default location of the CSV output (same directory the notebook always used)
    DEFAULT_OUTPUT_DIR = r'C:\Users\raide\OneDrive\Documents\GitHub\capstone_project\data\stream\enchanted_food_list'
    CSV_SUFFIX = 'exposed_food_tweets.csv'

    # Column order must match the row built in _extract_row.
    # NOTE: 'rewteet_count' is a historical misspelling that is kept on purpose so
    # that previously written CSV files and any code reading them stay compatible.
    COLUMNS = ['created_at',
               'tweet_id',
               'user_id',
               'user_name',
               'screen_name',
               'verified',
               'text',
               'quote_tweet',
               'rewteet_count',
               'favorite_count',
               'place',
               'quote_status_id',
               'entities']

    # Create ability to specify a time limit in seconds for the scrape to run
    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, max_retries, time_limit, output_dir=None):
        self.time_limit = time_limit
        self.start_time = time.time()
        self.captured_tweets = 0
        self.missed_tweets = 0
        self.output_dir = self.DEFAULT_OUTPUT_DIR if output_dir is None else output_dir
        # True once the current contents of self.df have been written to disk
        self._saved = False
        # Value of missed_tweets when the current connection was opened
        self._missed_at_connect = 0
        # max_retries has to be forwarded: the base class assigns self.max_retries
        # itself, so setting the attribute here beforehand would be overwritten.
        super().__init__(consumer_key, consumer_secret, access_token, access_token_secret,
                         max_retries=max_retries)

    def on_connect(self):
        """Prepare the result frame when a connection is established.

        A fresh frame is created on the first connection and after the previous
        frame has been saved. On a reconnect in the middle of a run the rows
        collected so far are kept instead of being discarded.
        """
        if getattr(self, 'df', None) is None or getattr(self, '_saved', False):
            self.df = pd.DataFrame(columns=self.COLUMNS)
            self._saved = False
        # Limit notices count from the moment the connection was opened
        self._missed_at_connect = self.missed_tweets

    def on_status(self, status):
        """Store one received status, or save and stop once the time limit has passed."""
        # Limit stream runtime
        if time.time() - self.start_time >= self.time_limit:
            self._save_and_disconnect()
            return

        try:
            # Create list of tweet info and add it to the dataframe
            self.tweets = self._extract_row(status)
            self.df.loc[len(self.df)] = self.tweets

            # Count the tweet only once it has actually been stored
            self.captured_tweets += 1
            print(f'Tweets streamed: {self.captured_tweets}')

        # If an error occurs, write the data to the directory and disconnect the stream
        except Exception as error:
            print('An exception occurred:', error)
            self._save_and_disconnect()

    @staticmethod
    def _extract_row(status):
        """Flatten a status into a list of values ordered like ``COLUMNS``."""
        # Optional fields are looked up independently so that one missing
        # attribute does not blank out the others.
        entities = getattr(status, 'entities', None)
        if isinstance(entities, dict):
            hashtags = entities.get('hashtags')
        else:
            hashtags = getattr(entities, 'hashtags', None)

        return [status.created_at,
                status.id,
                status.user.id,
                status.user.name,
                status.user.screen_name,
                status.user.verified,
                status.text,
                status.is_quote_status,
                status.retweet_count,
                status.favorite_count,
                getattr(status, 'place', None),
                getattr(status, 'quoted_status_id', None),
                hashtags]

    def _save_and_disconnect(self):
        """Write the collected rows to a timestamped CSV file, then disconnect."""
        try:
            frame = getattr(self, 'df', None)
            if frame is not None:
                to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S_')
                os.makedirs(self.output_dir, exist_ok=True)
                filename = to_csv_timestamp + self.CSV_SUFFIX
                frame.to_csv(os.path.join(self.output_dir, filename), index=False)
                self._saved = True
        finally:
            # Always stop the stream, even when writing the file failed
            self.disconnect()

    def _print_summary(self):
        """Print the captured/missed statistics, or a notice when nothing was captured."""
        if self.captured_tweets:
            print(f'Stream has disconnected.\nNumber of tweets streamed: {self.captured_tweets}\nNumber of tweets missed: {self.missed_tweets}\nPercent of tweets streamed that were missed: {self.missed_tweets / self.captured_tweets * 100}')
        else:
            print('No tweets were found')

    # Exception handling
    def on_limit(self, track):
        """Record a limit notice.

        ``track`` is the running total of undelivered tweets since the connection
        was opened, so it replaces the per-connection count instead of being
        added to it. ``max`` guards against notices arriving out of order.
        """
        baseline = getattr(self, '_missed_at_connect', 0)
        self.missed_tweets = max(self.missed_tweets, baseline + track)

    def on_connection_error(self):
        """Stop the stream after a connection error and print the summary."""
        self.disconnect()
        self._print_summary()

    def on_closed(self, response):
        """Report that the server closed the connection and stop the stream."""
        print('Response', response)
        self.disconnect()
        print('\nConnection has closed')

    def on_disconnect(self):
        """Print the summary once the stream has disconnected."""
        self.disconnect()
        self._print_summary()

    def on_exception(self, exception):
        """Report an unhandled exception and print the summary."""
        print('An exception occurred:', exception)
        self._print_summary()

    def on_request_error(self, status_code):
        """Report a non-200 HTTP status code and print the summary."""
        print('An error occurred:', status_code)
        self._print_summary()

## Define the scraper and run for a specified length of time.

In [49]:
# Build the scraper: up to 10 reconnection attempts, one-hour (3600 s) time limit
food_scraper = FoodScraper(
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
    max_retries=10,
    time_limit=3600,
)

In [None]:
# Loop to run for however long you'd like

# Run for a day
for i in range(0, 24):
    # food_scraper = FoodScraper(consumer_key, consumer_secret, access_token, access_token_secret, max_retries=10, time_limit=3600)