In [1]:
import tweepy
import pandas as pd
from textblob import TextBlob
from datetime import datetime
import time
import ast
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import sys
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\scraping')
sys.path.insert(1, 'C:\\Users\\raide\\OneDrive\\Documents\\GitHub\\capstone_project\\constants')
from scrape_hashtags import get_hashtag_stats
from constants import get_matteo_twitter_creds, get_michael_twitter_creds

access_token, access_token_secret, consumer_key, consumer_secret = get_michael_twitter_creds()

## Get hashtag data
Ideally, will have a way to get important hashtags, evaluate their exposure/popularity, and then feed it into the scraping function.

In [2]:
# Capture food words from https://www.enchantedlearning.com/wordlist/food.shtml
df = get_hashtag_stats()
df

In [None]:
df[['unique_tweets_per_hour', 'retweets_per_hour', 'views_per_hour']] = df[['unique_tweets_per_hour', 'retweets_per_hour', 'views_per_hour']].apply(pd.to_numeric)

In [None]:
all_words = df.hashtag.tolist()
all_hashtags = ['#' + s for s in all_words]

In [None]:
exposed_hashtags = df[df.views_per_hour >= 100000]
exposed_hashtags_words = [s for s in exposed_hashtags.hashtag.tolist()]
exposed_hashtags_hashtags = ['#' + s for s in exposed_hashtags_words]
exposed_hashtags_and_words = exposed_hashtags_words + exposed_hashtags_hashtags
print(f"Out of the {df.shape[0]} hashtags in the raw data, {exposed_hashtags.shape[0]} received >= 100000 views.")

Out of the 465 hashtags in the raw data, 27 received >= 100000 views.


In [None]:
class FoodScraper(tweepy.Stream):
    
    def on_connect(self):
        self.time_limit = 3600
        self.start_time = time.time()
        self.captured_tweets = 0
        self.missed_tweets = 0
        self.df = pd.DataFrame(columns = [  'created_at',
                                            'tweet_id',
                                            'user_id',
                                            'user_name',
                                            'screen_name',
                                            'verified',
                                            'text',
                                            'quote_tweet',
                                            'rewteet_count',
                                            'favorite_count',
                                            'place',
                                            'quote_status_id',
                                            'entities'])

    def on_status(self, status):
        if time.time() - self.start_time < self.time_limit:
            try:
                self.captured_tweets += 1
                created_at = status.created_at
                tweet_id = status.id
                user_id = status.user.id
                user_name = status.user.name
                screen_name = status.user.screen_name
                verified = status.user.verified
                text = status.text
                quote_tweet = status.is_quote_status
                rewteet_count = status.retweet_count
                favorite_count = status.favorite_count
                
                try:
                    place = status.place
                    quote_status_id = status.quoted_status_id
                    coordinates = status.coordinates
                except:
                    place = None
                    quote_status_id = None
                    coordinates = None
                    
                self.tweets = [ created_at,
                                tweet_id,
                                user_id,
                                user_name,
                                screen_name,
                                verified,
                                text,
                                quote_tweet,
                                rewteet_count,
                                favorite_count,
                                place,
                                quote_status_id,
                                coordinates]
                
                self.captured_tweets += 1
                self.df.loc[len(self.df)] = self.tweets
                print(f'Tweets streamed: {self.captured_tweets}')
            except:
                to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S_')
                path = r'C:\Users\raide\OneDrive\Documents\GitHub\capstone_project\data\stream'
                filename = '\\' + to_csv_timestamp + 'exposed_food_tweets.csv'
                self.df.to_csv(path + filename, index=False)
                self.disconnect()
        else:
            to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S_')
            path = r'C:\Users\raide\OneDrive\Documents\GitHub\capstone_project\data\stream'
            filename = '\\' + to_csv_timestamp + 'exposed_food_tweets.csv'
            self.df.to_csv(path + filename, index=False)
            self.disconnect()
    
    def on_limit(self, track):
        self.missed_tweets += track

    def on_connection_error(self):
        self.disconnect()
        print(f'Stream has disconnected.\nNumber of tweets streamed: {self.captured_tweets}\nNumber of tweets missed: {self.missed_tweets}\nPercent of tweets streamed that were missed: {self.missed_tweets / self.captured_tweets * 100}')
        return 
        
    def on_closed(self, response):
        print('Response', response)
        self.disconnect()
        print('\nConnection has closed')
        return 
    
    def on_disconnect(self):
        self.disconnect()
        print(f'Stream has disconnected.\nNumber of tweets streamed: {self.captured_tweets}\nNumber of tweets missed: {self.missed_tweets}\nPercent of tweets streamed that were missed: {self.missed_tweets / self.captured_tweets * 100}')
        return 
    
    def on_exception(self, exception):
        print('An exception occurred:', exception)
        self.disconnect()
        print(f'Stream has disconnected.\nNumber of tweets streamed: {self.captured_tweets}\nNumber of tweets missed: {self.missed_tweets}\nPercent of tweets streamed that were missed: {self.missed_tweets / self.captured_tweets * 100}')
        return 
        
    def on_request_error(self, status_code):
        print('An error occurred:', status_code)
        self.disconnect()
        print(f'Stream has disconnected.\nNumber of tweets streamed: {self.captured_tweets}\nNumber of tweets missed: {self.missed_tweets}\nPercent of tweets streamed that were missed: {self.missed_tweets / self.captured_tweets * 100}')
        return 

In [None]:
# Figure out how to specify time to run in class object

food_scraper = FoodScraper(consumer_key, consumer_secret, access_token, access_token_secret, max_retries=10)        
# food_scraper.filter(track=exposed_hashtags_and_words, languages=['en'])

In [None]:
# Run for 12 hours...

for i in range(1, 13):
    try:
        df = food_scraper.filter(track=exposed_hashtags_and_words, languages=['en'])
    except:
        continue

Tweets streamed: 2
Tweets streamed: 4
Tweets streamed: 6
Tweets streamed: 8
Tweets streamed: 10
Tweets streamed: 12
Tweets streamed: 14
Tweets streamed: 16
Tweets streamed: 18
Tweets streamed: 20
Tweets streamed: 22
Tweets streamed: 24
Tweets streamed: 26
Tweets streamed: 28
Tweets streamed: 30
Tweets streamed: 32
Tweets streamed: 34
Tweets streamed: 36
Tweets streamed: 38
Tweets streamed: 40
Tweets streamed: 42
Tweets streamed: 44
Tweets streamed: 46
Tweets streamed: 48
Tweets streamed: 50
Tweets streamed: 52
Tweets streamed: 54
Tweets streamed: 56
Tweets streamed: 58
Tweets streamed: 60
Tweets streamed: 62
Tweets streamed: 64
Tweets streamed: 66
Tweets streamed: 68
Tweets streamed: 70
Tweets streamed: 72
Tweets streamed: 74
Tweets streamed: 76
Tweets streamed: 78
Tweets streamed: 80
Tweets streamed: 82
Tweets streamed: 84
Tweets streamed: 86
Tweets streamed: 88
Tweets streamed: 90
Tweets streamed: 92
Tweets streamed: 94
Tweets streamed: 96
Tweets streamed: 98
Tweets streamed: 100
Twe