### Import packages / setup

In [1]:
import datetime
import tweepy
from tweepy import OAuthHandler
import json
import pandas as pd
import csv
import re
import string
import os
import time
import random
import numpy as np
from nltk.corpus import stopwords
sw = stopwords.words('english')
from string import punctuation
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter

# I've put my API keys in a .py file called API_keys.py
from my_api_keys import api_key, api_key_secret, access_token, access_token_secret

In [2]:
# Authenticate the Tweepy API
auth = tweepy.OAuthHandler(api_key,api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

In [3]:
# Modifications to punctuation and sw lists

punctuation = set(punctuation)
punctuation.add("’")

sw2 = set(sw)
addl = ("|","-","/","•","&", "&amp;")
sw2.update(addl)

### Function for scraping tweets

#### The function scrapes 15k tweets per day and stores in a CSV file

In [4]:
def scrapetweets(the_api, search_words, numtweets, numruns):
    
    # Define a for-loop to generate tweets at regular intervals
    # We cannot make large API call in one go. Hence, let's try numruns times

    # Define a pandas dataframe to store the date:
    db_tweets = pd.DataFrame(columns = ['user_id','screen_name','description','location','friends_count',
           'followers_count','totaltweets','date_created', 'tweet_id', 'retweetcount','full_text'])
    
    program_start = time.time()
    for i in range(0, numruns):
        # We will time how long it takes to scrape tweets for each run:
        start_run = time.time()
        
        # Collect tweets using the Cursor object
        # .Cursor() returns an object that you can iterate or loop over to access the data collected.
        # Each item in the iterator has various attributes that you can access to get information about each tweet
        tweets = tweepy.Cursor(the_api.search_tweets, 
                               q=search_words, 
                               lang="en", 
                               tweet_mode='extended'
                              ).items(numtweets)
        
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        
        # Begin scraping the tweets individually:
        noTweets = 0
    
        for tweet in tweet_list:
            userid = tweet.user.id
            username = tweet.user.screen_name
            description = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            date_created = tweet.created_at
            tweet_id = tweet.id
            retweetcount = tweet.retweet_count
            full_text = tweet.full_text

            # Add the 11 variables to the empty list - ith_tweet:
            ith_tweet = [userid, username, description, location, following, followers, totaltweets,
                         date_created, tweet_id, retweetcount, full_text]

            # Append to dataframe - db_tweets
            db_tweets.loc[len(db_tweets)] = ith_tweet

            # increase counter - noTweets  
            noTweets += 1
                       
            
            
        # Run ended:
        end_run = time.time()
        duration_run = round((end_run-start_run)/60, 2)

        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('time taken for run {} to complete is {} mins'.format(i+1, duration_run))

        time.sleep(920) #15 minute sleep time between runs

    # Once all runs have completed, save them to a single csv file:
    
    # Obtain timestamp in a readable format
    to_csv_timestamp = datetime.date.today().strftime('%Y%m%d_%H%M%S')
    
    # Define working path and filename
    path = os.getcwd()
    filename = path + '/data/' + to_csv_timestamp + '_trailrunningtweets.csv'
    
    # Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, index = False)
    
    program_end = time.time()
    print("\n")
    print(f'Scraping for {startdate} to {enddate} has completed!')
    print('Total time taken to scrape is {} minutes.'.format(round(program_end - program_start)/60, 2))

In [5]:

startdate = "2022-01-24"
enddate = "2022-01-30"

search_words = f'#trailrunning OR #running OR #run OR #trail OR #trailrunner OR #trailrun since:{startdate} until:{enddate} -filter:retweets'
numtweets=2500
numruns=6

scrapetweets(api, search_words, numtweets, numruns)

no. of tweets scraped for run 1 is 2500
time taken for run 1 to complete is 1.68 mins
no. of tweets scraped for run 2 is 2500
time taken for run 2 to complete is 1.66 mins
no. of tweets scraped for run 3 is 2500
time taken for run 3 to complete is 1.82 mins
no. of tweets scraped for run 4 is 2500
time taken for run 4 to complete is 1.95 mins
no. of tweets scraped for run 5 is 2500
time taken for run 5 to complete is 3.28 mins
no. of tweets scraped for run 6 is 2500
time taken for run 6 to complete is 2.71 mins


Scraping for 2022-01-24 to 2022-01-30 has completed!
Total time taken to scrape is 105.15 minutes.


### Issues to address

### Need to ideally also figure out how to scrape not just by the above keywords that search through the tweets text, but also search for "runner" in user descriptions. 

### Like this dude who only tweets about climate change, but identifies as a fell runner in his description. 
### https://twitter.com/endhunting/with_replies

#### 1) Set up code to do this pull weekly
#### 2) Set up automatic check to see if user is already in database. Ignore if so. 
#### 3) Store data in the cloud somewhere instead of locally
#### 4) Connect to marketing pipeline somehow