# Scraper for Starbucks

## Scraping

In [10]:
# Import necessary packages
import traceback
import snscrape.modules.twitter as sntwitter
import pandas as pd
from datetime import datetime, timedelta
from multiprocessing import Pool
from functools import partial

from helpers import scrape

In [11]:
# Scraping window: from 1 September 2022 up to the moment this cell is run
start_date = datetime(year=2022, month=9, day=1)
today = datetime.now()

In [12]:
# Build the list of days to scrape, one "YYYY-MM-DD" string per day,
# starting at start_date and stopping once the current moment is reached
dates = []
date = start_date
while date < today:
    dates.append(date.date().isoformat())
    date = date + timedelta(days=1)

In [13]:
# Scrape each day in a pool of 6 worker processes.
# Every task is an index i into `dates`; the query for that task spans
# dates[i] .. dates[i+1], hence the last index is left out.
scrape_day = partial(scrape, dates=dates)
day_indices = range(len(dates) - 1)
with Pool(processes=6) as pool:
    pool.map(scrape_day, day_indices)

Running query: 'starbucks since:2022-09-10 until:2022-09-11'
Running query: 'starbucks since:2022-09-16 until:2022-09-17'
Running query: 'starbucks since:2022-09-01 until:2022-09-02'
Running query: 'starbucks since:2022-09-07 until:2022-09-08'
Running query: 'starbucks since:2022-09-13 until:2022-09-14'
Running query: 'starbucks since:2022-09-04 until:2022-09-05'
Tweets scrapped: 9684


Running query: 'starbucks since:2022-09-05 until:2022-09-06'
Tweets scrapped: 10398


Running query: 'starbucks since:2022-09-08 until:2022-09-09'
Tweets scrapped: 11819


Running query: 'starbucks since:2022-09-17 until:2022-09-18'
Tweets scrapped: 12848


Running query: 'starbucks since:2022-09-11 until:2022-09-12'
Tweets scrapped: 15247


Running query: 'starbucks since:2022-09-02 until:2022-09-03'
Tweets scrapped: 17095


Running query: 'starbucks since:2022-09-14 until:2022-09-15'
Tweets scrapped: 10066


Running query: 'starbucks since:2022-09-06 until:2022-09-07'
Tweets scrapped: 10445


Running 

## Combining Scraped Data

In [14]:
import pandas as pd
import os
import glob


In [16]:
# Collect every per-day CSV file in the data folder.
# glob.glob() returns its matches in arbitrary, filesystem-dependent order, so
# the paths are sorted: the scraper names each file after its date
# (YYYY-MM-DD.csv), which makes the combined data chronological and the row
# order reproducible from one run to the next.
path = './../data/starbucks/'
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

In [None]:
# Read each per-day CSV into its own DataFrame, then stack them row-wise
# into a single frame with a fresh 0..n-1 index
files = [pd.read_csv(csv_path, engine='python') for csv_path in csv_files]

final = pd.concat(files, axis=0, ignore_index=True)
    

In [22]:
# Persist the combined tweets to an HDF5 store under the 'starbucks' key
final.to_hdf(path_or_buf='./../data/starbucks/data.h5', key='starbucks')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->Index(['Unnamed: 0', 'tweet', 'conversation_id', 'date', 'hashtags',
       'inReplyToTweetId', 'reply_to', 'language', 'likes_count', 'media',
       'mentions', 'quoted_tweet', 'retweets_count', 'link',
       'user_status_count', 'location', 'name', 'description', 'verified',
       'url', 'user_id', 'username'],
      dtype='object')]

  final.to_hdf('./../data/starbucks/data.h5', key='starbucks')


In [23]:
# Sanity check: (rows, columns) of the combined DataFrame
final.shape

(769439, 28)

## Supporting functions

```python
import traceback
import pandas as pd
from datetime import datetime, timedelta
import snscrape.modules.twitter as sntwitter


def scrape(x, dates):
    """Scrape one day of tweets matching "starbucks" and save them as a CSV.

    The search window is ``since:dates[x] until:dates[x+1]``, so ``x`` must
    refer to an entry that has a successor in ``dates``.

    Args:
        x: Index into ``dates`` of the day to scrape.
        dates: Sequence of date strings that are interpolated into the search
            query and into the output file name.

    Returns:
        None. The tweets are written to ``./../data/starbucks/<dates[x]>.csv``.

    Ordinary errors raised while building the query, scraping or saving are
    printed with their traceback instead of being propagated, so a failing day
    does not abort the remaining days when this function is mapped over many
    indices. KeyboardInterrupt and SystemExit are deliberately not swallowed,
    so a running scrape can still be cancelled.
    """
    try:
        query = f"starbucks since:{dates[x]} until:{dates[x+1]}"
        print(f"Running query: '{query}'")
        # One flat dict per tweet; get_schema picks the fields to keep
        tweets_list = [
            get_schema(tweet)
            for tweet in sntwitter.TwitterSearchScraper(query).get_items()
        ]
        print(f"Tweets scrapped: {len(tweets_list)}")
        # NOTE: to_csv also writes the DataFrame index by default, which shows
        # up as an extra unnamed first column when the file is read back.
        pd.DataFrame(tweets_list).to_csv(f'./../data/starbucks/{dates[x]}.csv')
        print("\n")
    except Exception:
        # A bare `except:` would also trap KeyboardInterrupt/SystemExit and
        # make the scrape impossible to stop; only real errors are logged.
        traceback.print_exc()


def get_schema(tweet):
    """Flatten a scraped tweet object into a plain dict.

    Args:
        tweet: Object exposing the tweet attributes read below, including a
            ``user`` attribute that holds the author's details.

    Returns:
        dict: Tweet-level fields first, followed by the author's fields, in a
        fixed key order.
    """
    author = tweet.user

    # Fields describing the tweet itself
    tweet_fields = {
        'tweet': tweet.content,
        'conversation_id': tweet.conversationId,
        'date': tweet.date,
        'hashtags': tweet.hashtags,
        'id': tweet.id,
        'inReplyToTweetId': tweet.inReplyToTweetId,
        'reply_to': tweet.inReplyToUser,
        'language': tweet.lang,
        'likes_count': tweet.likeCount,
        'media': tweet.media,
        'mentions': tweet.mentionedUsers,
        'quote_count': tweet.quoteCount,
        'quoted_tweet': tweet.quotedTweet,
        'replies_count': tweet.replyCount,
        'retweets_count': tweet.retweetCount,
        'link': tweet.url,
    }

    # Fields describing the account that posted it
    author_fields = {
        'followers_count': author.followersCount,
        'following_count': author.friendsCount,
        'favourites_count': author.favouritesCount,
        'user_status_count': author.statusesCount,
        'location': author.location,
        'name': author.displayname,
        'description': author.description,
        'verified': author.verified,
        'url': author.linkUrl,
        'user_id': author.id,
        'username': author.username,
    }

    # Merging keeps insertion order: tweet fields, then author fields
    return {**tweet_fields, **author_fields}
```