In [67]:
# sample time
import random
import datetime
import time

# scraping
import subprocess
import json

# 
import pandas as pd
import csv

According to Kim et al. (2018), it is necessary to include around 38% of Twitter content in order to represent the entire period.

In [7]:
def sample_weeks(start_date, end_date):
    '''
    Sample about 38% of the weeks of start date to end date
    Input:
        Two dates in datetime.date format
    Output:
        A list of sampled dates that are ending cutoff points of the sampled weeks
    '''

    delta = datetime.timedelta(days=7) # 7-day time period

    current_date = start_date

    weeks = []

    while current_date <= end_date:
        period_start = current_date
        period_end = current_date + delta - datetime.timedelta(days=1)
        # the last end date can't exceed the sampling period
        if period_end > end_date:
            period_end = end_date
        weeks.append((period_start, period_end))
        current_date += delta

    sample_size = int(len(weeks) * 0.38)
    sampled_weeks = random.sample(weeks, sample_size)
    
    return sampled_weeks

The entire study period was categorized into the following three time periods: 
- pre-pandemic (March 1, 2019 - December 31, 2019)
- early-pandemic (March 1, 2020 - December 31, 2020)
- late-pandemic (March 1, 2022 - December 31, 2022)

In [8]:
# define the start/end dates of the three periods
# pre-pandemic period
p1_start = datetime.date(2019, 3, 1)
p1_end = datetime.date(2019, 12, 31)

# early-pandemic
p2_start = datetime.date(2020, 3, 1)
p2_end = datetime.date(2020, 12, 31)

# late-pandemic
p3_start = datetime.date(2022, 3, 1)
p3_end = datetime.date(2022, 12, 31)

In [9]:
# random sample the weeks for each period based on the pre-defined start and end date
p1_sample = sample_weeks(p1_start, p1_end)
p2_sample = sample_weeks(p2_start, p2_end)
p3_sample = sample_weeks(p3_start, p3_end)

In [10]:
# Define the hashtag and timeframe
hashtag = "fitspo"
language = 'en'
place = 'usa'

In [11]:
p1_sample[0]

(datetime.date(2019, 3, 22), datetime.date(2019, 3, 28))

In [12]:
# install snscrape
! pip install snscrape



In [65]:
for week in p1_sample:
    print(week)

(datetime.date(2019, 3, 22), datetime.date(2019, 3, 28))
(datetime.date(2019, 6, 21), datetime.date(2019, 6, 27))
(datetime.date(2019, 11, 1), datetime.date(2019, 11, 7))
(datetime.date(2019, 11, 29), datetime.date(2019, 12, 5))
(datetime.date(2019, 3, 15), datetime.date(2019, 3, 21))
(datetime.date(2019, 7, 5), datetime.date(2019, 7, 11))
(datetime.date(2019, 6, 14), datetime.date(2019, 6, 20))
(datetime.date(2019, 3, 29), datetime.date(2019, 4, 4))
(datetime.date(2019, 3, 1), datetime.date(2019, 3, 7))
(datetime.date(2019, 4, 5), datetime.date(2019, 4, 11))
(datetime.date(2019, 8, 9), datetime.date(2019, 8, 15))
(datetime.date(2019, 4, 26), datetime.date(2019, 5, 2))
(datetime.date(2019, 4, 12), datetime.date(2019, 4, 18))
(datetime.date(2019, 11, 8), datetime.date(2019, 11, 14))
(datetime.date(2019, 5, 24), datetime.date(2019, 5, 30))
(datetime.date(2019, 8, 16), datetime.date(2019, 8, 22))


### trial run

In [63]:
# Define the hashtag and timeframe
hashtag = "fitspo"
language = 'en'
place = 'us'

In [50]:
def scrape_tweets(sampled_weeks):
    '''
    Input:
        sampled_weeks: a list of tuples that contain the start and end date of the week
                        e.g.: (datetime.date(2019, 3, 22), datetime.date(2019, 3, 28))
    '''
    
    for week in sampled_weeks:
        start_date = str(week[0])
        end_date = str(week[1])

        # Define the snscrape command
        command = f'snscrape --jsonl twitter-hashtag "{hashtag} lang:{language} since:{start_date} until:{end_date}"'

        # Run the snscrape command and capture the output
        output = subprocess.check_output(command, shell=True)

        # Decode the JSON output
        tweets = [json.loads(line) for line in output.splitlines()]

In [81]:
start_date = "2019-03-22"
end_date = "2019-03-28"

# Define the snscrape command
# exclude retweets
command = f'snscrape --jsonl twitter-hashtag "{hashtag} lang:{language} since:{start_date} until:{end_date} -filter:retweets"'

# Run the snscrape command and capture the output
output = subprocess.check_output(command, shell=True)

# Decode the JSON output
tweets = [json.loads(line) for line in output.splitlines()]

In [None]:
# Header for the CSV file
header = ['Username', 'Date', 'URL', 'Content', 'Tweet ID', 'Mentioned Users', 'Longitude', 'Latitude',
          'Country Code', 'Full Name', 'User ID', 'Followers Count', 'User Description', 'User URL']

In [112]:
csv_rows = [[tweets[i]['username'], tweets[i]['date'], tweets[i]['url'], tweets[i]['rawContent'], 
             tweets[i]['id'], tweets[i]['mentionedUsers'], 
             tweets[i]['coordinates']['longitude'] if tweets[i].get('coordinates') else None,
             tweets[i]['coordinates']['latitude'] if tweets[i].get('coordinates') else None,
             tweets[i]['place']['countryCode'] if tweets[i].get('place') else None,
             tweets[i]['place']['fullName'] if tweets[i].get('place') else None,
             tweets[i]['user']['id'], tweets[i]['user']['followersCount'], 
             tweets[i]['user']['description'], tweets[i]['user']['url']]
            for i in range(5)]

In [120]:
with open('tweets.csv', 'w', newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(header)  # Write the header to the file
    csvwriter.writerows(csv_rows)

In [95]:
len(tweets)

1409

In [86]:
type(tweets)

list

In [87]:
type(tweets[0])

dict

In [52]:
for t in tweets:
    print(t['date'])

2019-03-27T23:58:24+00:00
2019-03-27T23:56:17+00:00
2019-03-27T23:55:24+00:00
2019-03-27T23:55:05+00:00
2019-03-27T23:54:28+00:00
2019-03-27T23:53:18+00:00
2019-03-27T23:52:07+00:00
2019-03-27T23:48:05+00:00
2019-03-27T23:40:08+00:00
2019-03-27T23:35:06+00:00
2019-03-27T23:24:39+00:00
2019-03-27T23:19:21+00:00
2019-03-27T23:12:35+00:00
2019-03-27T23:00:41+00:00
2019-03-27T23:00:10+00:00
2019-03-27T22:48:13+00:00
2019-03-27T22:46:11+00:00
2019-03-27T22:38:03+00:00
2019-03-27T22:38:01+00:00
2019-03-27T22:20:08+00:00
2019-03-27T22:18:55+00:00
2019-03-27T22:18:12+00:00
2019-03-27T22:13:48+00:00
2019-03-27T22:13:44+00:00
2019-03-27T22:09:33+00:00
2019-03-27T22:06:08+00:00
2019-03-27T21:47:24+00:00
2019-03-27T21:39:58+00:00
2019-03-27T21:39:47+00:00
2019-03-27T21:02:17+00:00
2019-03-27T21:01:40+00:00
2019-03-27T21:01:03+00:00
2019-03-27T21:00:09+00:00
2019-03-27T20:55:27+00:00
2019-03-27T20:49:36+00:00
2019-03-27T20:46:05+00:00
2019-03-27T20:39:54+00:00
2019-03-27T20:38:30+00:00
2019-03-27T2

In [79]:
for k, i in tweets[0].items():
    print(f"{k}, {i}")

_type, snscrape.modules.twitter.Tweet
url, https://twitter.com/AlphaEliteTrain/status/1111054897397551104
date, 2019-03-27T23:58:24+00:00
rawContent, Step Up &amp; Step Out!! No Limitation!!! Feed your fears, &amp; your faith will starve. Feed your faith &amp; your fears will.  #motivation #fitness #fit #fitnessaddict #fitspo #iworkout… https://t.co/9I4GZ0v3O0
renderedContent, Step Up &amp; Step Out!! No Limitation!!! Feed your fears, &amp; your faith will starve. Feed your faith &amp; your fears will.  #motivation #fitness #fit #fitnessaddict #fitspo #iworkout… instagram.com/p/Bk-y9-FnhMc/…
id, 1111054897397551104
user, {'_type': 'snscrape.modules.twitter.User', 'username': 'AlphaEliteTrain', 'id': 3280675146, 'displayname': '🐐 Alpha Elite Training LLC', 'rawDescription': '▫️Sports Performance Athlete Training ▫️Semi Personal Private Trainer Gym ▫️All Purpose Fitness Training ▫️I AM THOUGH™️ Pers Page: @T_Hyde21', 'renderedDescription': '▫️Sports Performance Athlete Training ▫️Semi Pe

In [75]:
tweets[0]['place']

{'_type': 'snscrape.modules.twitter.Place',
 'id': '01e74c422dab6107',
 'fullName': 'Cinco Ranch, TX',
 'name': 'Cinco Ranch',
 'type': 'city',
 'country': 'United States',
 'countryCode': 'US'}

In [76]:
tweets[0]['place']['countryCode']

'US'

In [77]:
tweets[0]['place']['fullName']

'Cinco Ranch, TX'

In [78]:
tweets[0]['username']

'AlphaEliteTrain'

In [66]:
tweets[0]['date']

'2019-03-27T23:58:24+00:00'

In [54]:
tweets[0]['url']

'https://twitter.com/AlphaEliteTrain/status/1111054897397551104'

In [55]:
tweets[0]['rawContent']

'Step Up &amp; Step Out!! No Limitation!!! Feed your fears, &amp; your faith will starve. Feed your faith &amp; your fears will.  #motivation #fitness #fit #fitnessaddict #fitspo #iworkout… https://t.co/9I4GZ0v3O0'

In [82]:
tweets[0]['content']

'Step Up &amp; Step Out!! No Limitation!!! Feed your fears, &amp; your faith will starve. Feed your faith &amp; your fears will.  #motivation #fitness #fit #fitnessaddict #fitspo #iworkout… https://t.co/9I4GZ0v3O0'

In [83]:
tweets[0]['mentionedUsers']

In [88]:
tweets[0]['coordinates']

{'_type': 'snscrape.modules.twitter.Coordinates',
 'longitude': -95.7890472,
 'latitude': 29.68964}

In [91]:
tweets[0]['coordinates']['longitude']


-95.7890472

In [92]:
tweets[0]['coordinates']['latitude']

29.68964

In [56]:
tweets[0]['date']

'2019-03-27T23:58:24+00:00'

In [57]:
tweets[0]['id']

1111054897397551104

In [58]:
tweets[0]['user']

{'_type': 'snscrape.modules.twitter.User',
 'username': 'AlphaEliteTrain',
 'id': 3280675146,
 'displayname': '🐐 Alpha Elite Training LLC',
 'rawDescription': '▫️Sports Performance Athlete Training ▫️Semi Personal Private Trainer Gym ▫️All Purpose Fitness Training ▫️I AM THOUGH™️ Pers Page: @T_Hyde21',
 'renderedDescription': '▫️Sports Performance Athlete Training ▫️Semi Personal Private Trainer Gym ▫️All Purpose Fitness Training ▫️I AM THOUGH™️ Pers Page: @T_Hyde21',
 'descriptionLinks': None,
 'verified': False,
 'created': '2015-07-15T15:00:50+00:00',
 'followersCount': 590,
 'friendsCount': 584,
 'statusesCount': 48669,
 'favouritesCount': 21043,
 'listedCount': 12,
 'mediaCount': 3387,
 'location': '6610 359 Rd FulshearTX 77441',
 'protected': False,
 'link': {'_type': 'snscrape.modules.twitter.TextLink',
  'text': 'alphaelitetraining.net',
  'url': 'http://alphaelitetraining.net',
  'tcourl': 'https://t.co/LEDoc7E2Ns',
  'indices': [0, 23]},
 'profileImageUrl': 'https://pbs.twimg

In [70]:
tweets[0]['user']['url']

'https://twitter.com/AlphaEliteTrain'

In [59]:
tweets[0]['user']['id']

3280675146

In [60]:
tweets[0]['user']['followersCount']

590

In [18]:
tweets[0]['user']['description']

'Great selection of Health & Beauty Supplies at affordable prices! Free shipping to 185 countries. 45 days money back guarantee. Friendly customer service.'