In [1]:
# sample time
import random
import datetime
import time

# scraping
import subprocess
import json

# data collection
import pandas as pd
import csv
import os

According to Kim et al. (2018), it is necessary to include around 38% of Twitter content in order to represent the entire period.

In [2]:
def sample_weeks(start_date, end_date):
    '''
    Sample about 38% of the weeks of start date to end date
    Input:
        Two dates in datetime.date format
    Output:
        A list of sampled dates that are ending cutoff points of the sampled weeks
    '''

    delta = datetime.timedelta(days=7) # 7-day time period

    current_date = start_date

    weeks = []

    while current_date <= end_date:
        period_start = current_date
        period_end = current_date + delta - datetime.timedelta(days=1)
        # the last end date can't exceed the sampling period
        if period_end > end_date:
            period_end = end_date
        weeks.append((period_start, period_end))
        current_date += delta

    sample_size = int(len(weeks) * 0.38)
    sampled_weeks = random.sample(weeks, sample_size)
    
    return sampled_weeks

The entire study period was categorized into the following three time periods: 
- pre-pandemic (March 1, 2019 - December 31, 2019)
- early-pandemic (March 1, 2020 - December 31, 2020)
- late-pandemic (March 1, 2022 - December 31, 2022)

In [3]:
# define the start/end dates of the three periods
# pre-pandemic period
p1_start = datetime.date(2019, 3, 1)
p1_end = datetime.date(2019, 12, 31)

# early-pandemic
p2_start = datetime.date(2020, 3, 1)
p2_end = datetime.date(2020, 12, 31)

# late-pandemic
p3_start = datetime.date(2022, 3, 1)
p3_end = datetime.date(2022, 12, 31)

In [4]:
# random sample the weeks for each period based on the pre-defined start and end date
p1_sample = sample_weeks(p1_start, p1_end)
p2_sample = sample_weeks(p2_start, p2_end)
p3_sample = sample_weeks(p3_start, p3_end)

In [30]:
# Each period has 16 weeks sampled
len(p1_sample) == len(p2_sample) == len(p3_sample) == 16

True

In [6]:
# Define the hashtag and timeframe
hashtag = "fitspo"
language = 'en'

In [None]:
# If not already installed
#! pip install snscrape

In [14]:
def scrape_tweets(sampled_weeks, period_id):
    '''
    Input:
        sampled_weeks: a list of tuples that contain the start and end date of the week
                        e.g.: (datetime.date(2019, 3, 22), datetime.date(2019, 3, 28))
        period_id: 
            0: pre-pandemic, 1: early-pandemic, 2: late-pandemic
    '''
    
    # store all rows
    csv_rows = []

    for week in sampled_weeks:
        start_date = str(week[0])
        end_date = str(week[1])

        # Define the snscrape command, excluding retweets
        command = f'snscrape --jsonl twitter-hashtag "{hashtag} lang:{language} since:{start_date} until:{end_date} -filter:retweets"'

        # Run the snscrape command and capture the output
        output = subprocess.check_output(command, shell=True)

        # Decode the JSON output
        tweets = [json.loads(line) for line in output.splitlines()]

        rows = [[period_id, t['username'], t['date'], t['url'], t['rawContent'], 
                t['id'], t['mentionedUsers'], 
                t['coordinates']['longitude'] if t.get('coordinates') else None,
                t['coordinates']['latitude'] if t.get('coordinates') else None,
                t['place']['countryCode'] if t.get('place') else None,
                t['place']['fullName'] if t.get('place') else None,
                t['user']['id'], t['user']['followersCount'], 
                t['user']['description'], t['user']['url']]
                for t in tweets]
        
        csv_rows.extend(rows)
        time.sleep(10)
    print('Finished scraping for one period!')

    return csv_rows

In [15]:
# Header for the CSV file
header = ['Period', 'Username', 'Date', 'URL', 'Content', 'TweetID', 'MentionedUsers', 'Longitude', 'Latitude',
          'CountryCode', 'Place', 'UserID', 'FollowersCount', 'UserDescription', 'UserURL']

# Get the current working directory path
cwd = os.getcwd()

# Print the updated directory path
dir_path = cwd.rstrip("/raw") + "/data"
print(dir_path)

# Write the header to the file
with open(dir_path + "/raw_data.csv", 'w', newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(header)  

/Users/jiayan/Downloads/codes_macs_2022-2023/macs30200/replication-materials-jiayanli/data


In [16]:
# Loop over samples of three periods
for i, sample in enumerate([p1_sample, p2_sample, p3_sample]):
    csv_rows = scrape_tweets(sample, i)

    # Append data to the file
    with open(dir_path + "/raw_data.csv", 'a', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerows(csv_rows)

    print(f'Finished adding data of period {i} to csv file')

Finished scraping for one period!
Finished adding data of period 0 to csv file
Finished scraping for one period!
Finished adding data of period 1 to csv file
Finished scraping for one period!
Finished adding data of period 2 to csv file


In [31]:
# read csv file
df = pd.read_csv(dir_path + "/raw_data.csv")
df.tail()

Unnamed: 0,Period,Username,Date,URL,Content,TweetID,MentionedUsers,Longitude,Latitude,CountryCode,Place,UserID,FollowersCount,UserDescription,UserURL
27129,2,CollinsGato,2022-06-28T03:33:13+00:00,https://twitter.com/CollinsGato/status/1541625...,#backinthehabit #8weekstogo #gym #fitspo #fitn...,1541625739916836864,,,,,,2482589021,70,,https://twitter.com/CollinsGato
27130,2,GetFitNLean,2022-06-28T03:00:11+00:00,https://twitter.com/GetFitNLean/status/1541617...,Work It #FitFam! Be More! Do MORE! #Fitness #m...,1541617429356453888,,,,,,705040642179727361,7811,Motivation #Fitness #Fitfam #FitSpo pictures a...,https://twitter.com/GetFitNLean
27131,2,Obi_Obadike,2022-06-28T02:31:33+00:00,https://twitter.com/Obi_Obadike/status/1541610...,The mind controls the body.\n •\n#motivation #...,1541610222078140416,,,,,,37002399,543571,Best Selling Co-Author of TheCut; Named Top 10...,https://twitter.com/Obi_Obadike
27132,2,GetFitNLean,2022-06-28T02:20:10+00:00,https://twitter.com/GetFitNLean/status/1541607...,#Fitspo Strong is the NEW Skinny! #GetFITnLEAN...,1541607357733642240,,,,,,705040642179727361,7811,Motivation #Fitness #Fitfam #FitSpo pictures a...,https://twitter.com/GetFitNLean
27133,2,GetFitNLean,2022-06-28T00:40:10+00:00,https://twitter.com/GetFitNLean/status/1541582...,#FitSpo Deadlifts for great hamstrings and glu...,1541582191016157185,,,,,,705040642179727361,7811,Motivation #Fitness #Fitfam #FitSpo pictures a...,https://twitter.com/GetFitNLean


In [20]:
# Show data size
df.shape

(27134, 15)

In [23]:
# Check null values for each variable
df.isnull().sum()

Period                 0
Username               0
Date                   0
URL                    0
Content                0
TweetID                0
MentionedUsers     24300
Longitude          23750
Latitude           23750
CountryCode        23752
Place              23752
UserID                 0
FollowersCount         0
UserDescription     2048
UserURL                0
dtype: int64

In [27]:
print(f"Pre-pandemic period sample size: {df[df['Period'] == 0].shape[0]}")
print(f"Early-pandemic period sample size: {df[df['Period'] == 1].shape[0]}")
print(f"Late-pandemic period sample size: {df[df['Period'] == 2].shape[0]}")

Pre-pandemic period sample size: 15287
Early-pandemic period sample size: 8597
Late-pandemic period sample size: 3250


16 weeks were randomly sampled from each of the pre-, early-, and late-pandemic periods. A total of 27134 tweets posted within the sampled weeks with the hashtag "#fitspo" were collected. 

From the above, we can conclude that:

- The majority of tweets do not have information on the posting location (including longitude, latitude, country, and place) and do not mention any other users.
- A few tweets are posted by users who do not have a user description.
- The sample size volume: Pre-pandemic > Early-pandemic > Late-pandemic