In [1]:
import os
import sys
sys.path.insert(0, '..')
import utils
import pandas as pd
# set the display option to show full string value
pd.set_option('display.max_colwidth', None)

# text cleaning
import re
import emoji
from wordsegment import load, segment
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer

In [2]:
# Get the current working directory path
cwd = os.getcwd()

# Print the updated directory path
data_path = cwd.rstrip("/clean") + "/data"
data_path

'/Users/jiayan/Downloads/codes_macs_2022-2023/macs30200/replication-materials-jiayanli/data'

In [27]:
# read csv
df_raw = pd.read_csv(data_path + "/raw_data.csv")

## Exploratory Data Analysis

In [28]:
df_raw.shape

(27134, 15)

In [29]:
# Print out the first lines 
# Period 0: pre-pandemic, 1: early-pandemic, 2: late-pandemic
utils.show_random_5(df_raw, df_raw.columns.tolist())

Period                                                                                                                                                                               1
Username                                                                                                                                                                    pdaddy1114
Date                                                                                                                                                         2020-05-24T16:23:04+00:00
URL                                                                                                                          https://twitter.com/pdaddy1114/status/1264592765389869059
Content            Sunday stuff #trusttheprocess #jgsd #fundaysunday #inertiawave #inertiawaveandbosu #fitfam #fitspo #sundaystuff #keeponkeepinon #cya #imout https://t.co/bKf1rOO6Jp
TweetID                                                                              

In [30]:
# Inspect an exmaple of tweet content
df_raw['Content'][160]

'Tuesday—Back and Biceps 💪🏼 \nSome plate curls as finisher.\n.\n.\n.\n.\n.\n.\n.\n.\n#bicep #fitness #bodybuilding #gym #biceps #muscle #fitfam #fit #tricep #girlswholift #arms #back #workout #fitspo #strong #motivation… https://t.co/Bjyeya4bpU'

## Pre-processing Tweets

In [31]:
# Initiate the word segmentation tool
load() 

# Define the preprocessing function 
def preprocess_tweet(tweet):
    # conver emojis to their text representation
    # tweet = emoji.demojize(tweet)

    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)

    # Remove emojis
    tweet = tweet.encode('ascii', 'ignore').decode('utf-8')
    
    # Remove numbers
    tweet = re.sub(r'\d+', ' ', tweet)

    # Extract hashtags
    hashtags = re.findall(r'#\S+', tweet.lower())

    # Remove mentions (@), hashtags (#) and '\n'
    tweet = re.sub(r'[@]\S+|[#]\S+|\n', ' ', tweet)

    # Remove emoticons or special characters
    tweet = re.sub(r'[^a-zA-Z0-9\s\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]', ' ', tweet)

    # Remove extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    
    # Remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in utils.STOP]).lower()
    '''
    # word segmentation, e.g., "hellothere" -> ["hello", "there"]
    try:
        word_list = segment(tweet)
        tweet = ' '.join(word_list)
    except:
        pass
    '''
    
    return tweet, hashtags

The pre-processing pipeline for VADER sentiment analysis includes:
- convert emojis to corresponding text ([Emoji for Python librariy](https://pypi.org/project/emoji/)) (it messes up with LDA)
- remove URLs, mentions, emoticons or special characters, extra whitespace, and stopwords
- lowercase the tweet content as well as the hashtags
- store hashtags in a list for future use
- word segmentation ([WordSegment library](https://pypi.org/project/wordsegment/) )

Because tokenizing the text may actually disrupt some of the features that VADER relies on, such as the context in which certain words appear, I did not tokenize here, but before other analyses.

In [32]:
# Split the resulting tuple into two separate columns
df_raw[['processed_tweet', 'hashtags']] = df_raw['Content'].apply(lambda x: pd.Series(preprocess_tweet(x)))

In [35]:
df_raw.iloc[160][['Content', 'processed_tweet']]

Content            Tuesday—Back and Biceps 💪🏼 \nSome plate curls as finisher.\n.\n.\n.\n.\n.\n.\n.\n.\n#bicep #fitness #bodybuilding #gym #biceps #muscle #fitfam #fit #tricep #girlswholift #arms #back #workout #fitspo #strong #motivation… https://t.co/Bjyeya4bpU
processed_tweet                                                                                                                                                                                                           tuesdayback biceps some plate curls finisher
Name: 160, dtype: object

In [36]:
utils.show_random_5(df_raw, ['processed_tweet', 'hashtags'])

processed_tweet    working beach never looked great
hashtags           [#fitfam, #fitspo, #getfitnlean]
Name: 26539, dtype: object
processed_tweet                   best till this helped write blog post thanks creators work
hashtags           [#workout, #fitnessphysique, #fitness, #fitnessmodels, #gymlife, #fitspo]
Name: 14143, dtype: object
processed_tweet                                                                                                           thank much business we honored clients like
hashtags           [#wellness, #training, #weightwatchers, #gymlife, #cardio, #fitnessjourney, #fitspo, #slimmingworlduk, #detox, #losingweight, #sw, #ww, #ketodiet]
Name: 20956, dtype: object
processed_tweet                                                                                                                                                                                            baby spam
hashtags           [#newmummy, #mummy, #mumofone, #newbaby, #baby, #babygirl, #child

In [37]:
# Remove exact duplicates based on tweet text
print(f"Before removing tweets: {df_raw.shape}")
df_raw.drop_duplicates(subset=['processed_tweet'], inplace=True)
df_raw.reset_index(drop=True, inplace=True)
print(f"After removing tweets: {df_raw.shape}")

Before removing tweets: (27134, 17)
After removing tweets: (19698, 17)


About seven thousand tweets were removed from the raw dataset because of being exact duplications.

In [38]:
# Remove empty tweets
print(f"Before removing empty-token tweets: {df_raw.shape}")
df_raw = df_raw[df_raw['processed_tweet'] != '']
df_raw.dropna(subset=['processed_tweet'], inplace=True)
print(f"After removing empty-token tweets: {df_raw.shape}")

Before removing empty-token tweets: (19698, 17)
After removing empty-token tweets: (19697, 17)


1 tweet was removed from the raw dataset because of being empty.

In [39]:
# Export the pre-processed dataset
df_raw.to_csv(data_path + '/processed.csv', index=False)