In [24]:
import os
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [20]:
# Get the current working directory path
cwd = os.getcwd()

# Print the updated directory path
data_path = cwd.rstrip("/clean") + "/data"
data_path

'/Users/jiayan/Downloads/codes_macs_2022-2023/macs30200/replication-materials-jiayanli/data'

In [28]:
# read csv
df_raw = pd.read_csv(data_path + "/raw_data.csv")

In [29]:
# Print out the first lines 
# Period 0: pre-pandemic, 1: early-pandemic, 2: late-pandemic
df_raw.head()

Unnamed: 0,Period,Username,Date,URL,Content,TweetID,MentionedUsers,Longitude,Latitude,CountryCode,Place,UserID,FollowersCount,UserDescription,UserURL
0,0,tewillmott,2019-08-14T23:59:42+00:00,https://twitter.com/tewillmott/status/11617895...,Be your own motivation. Sweat today smile tom...,1161789524902440960,,,,,,2378977625,239,Make up artist for film and television. Also ...,https://twitter.com/tewillmott
1,0,tewillmott,2019-08-14T23:50:39+00:00,https://twitter.com/tewillmott/status/11617872...,Time to kill some FAT 🥵🥵.\n.\n.\n.\n#health #f...,1161787246460096512,,,,,,2378977625,239,Make up artist for film and television. Also ...,https://twitter.com/tewillmott
2,0,tewillmott,2019-08-14T23:45:06+00:00,https://twitter.com/tewillmott/status/11617858...,Challenge yourself every day.\n..\n.\n.\n#heal...,1161785852357308416,,,,,,2378977625,239,Make up artist for film and television. Also ...,https://twitter.com/tewillmott
3,0,tewillmott,2019-08-14T23:40:11+00:00,https://twitter.com/tewillmott/status/11617846...,Excuses don’t burn calories.\n..\n.\n.\n#healt...,1161784615822155777,,,,,,2378977625,239,Make up artist for film and television. Also ...,https://twitter.com/tewillmott
4,0,tewillmott,2019-08-14T23:36:21+00:00,https://twitter.com/tewillmott/status/11617836...,Let’s get the legs right.\n.\n.\n#health #fitn...,1161783648443482112,,,,,,2378977625,239,Make up artist for film and television. Also ...,https://twitter.com/tewillmott


In [30]:
# Inspect an exmaple of tweet content
df_raw['Content'][0]

'Be your own motivation.  Sweat today smile tomorrow \n.\n.\n.\n#health #fitness #fit #fitmom #fitnessmodel #fitnessaddict #fitspo #workout #bodybuilding #cardio #gym #train #training #photooftheday #health #healthy… https://t.co/R6gFBTUXdT'

In [31]:
# Instantiate NLTK's WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define stopwords
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define fitspirational stop words
fit_stopwords = {'fit', 'fitness', 'gym', 'workout', 'exercise'}

# Add fitspirational stopwords
stop_words.update(fit_stopwords)

In [32]:
# Define the preprocessing function
def preprocess_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    
    # Remove mentions (@), hashtags (#), puctuation, and '\n'
    tweet = re.sub(r'[@#]\S+|\n|[^\w\s]', '', tweet)
    
    # Tokenize the tweet
    tokens = word_tokenize(tweet.lower())
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Join the tokens back into a string
    tweet = ' '.join(tokens)
    
    return tweet

In [33]:
df_raw['Content'] = df_raw['Content'].apply(preprocess_tweet)
df_raw['Content'][0]

'motivation sweat today smile tomorrow'

In [34]:
# Remove exact duplicates based on tweet text
print(f"Before removing tweets: {df_raw.shape}")
df_raw.drop_duplicates(subset=['Content'], inplace=True)
df_raw.reset_index(drop=True, inplace=True)
print(f"After removing tweets: {df_raw.shape}")

Before removing tweets: (27134, 15)
After removing tweets: (19442, 15)


In [41]:
# Remove empty tweets
print(f"Before removing empty-token tweets: {df_raw.shape}")
df_raw = df_raw[df_raw['Content'] != '']
print(f"After removing empty-token tweets: {df_raw.shape}")

Before removing empty-token tweets: (19442, 15)
After removing empty-token tweets: (19441, 15)


In [42]:
# Export the pre-processed dataset
df_raw.to_csv(data_path + '/clean_data.csv')