In [1]:
#Space to install new packages

In [2]:
#Import packages
import os
import re
import pandas as pd
import json

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

In [3]:
# Load the tweet export located in the current working directory.
# os.path.join uses the platform's path separator instead of a hard-coded "/".
path = os.path.join(os.getcwd(), "covid19_tweets.csv")
raw_data = pd.read_csv(path)
# 'Unnamed: 0' is presumably a row index saved by an earlier export (TODO confirm);
# it carries no tweet information, so it is dropped before any processing.
raw_data = raw_data.drop(columns='Unnamed: 0')
raw_data.head(5)

Unnamed: 0,Date,User,Tweet
0,2020-09-29 23:54:08+00:00,SteveDeaceShow,4 predictions for #PresidentialDebate: \n\n1) ...
1,2020-09-29 23:47:27+00:00,RealMed3,So I guess we are now just back to being healt...
2,2020-09-29 23:44:28+00:00,SSWorks,Donald Trump recently claimed that #COVID19 “a...
3,2020-09-29 23:42:22+00:00,NVPatriotGirl,Remember that time all of the #liberals said t...
4,2020-09-29 23:36:49+00:00,MelissaLMRogers,Another lockdown will be catastrophic for smal...


In [4]:
# Split every tweet into tokens with NLTK's TweetTokenizer and discard noise tokens.
tweet_tokenizer = TweetTokenizer()

# Tokens to discard: @usernames, links, emojis and any token containing a
# non-word character (punctuation).
pattern = r"(?:\@[\w_]+|https?\://\S+|[\U00010000-\U0010ffff]|\w*[^\w\s]+\w*)"
unwanted_token = re.compile(pattern)

# NOTE(review): a token that starts with '#' already matches the last alternative
# of `pattern` and is dropped, so the '^#' substitution below never changes a
# surviving token -- confirm whether hashtag words were meant to be kept.
# Column position 2 of raw_data is used as the tweet text.
tweets_tokenized = [
    [
        re.sub(r'^#', '', tok.lower())
        for tok in tweet_tokenizer.tokenize(text)
        if unwanted_token.match(tok) is None
    ]
    for text in raw_data.iloc[:, 2]
]

In [5]:
# Filter out English stop words, the pandemic terms added below, and
# tokens made up only of digits.
stop_words = set(stopwords.words('english'))
stop_words |= {'covid', 'corona', 'coronavirus', 'covid19'}

tweets_stripped = [
    [tok for tok in tokens if not (tok.lower() in stop_words or tok.isdigit())]
    for tokens in tweets_tokenized
]

In [6]:
# Drop tweets that were left with no tokens after all the pre-processing.
# Filtering on emptiness (instead of removing by a hard-coded position) keeps
# this cell idempotent on re-run and independent of the row order of the input;
# list.remove() would also delete the first *equal* entry rather than a given index.
tweets_stripped = [tweet for tweet in tweets_stripped if tweet]

In [7]:
#Use a lemmatizer to remove variation

# A single lemmatizer instance is reused for every word instead of
# constructing a new one per token.
lemmatizer = WordNetLemmatizer()

# Same nesting as tweets_stripped: one list of lemmatized tokens per tweet.
tweets_clean = [
    [lemmatizer.lemmatize(word) for word in tweet]
    for tweet in tweets_stripped
]

# Persist the lemmatized token lists as JSON in the working directory.
with open('tweets_clean','w') as file:
    json.dump(tweets_clean, file)

In [8]:
#Rejoin each tokenized and preprocessed tweet into strings
tweets_strings = [' '.join(tweet) for tweet in tweets_clean]

# Persist the space-joined tweets as a JSON list of strings.
with open('tweets_strings', 'w') as file:
    json.dump(tweets_strings, file)

# Kept as the cell's last expression so the notebook actually displays the count.
len(tweets_strings)

In [9]:
# Preview the first ten cleaned tweet strings.
tweets_strings[0:10]

['prediction biden focus trump focus returning america prior lockdown tying democrat civil unrest dementia display debate move needle',
 'guess back healthcare worker healthcare hero',
 'donald trump recently claimed affect virtually nobody except elderly people people died nobody senior disposable',
 'remember time said nevada rally rally would cause spike infection state seen plummet infection hospitalization thanks playing try',
 'another lockdown catastrophic small business mental health people already started lockdown day causing much negative impact',
 'hey chris wallace number death united state would disservice viewer country bring',
 'donald coming state one party mean rally hotspot donate get killer come back except maybe federal prison oxford',
 'yesterday told usa releasing 150m rapid test today utter lack planning guidance creating problem approach throw thing fence state say take deal problem per',
 'sisolak event fix seating capacity includes allegiant stadium mean capac