# Import Required Libraries

In [1]:
import emoji
import os
import glob
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Change Working Directory and Concatenate Multiple Datasets into One

In [2]:
path = 'C:\\Users\\melin\\Documents\\Springboard Data Science Career Track\\Capstone Projects\\Capstone Project 2\\COVID-19 Tweets\\CP2data'

# Check current working directory.
retval = os.getcwd()
print("Current working directory %s" % retval)

# Now change the directory
os.chdir(path)

# Check current working directory.
retval = os.getcwd()

print("Directory changed successfully %s" % retval)

Current working directory C:\Users\melin
Directory changed successfully C:\Users\melin\Documents\Springboard Data Science Career Track\Capstone Projects\Capstone Project 2\COVID-19 Tweets\CP2data


In [3]:
#Use glob to match the pattern ‘csv’
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [4]:
#combine all files in the list
df_tweets = pd.concat([pd.read_csv(f) for f in all_filenames ])

# Explore Dataset

In [5]:
#Check first 5 rows of df_tweets dataset
df_tweets.head()

Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,retweet_count,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang
0,1244051646071611394,860252856829587457,2020-03-29T00:00:00Z,IMSS_SanLuis,"Ante cualquier enfermedad respiratoria, no te ...",TweetDeck,,,,False,...,0,,,,1008,41,,2017-05-04T22:00:38Z,False,es
1,1244051645039706112,1125933654943895553,2020-03-29T00:00:00Z,intrac_ccs,#ATENCIÓN En el Terminal Nuevo Circo se implem...,TweetDeck,,,,False,...,1,,,,90,316,,2019-05-08T01:21:16Z,False,es
2,1244051645975191557,80943559,2020-03-29T00:00:00Z,rlieving,“People are just storing up. They are staying ...,TweetDeck,,,,False,...,0,,,,136,457,,2009-10-08T21:06:08Z,False,en
3,1244051646750928897,817072420947247104,2020-03-29T00:00:00Z,Tu_IMSS_Coah,"Si empezaste a trabajar, necesitas dar de alta...",TweetDeck,,,,False,...,0,,,,1549,170,,2017-01-05T18:17:00Z,False,es
4,1244051647032102914,788863557349670913,2020-03-29T00:00:00Z,Tabasco_IMSS,Una sociedad informada está mejor preparada an...,TweetDeck,,,,False,...,0,,,,868,125,,2016-10-19T22:05:03Z,False,es


In [6]:
#Get summary of df_tweets dataframe
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14607013 entries, 0 to 355386
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   status_id             int64  
 1   user_id               int64  
 2   created_at            object 
 3   screen_name           object 
 4   text                  object 
 5   source                object 
 6   reply_to_status_id    float64
 7   reply_to_user_id      float64
 8   reply_to_screen_name  object 
 9   is_quote              bool   
 10  is_retweet            bool   
 11  favourites_count      int64  
 12  retweet_count         int64  
 13  country_code          object 
 14  place_full_name       object 
 15  place_type            object 
 16  followers_count       int64  
 17  friends_count         int64  
 18  account_lang          float64
 19  account_created_at    object 
 20  verified              bool   
 21  lang                  object 
dtypes: bool(3), float64(3), int64(6), object(1

In [7]:
#Check sum of missing values in each column of df_tweets dataframe
df_tweets.isnull().sum()

status_id                      0
user_id                        0
created_at                     0
screen_name                    2
text                           0
source                        83
reply_to_status_id      12861933
reply_to_user_id        12495830
reply_to_screen_name    12495830
is_quote                       0
is_retweet                     0
favourites_count               0
retweet_count                  0
country_code            13950294
place_full_name         13947685
place_type              13947685
followers_count                0
friends_count                  0
account_lang            14607013
account_created_at             0
verified                       0
lang                           0
dtype: int64

# Manually Remove Redundant Columns and Rows

In [8]:
#Remove rows in which text are not English
df_tweets = df_tweets[df_tweets.lang == 'en']

In [9]:
#Remove columns with missing percentage of 60% or more
df_tweets = df_tweets.loc[:, df_tweets.isnull().mean() < .6]

In [10]:
#Remove redundant columns 
df_tweets = df_tweets.drop(['status_id', 'user_id'], axis = 1)
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8133785 entries, 2 to 355386
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   created_at          object
 1   screen_name         object
 2   text                object
 3   source              object
 4   is_quote            bool  
 5   is_retweet          bool  
 6   favourites_count    int64 
 7   retweet_count       int64 
 8   followers_count     int64 
 9   friends_count       int64 
 10  account_created_at  object
 11  verified            bool  
 12  lang                object
dtypes: bool(3), int64(4), object(6)
memory usage: 705.9+ MB


In [11]:
#Randomly drop additional 90% of rows  
df_tweets = df_tweets.sample(frac=.1)
df_tweets.shape

(813378, 13)

In [12]:
# Convert created_at and account_created_at variables from object to datetime64 
df_tweets['created_at'] = df_tweets['created_at'].astype('datetime64')
df_tweets['account_created_at'] = df_tweets['account_created_at'].astype('datetime64')
# Check for earliest and latest tweet
print("Earliest tweet: ", df_tweets['created_at'].min())
print("Latest tweet: ", df_tweets['created_at'].max(), '\n')

Earliest tweet:  2020-03-29 00:00:00
Latest tweet:  2020-04-30 23:59:58 



# Clean Twitter Text

In [13]:
# function for converting emoticons to text
emoticons = {
':-)': 'happy / smile', 
':)': 'happy / smile', 
';)': 'wink / glad', 
':o)': 'happy / smile', 
':]': 'happy / smile', 
':3': 'happy / smile', 
':c)': 'happy / smile',
':>': 'happy / smile', 
'=]': 'happy / smile', 
'8)': 'happy / smile', 
'=)': 'happy / smile', 
':}': 'happy / smile',
':^)': 'happy / smile', 
':-D': 'laugh / big grin',
':D': 'laugh / big grin',
'8-D': 'laugh / big grin / laugh with glasses / wide-eyed surprise', 
'8D': 'laugh / big grin / laugh with glasses / wide-eyed surprise', 
'x-D': 'laugh', 
'xD': 'laugh', 
'X-D': 'laugh', 
'XD': 'laugh', 
'=-D': 'laugh / big grin', 
'=D': 'laugh / big grin',
'=-3': 'laugh / big grin', 
'=3': 'laugh / big grin', 
':-))': 'very happy / double chin', 
":'-)": 'tears of happiness', 
":')": 'tears of happiness', 
':*': 'kiss', 
':^*': 'kiss', 
'>:P': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':-P': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':P': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
'X-P': 'tongue sticking out / cheeky / playful / blowing a raspberry',
'x-p': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
'xp': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
'XP': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':-p': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':p': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
'=p': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':-b': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
':b': 'tongue sticking out / cheeky / playful / blowing a raspberry', 
'>:)': 'devilish / cheeky / playful', 
'>;)': 'devilish / cheeky / playful / wink', 
'>:-)': 'devilish / cheeky / playful',
'<3': 'heart / love',
':L': 'skeptical / undecided / uneasy / hesitant', 
':-/': 'skeptical / undecided / uneasy / hesitant', 
'>:/': 'skeptical / annoyed / undecided / uneasy / hesitant', 
':S': 'skeptical / undecided / uneasy / hesitant', 
'>:[': 'frown / angry / pouting', 
':@': 'frown / sad / pouting', 
':-(': 'frown / sad / pouting', 
':[': 'frown / sad / pouting', 
':-||': 'frown / pouting', 
'=L': 'skeptical / undecided / uneasy / hesitant', 
':<': 'frown / sad / pouting',
':-[': 'frown / sad / pouting', 
':-<': 'frown / sad / pouting', 
'=\\': 'skeptical / undecided / uneasy / hesitant', 
'=/': 'skeptical / undecided / uneasy / hesitant', 
'>:(': 'skeptical / annoyed / undecided / uneasy / hesitant', 
':(': 'frown / sad / pouting', 
'>.<': 'frown / pouting', 
":'-(": 'cry', 
":'(": 'cry', 
':\\': 'skeptical / undecided / uneasy / hesitant', 
':-c': 'frown / sad / pouting',
':c': 'frown / sad / pouting', 
':{': 'frown / sad / pouting', 
'>:\\': 'skeptical / annoyed / undecided / uneasy / hesitant', 
';(': 'skeptical / annoyed / undecided / uneasy / hesitant'
}

def emoticon_text(text):
    words = text.split()
    reformed = [emoticons[word] if word in emoticons else word for word in words]
    text = " ".join(reformed)
    return text

In [14]:
#Apply converting emoticons to text function to text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['text'].apply(lambda x: emoticon_text(x))

In [15]:
# function for replacing contractions
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

def replace_contractions(text):
    text = text.replace("’","'")
    words = text.split()
    reformed = [contractions[word] if word in contractions else word for word in words]
    text = " ".join(reformed)
    return text

In [16]:
#Apply replace contractions function to cleaned_text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['cleaned_text'].apply(lambda x: replace_contractions(x))

In [17]:
# function for twitter text cleaning 
def tweet_cleaner(text):
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                  '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', text) #remove url links
    text = re.sub("(@[A-Za-z0-9_]+)","", text) #remove twitter handles (@user)
    # convert emojis to text
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    # remove punctuations, numbers, and special characters 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase
    text = text.lower()
    return text

In [18]:
#Apply text cleaning function to cleaned_text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['cleaned_text'].apply(lambda x: tweet_cleaner(x))

In [19]:
# function for lemmatizing words
def lemmatize_text(text):
    wl = WordNetLemmatizer()
    token_words=word_tokenize(str(text))
    token_words
    lemmatize_text=[]
    for word in token_words:
        lemmatize_text.append(wl.lemmatize(word))
        lemmatize_text.append(" ")
    return "".join(lemmatize_text)

In [20]:
#Apply lemmatizing text function to cleaned_text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['cleaned_text'].apply(lambda x: lemmatize_text(x))

In [21]:
# function for stemming words
def stem_text(text):
    ps = PorterStemmer()
    token_words=word_tokenize(str(text))
    token_words
    stem_text=[]
    for word in token_words:
        stem_text.append(ps.stem(word))
        stem_text.append(" ")
    return "".join(stem_text)

In [22]:
#Apply stemming text function to cleaned_text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['cleaned_text'].apply(lambda x: stem_text(x))

In [23]:
#Remove stop words in cleaned_text column

# function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    no_stopword_text = [w for w in str(text).split() if not w in stop_words]
    return ' '.join(no_stopword_text)

In [24]:
#Apply the remove stop words function to cleaned_text column in df_tweets dataframe
df_tweets['cleaned_text'] = df_tweets['cleaned_text'].apply(lambda x: remove_stopwords(x))

In [25]:
#Remove rows containing missing values under the cleaned_text column 
df_tweets = df_tweets[df_tweets['cleaned_text'].notnull()]

In [26]:
df_tweets.shape

(813378, 14)

In [27]:
df_tweets.head()

Unnamed: 0,created_at,screen_name,text,source,is_quote,is_retweet,favourites_count,retweet_count,followers_count,friends_count,account_created_at,verified,lang,cleaned_text
82964,2020-03-31 04:11:43,sannidhi_perla,"1/4\n@PMOIndia, Amid the #coronavirus lock dow...",Twitter Web App,False,False,666,2,116,25,2017-10-20 15:36:13,False,en,amid coronaviru lock thi humbl appeal behalf p...
546906,2020-04-01 21:49:12,amberbowens,"🤔Why are they talking about a ""war on drugs"" a...",Twitter Web App,False,False,608,0,163,224,2009-07-05 22:35:05,False,en,think face whi talk war drug drug cartel thi r...
277509,2020-04-17 14:36:04,bradyparker,Sen. John Thune on paycheck protection program...,IFTTT,False,False,8,0,4570,2610,2009-05-04 13:41:24,False,en,sen john thune paycheck protect program run mo...
14298,2020-04-13 02:13:07,Davidmetroland,@DavidGSmith18 @mindedmusically @Rudy48053087 ...,Twitter Web App,False,False,31610,0,990,820,2015-05-08 02:30:10,False,en,air travel make thi type pandem risk coronavir...
38635,2020-04-18 03:01:40,yipmann82,@omotforest @matteous_ @anileh2 @GuidoFawkes I...,Twitter for Android,False,False,2865,0,326,523,2015-08-20 09:50:46,False,en,see ani front liner say got enough ppe tell fo...


In [28]:
df_tweets.tail()

Unnamed: 0,created_at,screen_name,text,source,is_quote,is_retweet,favourites_count,retweet_count,followers_count,friends_count,account_created_at,verified,lang,cleaned_text
80505,2020-04-01 04:34:32,jay1stnewyorker,"Coronavirus: White House projects 100,000 to 2...",Twitter for iPhone,False,False,519,0,55,58,2012-05-09 09:41:47,False,en,coronaviru white hous project death u coronaviru
66469,2020-04-26 07:33:02,KevinJPringle,This week’s @SundayTimesScot column - a broade...,Twitter for iPhone,False,False,6630,4,9421,1265,2012-09-26 15:18:17,False,en,thi week column broader context decentralis ri...
391416,2020-04-22 22:24:36,leighleigh75,Same ole song and dance #Coronavirus,Twitter for iPhone,False,False,29062,0,421,1797,2009-09-25 12:26:06,False,en,ole song danc coronaviru
151084,2020-04-08 09:26:42,SKYRIDER4538,@UNNTV1 PETA PETA PETA \n\n@peta your silence ...,Twitter for iPhone,False,False,167017,8,117138,14269,2015-01-16 08:09:27,False,en,peta peta peta silenc deafen cri face pensiv f...
436533,2020-04-07 23:35:49,CHRYYING,today was a slow day \n\n#lofi #LofiHipHop #mu...,Twitter for iPhone,False,False,13,0,7,21,2016-06-19 07:43:56,False,en,today wa slow day lofi lofihiphop music covid ...


# Export Final Dataset into csv

In [29]:
df_tweets.to_csv (r'C:/Users/melin/Documents/Springboard Data Science Career Track/Capstone Projects/Capstone Project 2/df_tweets.csv', index = False, header=True)