# Text Cleaning

Here, we will perform basic text cleaning on the tweets.

Note: some basic cleaning of tweets was already performed during the collection process.

In [1]:
# Import packages
import pandas as pd
import re
import nltk

### Read in Raw Tweet Data

In [2]:
# Read the tweet data from the CSV at this relative path into a dataframe
data = pd.read_csv('../Data_Merged/all_data.csv')

# Display the first 3 rows as a quick sanity check of the loaded columns
data.head(3)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,25282846,SimonesNews,Simone Sebastian,Washington DC,3110,5830,1445,True,4400,abrams
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,1312393604439183361,nching0,Thee Lost Edges of Candace 🪥,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams
2,2,2022-10-18 00:00:01,"Why did Joe Rogan send his little brother, Sha...",['GAGovDebate'],[],[],0,5,,,...,897218253826555905,JTaylorSkinner,Jenn Taylor-Skinner (she/her),Seattle,17762,25727,82402,False,43808,abrams


### Perform Basic Text Cleaning

Remove punctuation, digits, and other unnecessary items from text

In [3]:
# Ordered (pattern, replacement) pairs for cleaning. Each substitution runs on
# the output of the previous one, so the order matters: mentions must be
# removed before "@" and "_" are blanked out by the punctuation rule.
replace = [
    (r"@[A-Za-z0-9_]+", " "),                                                           # Remove mentions (handles may contain "_")
    (r"[\t\n\r\*\.\@\,\-\/\:\"\!\?\)\#\(\'\;\&\^\$\[\]\=\%\|\+\>\<\_\`\{\}\~\\]", " "), # Remove punctuation
    (r"\d+", " "),                                                                      # Remove digits
    (r"\s+", " "),                                                                      # Strip extra whitespace
]

# Loop through all tweets and apply every regex substitution in order
train_sentences = []
for d in data['text']:
    for pattern, substitute in replace:
        d = re.sub(pattern, substitute, d)
    train_sentences.append(d)

# Add cleaned text as a column in the dataframe
data['text_clean'] = train_sentences

# View first few rows of text and cleaned text columns
data[['text', 'text_clean']].head(10)

Unnamed: 0,text,text_clean
0,Sharp words on guns in Shane Hazel to Stacey A...,Sharp words on guns in Shane Hazel to Stacey A...
1,Stacey Abrams won tonight. She kept to the fac...,Stacey Abrams won tonight She kept to the fact...
2,"Why did Joe Rogan send his little brother, Sha...",Why did Joe Rogan send his little brother Shan...
3,Viral handbag designer and EBONY Power100 Styl...,Viral handbag designer and EBONY Power Style C...
4,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,THE MOST DANGEROUS THING FACING GEORGIA IS MOR...
5,Mrs. Abrams showed the same poise and skill as...,Mrs Abrams showed the same poise and skill as ...
6,Thanks for the link BDD! Who was the other guy...,Thanks for the link BDD Who was the other guy ...
7,"Stacey Abrams: ""The most dangerous thing facin...",Stacey Abrams The most dangerous thing facing ...
8,"Stacey Abrams is on ,",Stacey Abrams is on
9,Yeah . But they dont have to . If he told them...,Yeah But they dont have to If he told them not...


Split words on the candidate name to handle issues where these are combined (ex: "Abramsis" -> "Abrams is") and lowercase words

In [4]:
# Define method to add space between certain delimiter and attached words,
# and to lowercase all words in tweet
def split_words(tweet, delimiter):
    """Pad every occurrence of ``delimiter`` with spaces and lowercase the tweet.

    Used to separate a candidate name from text glued onto it
    (e.g. "Abramsis" -> "abrams is").

    Parameters
    ----------
    tweet : str
        Tweet text to process.
    delimiter : str
        Case-sensitive substring to isolate. Must be non-empty.

    Returns
    -------
    str
        The lowercased tweet with one space inserted on each side of every
        ``delimiter`` occurrence. Spaces already present are kept, so runs of
        spaces (and a trailing space) can appear in the result.

    Raises
    ------
    ValueError
        If ``delimiter`` is the empty string (raised by ``str.split``).
    """
    sections = tweet.split(delimiter)
    if len(sections) > 1:
        # str.split yields one more section than there are delimiters, so
        # joining the sections puts each delimiter back exactly once. This
        # also holds when the tweet ends with the delimiter (the last section
        # is then ''), a case that previously emitted the delimiter twice.
        tweet = (' ' + delimiter + ' ').join(sections)
    return tweet.lower()

# Run split_words once per candidate-name spelling, in this exact order.
# Each pass works on the output of the previous pass.
for name in ('Abrams', 'abrams', 'Kemp', 'kemp'):
    data['text_clean'] = data['text_clean'].apply(split_words, args=(name,))

# Compare the original and cleaned text for the first 10 rows
data.head(10)[['text', 'text_clean']]

Unnamed: 0,text,text_clean
0,Sharp words on guns in Shane Hazel to Stacey A...,sharp words on guns in shane hazel to stacey ...
1,Stacey Abrams won tonight. She kept to the fac...,stacey abrams won tonight she kept to the ...
2,"Why did Joe Rogan send his little brother, Sha...",why did joe rogan send his little brother shan...
3,Viral handbag designer and EBONY Power100 Styl...,viral handbag designer and ebony power style c...
4,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,the most dangerous thing facing georgia is mor...
5,Mrs. Abrams showed the same poise and skill as...,mrs abrams showed the same poise and skill...
6,Thanks for the link BDD! Who was the other guy...,thanks for the link bdd who was the other guy ...
7,"Stacey Abrams: ""The most dangerous thing facin...",stacey abrams the most dangerous thing fac...
8,"Stacey Abrams is on ,",stacey abrams is on
9,Yeah . But they dont have to . If he told them...,yeah but they dont have to if he told them not...


Correct miscellaneous misspellings and normalize filler words (e.g. "nooo", "yesss", "ummm") that were noticed in the tweets

In [5]:
# Define method to handle random words
# Ordered (compiled pattern, replacement) pairs used by handle_random_words.
# Whole-word rules use the lookarounds (?<!\S) and (?!\S), meaning "preceded /
# followed by whitespace or the string edge". Unlike a literal space inside
# the pattern, a lookaround does not consume the neighbouring space, so the
# replacement cannot glue the word onto the words next to it.
_WORD_FIXES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"kenp", "kemp"),
        (r"georiga", "georgia"),
        (r"no(?:no)+", "no"),
        (r"(?<!\S)n+o+(?!\S)", "no"),
        (r"(?<!\S)y+e+s+(?!\S)", "yes"),
        (r"yes(?:yes)+", "yes"),
        (r"a+hh+", "ah"),
        (r"(?<!\S)um+(?!\S)", "um"),
        (r"(?<!\S)aa+(?!\S)", "a"),
        (r"(?<!\S)o+h+(?!\S)", "oh"),
        (r"(?<!\S)shes(?!\S)", "she is"),
        (r"(?<!\S)ga(?!\S)", "georgia"),
    ]
]


def handle_random_words(tweet):
    """Fix known misspellings and collapse stretched filler words in a tweet.

    The substitutions in ``_WORD_FIXES`` are applied in order, each one on the
    result of the previous one. Whole-word rules (e.g. "nooo" -> "no",
    "shes" -> "she is", "ga" -> "georgia") only fire when the word is
    delimited by whitespace or the string edges, and the surrounding
    whitespace is preserved.

    Parameters
    ----------
    tweet : str
        Tweet text; the patterns are lowercase, so the text is expected to be
        lowercased already.

    Returns
    -------
    str
        The tweet with all substitutions applied.
    """
    # Substitute words for common misspellings
    for pattern, replacement in _WORD_FIXES:
        tweet = pattern.sub(replacement, tweet)

    # Return cleaned tweet
    return tweet

# Run the misspelling / filler-word fixes over every cleaned tweet
data['text_clean'] = data['text_clean'].apply(handle_random_words)

# Compare the original and cleaned text for the first 10 rows
data.head(10)[['text', 'text_clean']]

Unnamed: 0,text,text_clean
0,Sharp words on guns in Shane Hazel to Stacey A...,sharp words on guns in shane hazel to stacey ...
1,Stacey Abrams won tonight. She kept to the fac...,stacey abrams won tonight she kept to the ...
2,"Why did Joe Rogan send his little brother, Sha...",why did joe rogan send his little brother shan...
3,Viral handbag designer and EBONY Power100 Styl...,viral handbag designer and ebony power style c...
4,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,the most dangerous thing facing georgia is mor...
5,Mrs. Abrams showed the same poise and skill as...,mrs abrams showed the same poise and skill...
6,Thanks for the link BDD! Who was the other guy...,thanks for the link bdd who was the other guy ...
7,"Stacey Abrams: ""The most dangerous thing facin...",stacey abrams the most dangerous thing fac...
8,"Stacey Abrams is on ,",stacey abrams is on
9,Yeah . But they dont have to . If he told them...,yeah but they dont have to if he told them not...


In [6]:
# Sorted list of the unique whitespace-separated words across all cleaned tweets
all_words = sorted({word for tweet in data['text_clean'] for word in tweet.split()})
# Uncomment to inspect the full vocabulary (very long output)
#print(all_words)

Add column with stopwords removed

In [7]:
# Get nltk stopwords. The corpus is not bundled with nltk, so on an environment
# where it was never downloaded the lookup raises LookupError; fetch it once
# and retry so the notebook runs from a clean setup.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

# Define method to remove stopwords from tweet
def remove_stopwords(tweet):
    """Return ``tweet`` without the words listed in ``stopwords``.

    The tweet is split on whitespace and each word is compared to the
    stopword list exactly as written (no lowercasing happens here). The
    remaining words are joined back together with single spaces.
    """
    kept = filter(lambda token: token not in stopwords, tweet.split())
    return ' '.join(kept)

# Build a stopword-free version of the cleaned text in a new column
data['text_clean_noStop'] = data['text_clean'].apply(remove_stopwords)

# Compare the original, cleaned, and stopword-free text for the first 10 rows
data.head(10)[['text', 'text_clean', 'text_clean_noStop']]

Unnamed: 0,text,text_clean,text_clean_noStop
0,Sharp words on guns in Shane Hazel to Stacey A...,sharp words on guns in shane hazel to stacey ...,sharp words guns shane hazel stacey abrams kee...
1,Stacey Abrams won tonight. She kept to the fac...,stacey abrams won tonight she kept to the ...,stacey abrams tonight kept facts answered ques...
2,"Why did Joe Rogan send his little brother, Sha...",why did joe rogan send his little brother shan...,joe rogan send little brother shane interrupt ...
3,Viral handbag designer and EBONY Power100 Styl...,viral handbag designer and ebony power style c...,viral handbag designer ebony power style curat...
4,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,the most dangerous thing facing georgia is mor...,dangerous thing facing georgia years brian kem...
5,Mrs. Abrams showed the same poise and skill as...,mrs abrams showed the same poise and skill...,mrs abrams showed poise skill kbj earlier year...
6,Thanks for the link BDD! Who was the other guy...,thanks for the link bdd who was the other guy ...,thanks link bdd guy seems lot like republicans...
7,"Stacey Abrams: ""The most dangerous thing facin...",stacey abrams the most dangerous thing fac...,stacey abrams dangerous thing facing georgia f...
8,"Stacey Abrams is on ,",stacey abrams is on,stacey abrams
9,Yeah . But they dont have to . If he told them...,yeah but they dont have to if he told them not...,yeah dont told vote brian kemp still would hes...


Add column with stopwords and search words removed

In [8]:
# Words that were used as search terms when collecting the tweets
search_words = [
    'georgia', 'governor', 'gov', 'election', 'ga',
    'brian', 'kemp', 'kemps', 'brainkemp', 'kemp',
    'stacey', 'abrams', 'staceyabrams', 'abramss',
    'nominee', 'candidate', 'incumbent',
]

# Define method to remove search words from tweet
def remove_searchwords(tweet):
    """Return ``tweet`` without the words listed in ``search_words``.

    The tweet is split on whitespace and each word is compared to the search
    words exactly as written (case-sensitive, no lowercasing here). The
    remaining words are joined back together with single spaces.
    """
    kept = []
    for token in tweet.split():
        if token in search_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Starting from the stopword-free text, also strip the search words into a new column
data['text_clean_noStop_noSearch'] = data['text_clean_noStop'].apply(remove_searchwords)

# Compare all text columns for the first 10 rows
data.head(10)[['text', 'text_clean', 'text_clean_noStop', 'text_clean_noStop_noSearch']]

Unnamed: 0,text,text_clean,text_clean_noStop,text_clean_noStop_noSearch
0,Sharp words on guns in Shane Hazel to Stacey A...,sharp words on guns in shane hazel to stacey ...,sharp words guns shane hazel stacey abrams kee...,sharp words guns shane hazel keep going back g...
1,Stacey Abrams won tonight. She kept to the fac...,stacey abrams won tonight she kept to the ...,stacey abrams tonight kept facts answered ques...,tonight kept facts answered questions math poi...
2,"Why did Joe Rogan send his little brother, Sha...",why did joe rogan send his little brother shan...,joe rogan send little brother shane interrupt ...,joe rogan send little brother shane interrupt ...
3,Viral handbag designer and EBONY Power100 Styl...,viral handbag designer and ebony power style c...,viral handbag designer ebony power style curat...,viral handbag designer ebony power style curat...
4,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,the most dangerous thing facing georgia is mor...,dangerous thing facing georgia years brian kem...,dangerous thing facing years wow
5,Mrs. Abrams showed the same poise and skill as...,mrs abrams showed the same poise and skill...,mrs abrams showed poise skill kbj earlier year...,mrs showed poise skill kbj earlier year face g...
6,Thanks for the link BDD! Who was the other guy...,thanks for the link bdd who was the other guy ...,thanks link bdd guy seems lot like republicans...,thanks link bdd guy seems lot like republicans...
7,"Stacey Abrams: ""The most dangerous thing facin...",stacey abrams the most dangerous thing fac...,stacey abrams dangerous thing facing georgia f...,dangerous thing facing four years
8,"Stacey Abrams is on ,",stacey abrams is on,stacey abrams,
9,Yeah . But they dont have to . If he told them...,yeah but they dont have to if he told them not...,yeah dont told vote brian kemp still would hes...,yeah dont told vote still would hes got black ...


### Clean Dataframe Columns

Remove unnecessary first column

In [9]:
# Remove the first column ('Unnamed: 0')
data = data.drop(columns=['Unnamed: 0'])

Rename the "topic_y" column

In [10]:
# Give the "topic_y" column the simpler name "topic", then list the columns
data = data.rename(columns={'topic_y': 'topic'})
data.columns

Index(['created_at', 'text', 'hashtags', 'user_mention_ids',
       'user_mention_screen_names', 'retweet_count', 'favorite_count',
       'in_reply_to_user_id', 'in_reply_to_screen_name', 'geo', 'coordinates',
       'user_id', 'user_screen_name', 'user_name', 'user_location',
       'user_friends_count', 'user_followers_count', 'user_favourites_count',
       'user_verfied', 'user_statuses_count', 'topic', 'text_clean',
       'text_clean_noStop', 'text_clean_noStop_noSearch'],
      dtype='object')

Get counts of missing values across all columns

In [11]:
# One row per column of `data`: how many rows are missing a value, and what
# percentage of all rows that represents
missing_df = (
    data.isna().sum()
    .rename_axis('column')
    .reset_index(name='num_missing')
    .assign(percent_missing=lambda frame: frame['num_missing'] / len(data) * 100)
)
missing_df

Unnamed: 0,column,num_missing,percent_missing
0,created_at,0,0.0
1,text,0,0.0
2,hashtags,0,0.0
3,user_mention_ids,0,0.0
4,user_mention_screen_names,0,0.0
5,retweet_count,0,0.0
6,favorite_count,0,0.0
7,in_reply_to_user_id,25143,48.977326
8,in_reply_to_screen_name,25143,48.977326
9,geo,51324,99.976625


Since `in_reply_to_user_id` and `in_reply_to_screen_name` have missing data for almost 50% of their rows and `geo` and `coordinates` have data missing for almost 100% of their rows, we will remove these 4 columns.

In [12]:
# Remove the 4 columns with large amounts of missing data
data = data.drop(
    columns=['in_reply_to_user_id', 'in_reply_to_screen_name', 'geo', 'coordinates']
)

# List the remaining columns
data.columns

Index(['created_at', 'text', 'hashtags', 'user_mention_ids',
       'user_mention_screen_names', 'retweet_count', 'favorite_count',
       'user_id', 'user_screen_name', 'user_name', 'user_location',
       'user_friends_count', 'user_followers_count', 'user_favourites_count',
       'user_verfied', 'user_statuses_count', 'topic', 'text_clean',
       'text_clean_noStop', 'text_clean_noStop_noSearch'],
      dtype='object')

### Save Dataframe with Cleaned Text

In [13]:
# Write the dataframe to a CSV at this relative path; index=False leaves the
# row index out of the file
data.to_csv('./cleaned_tweet_data.csv', index=False)