# Data Preparation for NLP Analysis

In [3]:
# Import packages 
import pandas as pd
import numpy as np 
import re

## Read in Data

In [5]:
# Load the combined Tweet export into a DataFrame
data = pd.read_csv(filepath_or_buffer='./Data/all_data.csv')

# Preview the first two rows to see which columns were loaded
data.head(n=2)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,25282846,SimonesNews,Simone Sebastian,Washington DC,3110,5830,1445,True,4400,abrams
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,1312393604439183361,nching0,Thee Lost Edges of Candace 🪥,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams


### Basic Data Quality Checks 

In [9]:
# Count the rows that have no text data.
# Summing the boolean null-mask directly gives a true row count; reducing it
# with `.any()` first would only ever yield 0 or 1.
print('Number of Rows with No Text Data:', data['text'].isnull().sum())

Number of Rows with No Text Data: 0


In [10]:
# Report how many Tweets (rows of the `text` column) the corpus holds
print('Number of Tweets:', data['text'].shape[0])

Number of Tweets: 51336


The basic data quality check shows that every Tweet in the data set contains text. With this confirmed, we can move on to the text cleaning and pre-processing stages. 

## Text Preprocessing 

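We will now clean the text by removing thousands-separator commas from numbers, replacing each number with the placeholder token `number`, converting selected punctuation and special characters to spaces, and collapsing extra whitespace. 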

In [89]:
# Regex substitutions applied to every Tweet, in this order. Order matters:
# commas inside numbers are removed first so that "1,000" becomes the single
# token "number" rather than "number number".
replace = [
    (r"(?<=\d),(?=\d)", ""),        # Remove commas that sit between two digits
    (r"\d+", "number"),             # Map each run of digits to the token "number"
    (r"[\t\n\r\*\.\@\,\-\/]", " "), # Replace tabs, newlines and * . @ , - / with a space
    (r"\s+", " ")                   # Collapse runs of whitespace into a single space
]

# Compile each pattern once up front instead of looking it up for every Tweet
_compiled_replace = [(re.compile(pattern), substitute) for pattern, substitute in replace]


def clean_tweet(text):
    """Apply every substitution in `replace`, in order, to one Tweet.

    Parameters
    ----------
    text : str
        Raw Tweet text.

    Returns
    -------
    str
        The cleaned text. Values that are not strings (for example NaN for a
        missing Tweet) are returned unchanged instead of raising a TypeError
        inside the regex engine.
    """
    if not isinstance(text, str):
        return text
    for pattern, substitute in _compiled_replace:
        text = pattern.sub(substitute, text)
    return text


# Clean every Tweet in the corpus
train_sentences = [clean_tweet(tweet) for tweet in data['text']]

# Write the cleaned text to a new data frame column
data['cleaned_text'] = train_sentences

In [90]:
# Preview the first two rows of the data frame
data.head(n=2)

Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,cleaned_text
0,2022-11-03 13:23:56,I know these truths. Stacey Abrams will lose t...,[],"[1187835922118787073, 948223698649124870]","['lavern_spicer', 'raytoutofer']",0,0,1.187836e+18,lavern_spicer,,...,1541786385899819009,larrymondello63,Maxwell Smart 86,,89,8,277,False,476,I know these truths Stacey Abrams will lose th...
1,2022-11-03 13:23:45,Sort of like Stacey Abrams is still governor o...,[],[91882544],['DineshDSouza'],0,0,91882540.0,DineshDSouza,,...,1572885397285257216,KevinFodor2,Kevin Fodor,,717,100,1653,False,1079,Sort of like Stacey Abrams is still governor o...


We now have a data frame with a new column `cleaned_text` that contains the cleaned version of each Tweet. 

## Understanding the Cleaned Tweets 

### VADER Sentiment Analysis

In [91]:
# loading in sentiment libraries 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [92]:
# create the VADER analyzer once so that every scoring call reuses it
vader_sentiment = SentimentIntensityAnalyzer()

# helper that returns the compound sentiment score for a single piece of text
def vader_sentiment_scores(text):
  """Return the 'compound' entry of VADER's polarity scores for `text`.

  Uses the module-level `vader_sentiment` analyzer.
  """
  return vader_sentiment.polarity_scores(text)['compound']

In [93]:
# map a sentiment score to a "positive", "negative", or "neutral" label
def format_output(row):
  """Return the sentiment bin for a single sentiment score.

  A score of 0.05 or higher is "positive", a score of -0.05 or lower is
  "negative", and anything in between is "neutral".
  """
  if row >= 0.05:
    return "positive"
  if row <= -0.05:
    return "negative"
  return "neutral"

In [94]:
# Score every Tweet with VADER and store the compound score.
# NOTE(review): scoring runs on the raw `text` column, not `cleaned_text` —
# presumably intentional, but confirm this is the column the analysis should use.
data['sentiment_score'] = data['text'].map(vader_sentiment_scores)

# Convert each score into its positive / negative / neutral bin
data['sentiment_bin'] = data['sentiment_score'].map(format_output)

In [95]:
# Preview the first three rows, now including the sentiment score and bin columns
data.head(n=3)

Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,...,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,cleaned_text,sentiment_score,sentiment_bin
0,2022-11-03 13:23:56,I know these truths. Stacey Abrams will lose t...,[],"[1187835922118787073, 948223698649124870]","['lavern_spicer', 'raytoutofer']",0,0,1.187836e+18,lavern_spicer,,...,Maxwell Smart 86,,89,8,277,False,476,I know these truths Stacey Abrams will lose th...,0.6486,positive
1,2022-11-03 13:23:45,Sort of like Stacey Abrams is still governor o...,[],[91882544],['DineshDSouza'],0,0,91882540.0,DineshDSouza,,...,Kevin Fodor,,717,100,1653,False,1079,Sort of like Stacey Abrams is still governor o...,0.3612,positive
2,2022-11-03 13:23:42,The Gargantuan Fundraising of Beto O'Rourke an...,[],[],[],0,1,,,,...,Jim Geraghty,"Authenticity Woods, Virginia.",1219,106991,16477,True,118131,The Gargantuan Fundraising of Beto O'Rourke an...,0.0,neutral
