# Data Preparation for NLP Analysis

In [1]:
# Import packages 
import pandas as pd
import numpy as np 
import re

## Read in Data

In [2]:
# Read in data (the path is relative to the notebook's working directory)
data = pd.read_csv('./Data/all_data.csv')

# Preview the first two rows to understand the structure of the data
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns, presumably
# index columns written out by earlier saves -- confirm whether they should be dropped
data.head(2)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,25282846,SimonesNews,Simone Sebastian,Washington DC,3110,5830,1445,True,4400,abrams
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,1312393604439183361,nching0,Thee Lost Edges of Candace 🪥,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams


### Basic Data Quality Checks 

In [3]:
# Count the rows that have no text data.
# `.isnull().sum()` adds up one True per missing value, giving the actual row count;
# the previous `.values.any().sum()` collapsed everything to a single boolean first,
# so it could only ever report 0 or 1.
print('Number of Rows with No Text Data:', data['text'].isnull().sum())

Number of Rows with No Text Data: 0


In [4]:
# Report the size of the corpus: one entry in the `text` column per Tweet
tweet_count = len(data['text'])
print('Number of Tweets:', tweet_count)

Number of Tweets: 51336


From the basic data quality checks, we see that every Tweet in the data set contains text. With this confirmed, we can now move on to the text cleaning and pre-processing stages. 

## Text Preprocessing 

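We will now clean the text by removing commas inside numbers, replacing each number with the token `number`, replacing punctuation and special characters with spaces, and collapsing excess whitespace. 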

In [5]:
# Regex substitutions for cleaning, applied to every Tweet in the order listed
replace = [
    (r"(?<=\d),(?=\d)", ""),        # Remove commas inside numbers (1,000 -> 1000)
    (r"\d+", "number"),             # Map each run of digits to the token "number"
    (r"[\t\n\r\*\.\@\,\-\/]", " "), # Replace punctuation and other junk with a space
    (r"\s+", " ")                   # Collapse runs of whitespace into a single space
]

# Compile each pattern once instead of re-parsing it for every Tweet
compiled_replace = [(re.compile(pattern), substitute) for pattern, substitute in replace]


def clean_text(text):
    """Return `text` with every substitution in `replace` applied in order.

    Parameters
    ----------
    text : str
        The raw text of a single Tweet.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    for pattern, substitute in compiled_replace:
        text = pattern.sub(substitute, text)
    # Collapsing whitespace still leaves one space behind when the Tweet starts or
    # ends with punctuation or a newline, so trim the ends as well
    return text.strip()


# Apply the regex cleaning to all Tweets
train_sentences = [clean_text(tweet) for tweet in data['text']]

# Write the output of the regex cleaning to a df column
data['cleaned_text'] = train_sentences

In [6]:
# Preview the first two rows to confirm the new cleaned_text column was added
data.head(2)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y,cleaned_text
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,SimonesNews,Simone Sebastian,Washington DC,3110,5830,1445,True,4400,abrams,Sharp words on guns in Shane Hazel to Stacey A...
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,nching0,Thee Lost Edges of Candace 🪥,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams,Stacey Abrams won tonight She kept to the fact...


We now have a data frame with a new column `cleaned_text` that contains the cleaned version of each Tweet. 

## Understanding the Cleaned Tweets 

### VADER Sentiment Analysis

In [7]:
# loading in sentiment libraries 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [8]:
# loading VADER sentiment model
# Created once at module level so the scoring function below reuses the same instance.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to already be
# downloaded -- confirm for fresh environments
vader_sentiment = SentimentIntensityAnalyzer()

# Score one piece of text with the module-level VADER model
def vader_sentiment_scores(text):
  """Return the 'compound' entry of VADER's polarity scores for `text`.

  Uses the module-level `vader_sentiment` analyzer. Intended to be applied
  element-wise to a column of text.
  """
  return vader_sentiment.polarity_scores(text)['compound']

In [9]:
# Bin a sentiment score into the positive, negative, or neutral category
def format_output(row):
  """Return the sentiment category for a single sentiment score.

  Parameters
  ----------
  row : float
      A sentiment score (despite the name, a single value rather than a data frame row).

  Returns
  -------
  str
      "positive" when the score is at least 0.05, "negative" when it is at
      most -0.05, and "neutral" for anything in between.
  """
  if row >= 0.05:
    return "positive"
  if row <= -0.05:
    return "negative"
  # Everything strictly between the two thresholds falls through to neutral
  return "neutral"

In [10]:
# Score every Tweet with VADER (note: this runs on the raw `text` column)
sentiment_scores = data['text'].apply(vader_sentiment_scores)
data['sentiment_score'] = sentiment_scores

# Bin each score into its positive / negative / neutral category
data['sentiment_bin'] = sentiment_scores.apply(format_output)

In [11]:
# Preview the first three rows to confirm the sentiment_score and sentiment_bin columns were added
data.head(3)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y,cleaned_text,sentiment_score,sentiment_bin
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,Washington DC,3110,5830,1445,True,4400,abrams,Sharp words on guns in Shane Hazel to Stacey A...,0.3818,positive
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams,Stacey Abrams won tonight She kept to the fact...,0.7351,positive
2,2,2022-10-18 00:00:01,"Why did Joe Rogan send his little brother, Sha...",['GAGovDebate'],[],[],0,5,,,...,Seattle,17762,25727,82402,False,43808,abrams,Why did Joe Rogan send his little brother Shan...,-0.25,negative


In [None]:
# Uncomment to install pysentimiento if it is missing
# (%pip, unlike !pip, targets the environment the kernel is running in)
# %pip install pysentimiento

In [18]:
### THIS TAKES A LONG TIME TO RUN 
# The recorded output of this cell shows model files being downloaded and loaded
# from the local Hugging Face cache.
# NOTE(review): the saved output ends in a KeyboardInterrupt, so this cell must be
# re-run to completion before the cells that use these analyzers.

from pysentimiento import create_analyzer

# loading transformer sentiment model (English)
analyzer = create_analyzer(task="sentiment", lang="en")

# loading transformer emotion model (English)
# The recorded output shows a config for "finiteautomata/bertweet-base-emotion-analysis"
# with the labels others / joy / sadness / anger / surprise / disgust / fear.
emotion_analyzer = create_analyzer(task="emotion", lang="en")

# loading transformer hate speech model (English)
hate_speech_analyzer = create_analyzer(task="hate_speech", lang="en")


Downloading:   0%|          | 0.00/999 [00:00<?, ?B/s]

loading configuration file config.json from cache at /Users/monroefarris/.cache/huggingface/hub/models--finiteautomata--bertweet-base-emotion-analysis/snapshots/64046df9cc41eab40e1ecde7d2b7fb42b971be5b/config.json
Model config RobertaConfig {
  "_name_or_path": "finiteautomata/bertweet-base-emotion-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "others",
    "1": "joy",
    "2": "sadness",
    "3": "anger",
    "4": "surprise",
    "5": "disgust",
    "6": "fear"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 3,
    "disgust": 5,
    "fear": 6,
    "joy": 1,
    "others": 0,
    "sadness": 2,
    "surprise": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 

Downloading:   0%|          | 0.00/540M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [15]:
# Run the transformer sentiment model on one piece of text
def enhanced_sentiment_scores(text):
    """Return whatever the module-level `analyzer` predicts for `text`."""
    return analyzer.predict(text)

# Run the transformer emotion model on one piece of text
def emotion_scores(text):
    """Return whatever the module-level `emotion_analyzer` predicts for `text`."""
    return emotion_analyzer.predict(text)

# Run the transformer hate speech model on one piece of text
def hate_speech_scores(text):
    """Return whatever the module-level `hate_speech_analyzer` predicts for `text`."""
    return hate_speech_analyzer.predict(text)

In [16]:
### THIS TAKES A LONG TIME TO RUN 
# Each transformer model scores every Tweet in the raw `text` column.

# Output column -> function that produces it, in the order the columns are added
transformer_columns = {
    'pysentimiento_sentiment': enhanced_sentiment_scores,
    'pysentimiento_emotion': emotion_scores,
    'pysentimiento_hate': hate_speech_scores,
}

for column_name, scorer in transformer_columns.items():
    data[column_name] = data['text'].apply(scorer)

In [17]:
# Preview the first five rows to inspect the transformer model outputs
data.head(5)

Unnamed: 0.1,Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,...,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic_y,cleaned_text,sentiment_score,sentiment_bin,pysentimiento_output
0,0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,...,3110,5830,1445,True,4400,abrams,Sharp words on guns in Shane Hazel to Stacey A...,0.3818,positive,"AnalyzerOutput(output=NEU, probas={NEU: 0.851,..."
1,1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,...,922,752,101529,False,61963,abrams,Stacey Abrams won tonight She kept to the fact...,0.7351,positive,"AnalyzerOutput(output=NEU, probas={NEU: 0.675,..."
2,2,2022-10-18 00:00:01,"Why did Joe Rogan send his little brother, Sha...",['GAGovDebate'],[],[],0,5,,,...,17762,25727,82402,False,43808,abrams,Why did Joe Rogan send his little brother Shan...,-0.25,negative,"AnalyzerOutput(output=NEG, probas={NEG: 0.637,..."
3,3,2022-10-18 00:00:08,Viral handbag designer and EBONY Power100 Styl...,"['StaceyAbrams', 'BrandonBlackwood', 'EBONYMag']",[],[],1,8,,,...,2334,445954,4403,True,91289,abrams,Viral handbag designer and EBONY Powernumber S...,0.7184,positive,"AnalyzerOutput(output=NEU, probas={NEU: 0.793,..."
4,4,2022-10-18 00:00:11,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,[],[],[],212,528,,,...,1990,9076,42697,False,9657,kemp,THE MOST DANGEROUS THING FACING GEORGIA IS num...,0.1776,positive,"AnalyzerOutput(output=NEG, probas={NEG: 0.943,..."
