In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import io
from google.cloud import storage

In [2]:
# Declare global variables
# Path to the GCP service-account key file. A GOOGLE_APPLICATION_CREDENTIALS
# value that is already exported in the environment takes precedence, so the
# notebook is not tied to one machine's directory layout; the original
# hard-coded path remains the fallback.
GCP_KEY = (os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
           or '/home/jupyter/secrets/ac215.json')
# Bucket that the notebook reads the raw data from and writes the labels to
GCP_DATA_BUCKET = 'data-lnt'
# Object (inside the bucket) holding the unlabeled input CSV
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
# Object (inside the bucket) that receives the VADER-labeled CSV
OUTPUT_FILEPATH = 'processed/vader_labeled_initial.csv'

In [3]:
#create GCP Client
#expose the key file path through the environment variable the Google client
#library uses for authentication; this must happen before the client is built
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
#handle on the data bucket; save_dataset() reuses this same global to upload
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
#fetch the raw unlabeled CSV as one text string; it is parsed into a
#dataframe in a later cell
content = blob.download_as_text()

In [4]:
def _compound_to_label(compound_score, threshold=0.05):
    """
    Maps a VADER compound score to a class id:
    0 (negative), 1 (neutral) or 2 (positive).
    """
    if compound_score <= -threshold:
        return 0
    if compound_score >= threshold:
        return 2
    return 1


def label(dataframe, analyzer=None, text_column='text', threshold=0.05):
    """
    Uses NLTK's VADER to evaluate the unlabeled dataset. Every entry of the
    text column is scored and the resulting class is written to a
    'vader_label' column (0 = negative, 1 = neutral, 2 = positive).

    Input:
        dataframe: Pandas dataframe holding the texts to label. It is
            modified in place (the 'vader_label' column is added or
            overwritten).
        analyzer: optional object exposing polarity_scores(text) that returns
            a dict with a 'compound' entry. Defaults to a fresh
            SentimentIntensityAnalyzer.
        text_column: name of the column containing the text (default 'text').
        threshold: compound scores <= -threshold are negative, scores
            >= threshold are positive, everything in between is neutral
            (default 0.05).
    Output: the same dataframe object, now carrying the 'vader_label' column

    Raises KeyError if text_column is not a column of the dataframe.
    """
    #define sentiment analyzer from NLTK Vader unless the caller supplied one
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    #score every mention and translate its compound score into a class id
    labels = [
        _compound_to_label(analyzer.polarity_scores(mention)['compound'], threshold)
        for mention in dataframe[text_column].tolist()
    ]

    dataframe['vader_label'] = labels

    return dataframe

In [5]:
def save_dataset(dataframe, outfilepath, gcs_bucket=None):
    """
    Saves the labeled dataframe to GCP data bucket as a CSV object.

    Input:
        dataframe: Pandas dataframe to serialize (the index is not written).
        outfilepath: GCP file path (object name inside the bucket).
        gcs_bucket: optional bucket handle exposing blob(name). Defaults to
            the notebook-level `bucket` created in the GCP client cell.
    Output: None
    """
    #fall back to the notebook-level bucket handle when none is supplied
    target_bucket = bucket if gcs_bucket is None else gcs_bucket

    #convert DataFrame to a CSV string
    csv_string = dataframe.to_csv(index=False)

    #upload the CSV string to GCP, tagged as CSV rather than the client's
    #default text/plain content type
    target_bucket.blob(outfilepath).upload_from_string(
        csv_string, content_type='text/csv'
    )

In [6]:
#parse the downloaded CSV text and keep only rows without any missing value
df = pd.read_csv(io.StringIO(content)).dropna()
#quick look at the first five rows
df.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ..."
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...


In [7]:
#run VADER over every row; df is rebound to the frame returned by label()
df = label(dataframe=df)

In [8]:
#sanity check
#NOTE(review): head(-1) returns every row except the last one, so the final
#row of df is never part of this preview; use df.tail() if the end of the
#frame matters
df.head(-1)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text,vader_label
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...,2
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...,2
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...,2
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ...",2
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...,2
...,...,...,...,...,...,...,...,...
42859,42859,Robert,Kennedy,D,GBN,20231010,"in america, see this story and i worry at the ...",2
42860,42860,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",1
42861,42861,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",1
42862,42862,Robert,Kennedy,D,CSPAN,20230720,he is more popular -- i hate to say this becau...,0


In [10]:
#Print label counts
#vader_label encoding (assigned in label()): 0 = negative, 1 = neutral, 2 = positive
df['vader_label'].value_counts()

vader_label
2    24358
0    15172
1     3334
Name: count, dtype: int64

In [11]:
#upload the labeled frame as a CSV object in the GCP data bucket
save_dataset(dataframe=df, outfilepath=OUTPUT_FILEPATH)