In [1]:
#import necessary libraries 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertForSequenceClassification, BertTokenizer
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import softmax
from google.cloud import storage 
import io
import tempfile
import shutil
from tqdm import tqdm
from transformers import pipeline

In [2]:
# Declare global variables
# --- Google Cloud Storage locations ---
# Path to the service-account key file; the client cell exports it as GOOGLE_APPLICATION_CREDENTIALS
GCP_KEY = '/home/jupyter/secrets/ac215.json'
# Bucket the client cell opens to download the source CSV
GCP_DATA_BUCKET = 'data-lnt'
# NOTE(review): not referenced in the visible cells — presumably the bucket for saved models; confirm
GCP_MODELS_BUCKET = 'models-lnt'
# Object path (inside GCP_DATA_BUCKET) of the VADER-labeled CSV that gets downloaded
GCP_SOURCE_FILENAME = 'processed/vader_labeled_initial.csv'
# NOTE(review): not referenced in the visible cells; the labeling pipeline cell loads
# "siebert/sentiment-roberta-large-english" instead — confirm which checkpoint is intended
MODEL_SPECIFICATION = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# NOTE(review): neither of the next two is referenced in the visible cells — presumably the
# output CSV path and the fine-tuned model directory; confirm
OUTPUT_FILEPATH = 'processed/labeled.csv'
MODEL_DIR_FINETUNE = 'fine_tune_label'

# --- Tuning knobs ---
# NOTE(review): none of the constants below is required by the visible cells; their meanings
# are inferred from their names only (confidence cutoff, train/test split fraction, epoch count,
# random seed, optimizer learning rate, batch sizes, early-stopping patience) — confirm
HIGH_CONFIDENCE_THRESHOLD = 0.9
TEST_SIZE = 0.2
NUMBER_EPOCHS = 1
RANDOM_STATE = 215
ADAM_LEARNING_RATE = 1e-5
ADAM_BATCH_SIZE = 32
LABEL_BATCH_SIZE = 32
PATIENCE = 2

In [3]:
#create GCP Client
# Export the key path before constructing the client, so the credentials variable is already set
# when storage.Client() runs.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
# Open the data bucket and address the VADER-labeled CSV object inside it.
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
# Download the whole object as text into memory; a later cell parses `content` with pandas.
content = blob.download_as_text()

In [4]:
# Select the compute device: the first CUDA GPU when one is visible, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was chosen
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
# Parse the downloaded VADER-labeled CSV text into a dataframe and discard any row
# that has a missing value in any column
df = pd.read_csv(io.StringIO(content)).dropna()
# Sanity check: head(-1) yields every row except the last; the notebook truncates the display
df.head(-1)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text,vader_label
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...,2
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...,2
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...,2
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ...",2
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...,2
...,...,...,...,...,...,...,...,...
42858,42859,Robert,Kennedy,D,GBN,20231010,"in america, see this story and i worry at the ...",2
42859,42860,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",1
42860,42861,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",1
42861,42862,Robert,Kennedy,D,CSPAN,20230720,he is more popular -- i hate to say this becau...,0


In [17]:
# Draw a fixed-size random subset of the labeled rows for model labeling.
# The seed comes from the config cell's RANDOM_STATE (215) rather than a duplicated literal,
# so the notebook's seed is controlled from a single place; the sampled rows are unchanged.
df_sample = df.sample(n=5000, random_state=RANDOM_STATE)

In [18]:
# Build a Hugging Face sentiment-analysis pipeline from the siebert RoBERTa-large English
# checkpoint, placed on the device selected earlier in the notebook
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    device=device,
)

In [19]:
# Pull the sampled transcript snippets out as a plain Python list for the pipeline
text = df_sample["text"].tolist()

In [20]:
# Classify every sampled snippet; the result is one {'label', 'score'} dict per input, in order.
# truncation=True clips inputs that exceed the model's maximum sequence length. Without it a
# single over-long transcript raises inside the model and aborts the whole labeling run.
# Inputs that already fit are unaffected.
labels = sentiment_analysis(text, truncation=True)

In [25]:
# Convert each pipeline result into a binary code: 0 for 'NEGATIVE', 1 for any other label
sentiment = [0 if result['label'] == 'NEGATIVE' else 1 for result in labels]

In [34]:
# Attach the binary model labels to the sample in place. `sentiment` is a plain list, so values
# are matched to rows by position (it was built from df_sample['text'] in row order).
# NOTE(review): the saved output of the following preview cell also shows a 'BRET_label' column
# that no visible cell creates — presumably left over from an earlier run with a misspelled
# column name; confirm it disappears after Restart Kernel -> Run All.
df_sample['BERT_label'] = sentiment

In [35]:
# Preview the labeled sample; head(-1) returns every row except the last, and the notebook
# truncates the rendered table
df_sample.head(-1)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text,vader_label,BRET_label,BERT_label
4799,4799,Donald,Trump,R,CNNW,20230923,we can remember the things we didn't know befo...,0,0,0
28279,28280,Chris,Christie,R,FOXNEWSW,20230606,. bill: tie entering the race today. sununu dr...,0,0,0
16255,16256,Ron,DeSantis,R,CNNW,20230830,"florida's governor, , who is urged people to e...",0,0,0
36051,36052,Joe,Biden,D,CSPAN,20230628,didn't. even mike pence did not have that rig...,1,0,0
29474,29475,Chris,Christie,R,FOXNEWSW,20230719,you've seen tie since he announced center his ...,0,0,0
...,...,...,...,...,...,...,...,...,...,...
37243,37244,Joe,Biden,D,FOXNEWSW,20231002,the money for ukraine that wanted. ukraine has...,0,0,0
11400,11401,Ron,DeSantis,R,KRON,20230603,campaigns says the gloves come off. it was gov...,2,1,1
8713,8713,Donald,Trump,R,FOXNEWSW,20230604,was not there. he did what -- he does his own ...,2,0,0
33936,33937,Joe,Biden,D,GBN,20230617,am i being unfair on ? mark gb unfair on ? mar...,0,0,0


In [41]:
# Count how many sampled rows with last_name 'Trump' received each model label
df_sample.loc[df_sample['last_name'] == 'Trump', 'BERT_label'].value_counts()

BERT_label
0    766
1    384
Name: count, dtype: int64