In [1]:
import pandas as pd
import os
from google.cloud import storage
import io
from transformers import pipeline
import torch

In [2]:
# Declare global variables
# Path to the service-account key. An already-exported
# GOOGLE_APPLICATION_CREDENTIALS wins, so the notebook is not tied to one
# machine's filesystem layout; otherwise the original default is used.
GCP_KEY = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", '/home/jupyter/secrets/ac215.json')
# Bucket and object names for the raw input and the processed output
GCP_DATA_BUCKET = 'data-lnt'
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
# NOTE(review): no visible cell references MODEL_SPECIFICATION; the
# summarization pipeline cell hard-codes a different model name. Confirm
# whether this constant is still needed.
MODEL_SPECIFICATION = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
OUTPUT_FILEPATH = 'processed/summaries.csv'

In [3]:
#create GCP Client
# Export the key path before constructing the client
# (presumably storage.Client() picks its credentials up from this variable - confirm)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
# `bucket` is kept at module level; save_dataset() uses it for uploads
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
# Download the whole source object as one text string; it is parsed into a dataframe later
content = blob.download_as_text()

In [4]:
# Pick the first CUDA GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was selected
print(f"Using device: {device}")

Using device: cuda:0


In [22]:
def summarize(dataframe, summarizer_fn=None):
    """
    Build one summary per candidate from all of that candidate's mentions.

    Input:
        dataframe: Pandas dataframe with 'first_name', 'last_name', 'party'
            and 'text' columns, one row per mention.
        summarizer_fn: optional callable used instead of the module-level
            `summarizer`. It is called as summarizer_fn(text, **kwargs) and
            must return a list whose first item has a "summary_text" key.
    Output:
        Pandas dataframe with columns 'first_name', 'last_name', 'party',
        'summary' holding one row per distinct (first_name, last_name) pair,
        in order of first appearance. An empty input gives an empty frame
        with those columns.
    """
    columns = ['first_name', 'last_name', 'party', 'summary']

    #nothing to summarize: return the empty frame without touching the model
    if dataframe.empty:
        return pd.DataFrame(columns=columns)

    if summarizer_fn is None:
        #looked up at call time, so the pipeline cell may run after this definition
        summarizer_fn = summarizer

    records = []
    #group the *argument* (not a global frame) by full name so each candidate
    #is summarized exactly once, and candidates sharing a last name stay apart
    grouped = dataframe.groupby(['first_name', 'last_name'], sort=False)
    for (first, last), candidate_df in grouped:
        #convert all mentions for the candidate to one string
        text = ' '.join(candidate_df['text'].astype(str))

        #create summary; truncation keeps over-long joined text within the
        #model's input limit instead of failing on it
        result = summarizer_fn(
            text,
            max_length=300,
            min_length=150,
            do_sample=False,
            truncation=True,
        )

        records.append({
            'first_name': first,
            'last_name': last,
            #party of the candidate's first mention
            'party': candidate_df['party'].iloc[0],
            'summary': result[0]["summary_text"],
        })

    #build the frame once rather than appending row by row
    return pd.DataFrame(records, columns=columns)
    

In [14]:
def save_dataset(df, outfilepath):
    """
    Upload a dataframe to the GCP data bucket as CSV text.

    Input: Pandas dataframe, destination path of the object inside the bucket
    Output: None

    The frame is serialised without its index column and written through the
    module-level `bucket` handle.
    """
    #object handle for the destination path in the bucket
    target = bucket.blob(outfilepath)

    #serialise to a CSV string (no index) and upload it in one call
    target.upload_from_string(df.to_csv(index=False))
    

In [15]:
#parse the downloaded CSV text and keep only rows with no missing values
raw_buffer = io.StringIO(content)
df = pd.read_csv(raw_buffer).dropna()
#Sanity check: show the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ..."
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...


In [16]:
#build the Hugging Face summarization pipeline on the device selected earlier
summarizer = pipeline(
    task="summarization",
    model="pszemraj/led-base-book-summary",
    device=device,
)

In [17]:
#small trial subset: only the rows for two candidate last names
is_test_candidate = df['last_name'].isin(['Williamson', 'Kennedy'])
test_df = df[is_test_candidate]

In [23]:
#run the summarization over the trial subset (test_df) and keep the result in summary_df
#NOTE: the recorded run of this cell ended in a KeyboardInterrupt
summary_df = summarize(test_df)



KeyboardInterrupt: 