In [1]:
import pandas as pd
import os
from google.cloud import storage
import io
from transformers import pipeline
import torch

In [2]:
# Declare global variables

# Path to the service-account key file. An already-exported
# GOOGLE_APPLICATION_CREDENTIALS takes precedence, so the notebook also runs on
# machines where the key does not live at the default location below.
GCP_KEY = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS',
                         '/home/jupyter/secrets/ac215.json')
# Bucket the source CSV is read from and the summaries are written to
GCP_DATA_BUCKET = 'data-lnt'
# Object path (inside the bucket) of the input CSV
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
# Model identifier handed to the transformers summarization pipeline
MODEL_SPECIFICATION = 'pszemraj/led-base-book-summary'
# Object path (inside the bucket) the summary CSV is uploaded to
OUTPUT_FILEPATH = 'processed/summaries.csv'

In [3]:
# Create the GCP storage client.
# The key path is exported through GOOGLE_APPLICATION_CREDENTIALS before the
# client is built (presumably that is where storage.Client() looks for
# credentials -- confirm against the google-cloud-storage docs).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
# `bucket` is a notebook-level global; save_dataset() uses it for the upload
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
# Pull the whole source object into memory as a single string; it is parsed
# into a dataframe in a later cell
content = blob.download_as_text()

In [4]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was selected
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
def summarize(dataframe, max_mentions=50, seed=None, summarizer_fn=None):
    """
    Builds one generated summary per candidate from a sample of their mentions

    Input:
        dataframe: Pandas dataframe with 'first_name', 'last_name' and 'text'
            columns, one row per mention
        max_mentions: largest number of mentions joined into a single
            candidate's summarizer input (default 50)
        seed: optional random_state forwarded to DataFrame.sample so the
            sampled mentions are reproducible (default None = unseeded)
        summarizer_fn: optional callable used instead of the notebook-level
            `summarizer` pipeline; it receives the list of candidate texts
            plus the generation keyword arguments and must return a list of
            dicts containing a 'summary_text' key
    Output: Pandas dataframe with columns 'first_name', 'last_name' and
        'summary', one row per distinct (first_name, last_name) pair in order
        of first appearance
    """
    output_columns = ['first_name', 'last_name', 'summary']

    # rows without a name or text can neither be attributed nor joined
    mentions_df = dataframe.dropna(subset=['first_name', 'last_name', 'text'])

    # de-duplicate on the (first, last) PAIR: taking .unique() of each column
    # separately yields lists of different length (or misaligned names) as
    # soon as two candidates share a first or a last name
    candidates = mentions_df[['first_name', 'last_name']].drop_duplicates()
    if candidates.empty:
        # nothing to summarize; skip the model call entirely
        return pd.DataFrame(columns=output_columns)

    first_names = candidates['first_name'].tolist()
    last_names = candidates['last_name'].tolist()

    # one concatenated string of mentions per candidate
    candidate_text = []
    for first, last in zip(first_names, last_names):
        # select from the function argument (not a notebook global) and match
        # on both names so candidates sharing a last name are not mixed
        is_candidate = ((mentions_df['first_name'] == first)
                        & (mentions_df['last_name'] == last))
        candidate_df = mentions_df[is_candidate]

        # keep at most max_mentions random mentions; smaller groups are used
        # in full, in their original order
        if len(candidate_df) > max_mentions:
            candidate_df = candidate_df.sample(n=max_mentions,
                                               random_state=seed)

        candidate_text.append(' '.join(candidate_df['text'].astype(str)))

    # resolve the notebook-level pipeline only when no override is supplied
    run_summarizer = summarizer if summarizer_fn is None else summarizer_fn

    #create summaries
    summaries = run_summarizer(candidate_text,
                               # cut over-long inputs at the model's maximum
                               # input length instead of feeding them whole
                               truncation=True,
                               min_length=128,
                               max_length=256,
                               no_repeat_ngram_size=3,
                               encoder_no_repeat_ngram_size=3,
                               repetition_penalty=3.5,
                               num_beams=4,
                               do_sample=False,
                               early_stopping=True)

    candidate_summaries = [summary["summary_text"] for summary in summaries]

    return pd.DataFrame({'first_name': first_names,
                         'last_name': last_names,
                         'summary': candidate_summaries},
                        columns=output_columns)
    

In [6]:
def save_dataset(df, outfilepath):
    """
    Uploads a dataframe to the GCP data bucket as a CSV object

    Relies on the notebook-level `bucket` created in the GCP client cell.

    Input: Pandas dataframe, destination object path inside the bucket
    Output: None
    """
    #serialise first (index column omitted), then push the string to the blob
    payload = df.to_csv(index=False)
    bucket.blob(outfilepath).upload_from_string(payload)
    

In [7]:
#parse the downloaded CSV text and drop every row that has a missing value
df = pd.read_csv(io.StringIO(content)).dropna()
#sanity check: preview the first five rows
df.head(5)

Unnamed: 0,first_name,last_name,party,network,date,text
0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...
1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...
2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...
3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ..."
4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...


In [8]:
#define summarization pipeline
#built from the model named in MODEL_SPECIFICATION on the device selected
#earlier; summarize() reads this notebook-level `summarizer` object
summarizer = pipeline("summarization", model=MODEL_SPECIFICATION, device=device)

In [9]:
#summarize the dataframe: one row per candidate with a generated 'summary'
#column (this is the model-inference step of the notebook)
summary_df = summarize(df)

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 14.58 GiB total capacity; 774.44 MiB already allocated; 23.38 MiB free; 834.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
#check output: show the generated summary stored under row label 2
summary_df.loc[2, 'summary']

In [None]:
#save the output dataframe: upload it as CSV to OUTPUT_FILEPATH in the bucket
save_dataset(summary_df, OUTPUT_FILEPATH)