In [1]:
import pandas as pd
import os
from google.cloud import storage
import io
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np
import torch

In [2]:
# Declare global variables
# Path to the service-account key. An already-exported
# GOOGLE_APPLICATION_CREDENTIALS wins so the notebook is not tied to one
# machine's absolute path; otherwise fall back to the original location.
GCP_KEY = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') or '/home/jupyter/secrets/ac215.json'
# Bucket that holds both the raw input and the processed output
GCP_DATA_BUCKET = 'data-lnt'
# Object path (inside the bucket) of the unlabeled mentions CSV
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
# Hugging Face model id used for keyphrase extraction
MODEL_SPECIFICATION = 'ml6team/keyphrase-extraction-kbir-inspec'
# Object path (inside the bucket) where the key words CSV is written
OUTPUT_FILEPATH = 'processed/keywords.csv'

In [3]:
# Connect to Google Cloud Storage and pull the raw CSV into memory as text
source_filename = GCP_SOURCE_FILENAME

# The storage client discovers its credentials through this environment variable
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": GCP_KEY})
storage_client = storage.Client()

# `bucket` stays at notebook level because later cells reuse it for the upload
bucket = storage_client.bucket(GCP_DATA_BUCKET)
blob = bucket.blob(source_filename)
content = blob.download_as_text()

In [4]:
# Prefer the first CUDA GPU when one is visible; otherwise run on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was picked
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
# Keyphrase extraction pipeline built on top of token classification
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """
    Token-classification pipeline that returns the distinct keyphrases of a text.

    The constructor takes a model identifier (anything accepted by
    `from_pretrained`) and loads both the model and its tokenizer from it;
    all other positional/keyword arguments are forwarded to
    TokenClassificationPipeline unchanged.
    """

    def __init__(self, model, *args, **kwargs):
        # Load the model first, then the tokenizer, from the same identifier
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        token_splitter = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=token_splitter,
            **kwargs
        )

    def postprocess(self, all_outputs):
        """
        Reduce the parent pipeline's output to a numpy array of phrases.

        The parent is asked to group tokens with the SIMPLE aggregation
        strategy; each group's "word" is stripped of surrounding whitespace
        and np.unique removes duplicates (returning them sorted).
        """
        grouped_entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = []
        for entity in grouped_entities:
            phrases.append(entity.get("word").strip())
        return np.unique(phrases)

In [7]:
def extract(dataframe, max_mentions=1000, random_state=None, keyphrase_extractor=None):
    """
    Builds one row of key words per candidate from their text mentions.

    Input:
        dataframe: Pandas dataframe with 'first_name', 'last_name' and 'text'
            columns, one row per mention
        max_mentions: largest number of mentions used per candidate; when a
            candidate has more, a random sample of this size is taken
        random_state: forwarded to DataFrame.sample so the sample can be
            made reproducible (None keeps it random)
        keyphrase_extractor: callable taking a list of strings and returning
            one iterable of key words per string; defaults to the
            notebook-level `extractor` pipeline
    Output: Pandas dataframe with columns 'first_name', 'last_name',
        'key_words' (a list per candidate), in order of first appearance
    """
    #only touch the notebook-level pipeline when no extractor was supplied
    pipeline = extractor if keyphrase_extractor is None else keyphrase_extractor

    #deduplicate on the (first, last) pair: deduplicating each column on its
    #own misaligns the names as soon as two candidates share either one
    candidates = dataframe[['first_name', 'last_name']].drop_duplicates()
    first_names = candidates['first_name'].tolist()
    last_names = candidates['last_name'].tolist()

    #nothing to extract: return an empty frame instead of calling the model
    if not first_names:
        return pd.DataFrame(columns=['first_name', 'last_name', 'key_words'])

    #one concatenated string of mentions per candidate
    candidate_text = []
    for first, last in zip(first_names, last_names):
        #match on the full name, using the frame that was passed in
        is_candidate = (dataframe['first_name'] == first) & (dataframe['last_name'] == last)
        candidate_df = dataframe[is_candidate]

        #cap the number of mentions explicitly rather than relying on
        #sample() raising for small groups
        if len(candidate_df) > max_mentions:
            candidate_df = candidate_df.sample(n=max_mentions, random_state=random_state)

        #convert mentions to one string
        candidate_text.append(' '.join(candidate_df['text'].astype(str)))

    #create key words
    extraction = pipeline(candidate_text)
    candidate_key_words = [list(key_words) for key_words in extraction]

    return pd.DataFrame({
        'first_name': first_names,
        'last_name': last_names,
        'key_words': candidate_key_words,
    })

In [8]:
def save_dataset(df, outfilepath):
    """
    Saves a dataframe as a CSV object in the GCP data bucket.

    Uses the notebook-level `bucket` created in the GCP client cell.

    Input:
        df: Pandas dataframe to serialize (the index is not written)
        outfilepath: object path inside the bucket
    Output: None

    Example:
        save_dataset(dataframe, 'processed/keywords.csv')
    """
    #convert DataFrame to a CSV string
    csv_string = df.to_csv(index=False)

    #upload the CSV string to GCP, labelling the object as CSV instead of
    #the library's default text/plain content type
    blob = bucket.blob(outfilepath)
    blob.upload_from_string(csv_string, content_type='text/csv')
    

In [9]:
# Parse the downloaded CSV text and discard every row that has a missing value
df = pd.read_csv(io.StringIO(content)).dropna()

# Peek at the first five rows
df.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ..."
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...


In [10]:
# Instantiate the keyphrase extraction pipeline from the configured model id
# and place it on the device selected earlier in the notebook
extractor = KeyphraseExtractionPipeline(model=MODEL_SPECIFICATION, device=device)

In [11]:
# Extract key words for every candidate in the dataframe
extract_df = extract(df)

In [12]:
# Sanity check
# NOTE(review): head(-1) returns every row except the last one, so the final
# candidate is not displayed - confirm this is intended rather than head()
extract_df.head(-1)

Unnamed: 0,first_name,last_name,key_words
0,Marianne,Williamson,"[corruption, democratic party, fair, journalis..."
1,Donald,Trump,"[freedom of speech, rigged elections, un secur..."
2,Ron,DeSantis,"[immigration sanctuary laws, murder states, na..."
3,Nikki,Haley,"[border security, united nations]"
4,Vivek,Ramaswamy,[]
5,Mike,Pence,"[e, fake elector scheme, fulton county prosecu..."
6,Tim,Scott,"[former vice president, republican primary deb..."
7,Chris,Christie,"[republican national committee, united states]"
8,Doug,Burgum,"[education system, national security, republic..."
9,Asa,Hutchinson,"[14th amendment, arkansas governor, biotech en..."


In [13]:
# Save the key words dataframe to OUTPUT_FILEPATH via save_dataset
save_dataset(extract_df, OUTPUT_FILEPATH)