In [128]:
import pandas as pd
from transformers import pipeline

## Load data and models

In [47]:
# Load DW data
df_dw = pd.read_json('../data/interim/clean_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')

# Load Google data
df_google = pd.read_json('../data/interim/2022-daily-trending-searches.json', orient ='split', compression = 'infer')

# Load pre-trained zero shot learning model
pipe = pipeline(model="facebook/bart-large-mnli") 

## Match datasets in time

In [192]:
# Extract date range in data
start_date_dw = pd.to_datetime(df_dw['Date']).min()
end_date_dw = pd.to_datetime(df_dw['Date']).max()

# Remove rows witn no category
df_dw.dropna(subset=['cleanFocusCategory'], inplace = True)
df_dw.reset_index(drop = True, inplace = True)

# Keeps only google data within DW data date range
df_google.sort_values(by ='date', inplace = True) 
mask = (pd.to_datetime(df_google['date']) > start_date_dw) & (pd.to_datetime(df_google['date']) <= end_date_dw)
df_google_subset = df_google.loc[mask].copy()

## Zero-shot learning model

In [93]:
# Target variable (category)
focus_category_list = list(set(df_dw['cleanFocusCategory']))
print(len(focus_category_list), 'categories: ', focus_category_list)

24 categories:  ['Politics', 'Technology', 'Health', 'Learning German', 'Digital World', 'Culture', 'Nature and Environment', 'Business', 'Religion', 'Sports', 'Education', 'Migration', 'Catastrophe', 'History', 'Lifestyle', 'Media', 'Society', 'Cars and Transportation', 'Innovation', 'Offbeat', 'Human Rights', 'Science', 'Travel', 'Law and Justice']


In [101]:
# Replace 'Topic' by None and remove them for now (TODO)
df_google_subset['topic_type'].replace({'Topic': None}, inplace = True)
df_google_subset.dropna(subset=['topic_type'], inplace = True)

# Make a new column combining those 2
df_google_subset['topic_title_type'] = df_google_subset['topic_title'] + ', ' + df_google_subset['topic_type']

In [136]:
# Run the model - 1min for 5
google_keywords_category = df_google_subset['topic_title_type'].to_list()[0:5]
google_keywords = df_google_subset['topic_title'].to_list()[0:5]
category_outputs = [pipe(kw, candidate_labels = focus_category_list) for kw in google_keywords_category]
labels = list(map(lambda x: x['labels'][0], category_outputs))

In [137]:
df_prediction = pd.DataFrame(list(zip(google_keywords, google_keywords_category, labels)), columns =['Google keywords', 'Google keyword category', 'Predicted category'])
df_prediction

Unnamed: 0,Google keywords,Google keyword category,Predicted category
0,National Football League Playoffs,"National Football League Playoffs, Championship",Sports
1,Secondary school,"Secondary school, School category",Education
2,Ole Miss Rebels football,"Ole Miss Rebels football, Football team",Sports
3,Yellowstone,"Yellowstone, Drama series",Media
4,Week,"Week, Unit of time",Offbeat


In [190]:
# TODO: finding closest word to Google keywords	(e.g., National Football League Playoffs) in DF