In [47]:
import pandas as pd
from transformers import pipeline
import gensim.downloader as api
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Functions 

In [36]:
def get_vectors(lst_keywords, wv):
    
    # Creating the vectorizer 
    vectorizer = CountVectorizer(stop_words='english')

    # Fit the model with our data (each keyword becomes a feature, some are split)
    X = vectorizer.fit_transform(lst_keywords)

    # Make an array and fills it in
    CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    # Words in the vocabulary (some keywords are split)
    WordsVocab=CountVectorizedData.columns

    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()

    # Looping through each row for the data
    for i in range(CountVectorizedData.shape[0]):

        # initiating a sentence with all zeros
        sentence = np.zeros(300)

        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVectorizedData.iloc[i , :] >= 1]:
            if word in wv.index_to_key:
                sentence = sentence + wv[word] 
        # Appending the sentence to the dataframe
        W2Vec_Data = W2Vec_Data.append(pd.DataFrame([sentence]))

    return W2Vec_Data

## Load data and models

In [2]:
# Load DW data
df_dw = pd.read_json('../data/interim/clean_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')

# Load Google data
df_google = pd.read_json('../data/interim/2022-daily-trending-searches.json', orient ='split', compression = 'infer')

In [3]:
# Runs for 1min20
# Load pre-trained zero shot learning model
pipe = pipeline(model="facebook/bart-large-mnli") 

# Loads word2vec google model
wv = api.load('word2vec-google-news-300')

## Match datasets in time

In [4]:
# Extract date range in data
start_date_dw = pd.to_datetime(df_dw['Date']).min()
end_date_dw = pd.to_datetime(df_dw['Date']).max()

# Remove rows witn no category
df_dw.dropna(subset=['cleanFocusCategory'], inplace = True)
df_dw.reset_index(drop = True, inplace = True)

# Keeps only google data within DW data date range
df_google.sort_values(by ='date', inplace = True) 
mask = (pd.to_datetime(df_google['date']) > start_date_dw) & (pd.to_datetime(df_google['date']) <= end_date_dw)
df_google_subset = df_google.loc[mask].copy()

## Zero-shot learning model

In [5]:
# Target variable (category)
focus_category_list = list(set(df_dw['cleanFocusCategory']))
print(len(focus_category_list), 'categories: ', focus_category_list)

24 categories:  ['Migration', 'Health', 'Travel', 'Law and Justice', 'Learning German', 'Digital World', 'Cars and Transportation', 'Media', 'Business', 'Sports', 'Nature and Environment', 'Offbeat', 'Education', 'Human Rights', 'Culture', 'Technology', 'Politics', 'History', 'Society', 'Lifestyle', 'Catastrophe', 'Science', 'Innovation', 'Religion']


In [321]:
# Replace 'Topic' by None and remove them for now (TODO)
df_google_subset['topic_type'].replace({'Topic': None}, inplace = True)
df_google_subset.dropna(subset=['topic_type'], inplace = True)

# Make a new column combining those 2
df_google_subset['topic_title_type'] = df_google_subset['topic_title'] + ', ' + df_google_subset['topic_type']

In [462]:
load = 1

if load: 
    df_prediction = pd.read_json('../data/interim/zero_shot_prediction_google_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')
else:
    # Runs for 14min (for 100 google keyword lines)
    df_prediction = df_google_subset.iloc[0:100][['topic_title', 'topic_type', 'topic_title_type']].copy()

    # Runs the model
    category_outputs = [pipe(kw, candidate_labels = focus_category_list) for kw in df_prediction['topic_title_type'].to_list()]
    labels = list(map(lambda x: x['labels'][0], category_outputs))

    # Add a column to dataframe
    df_prediction['Predicted category'] = labels
    df_prediction.to_json('../data/interim/zero_shot_prediction_google_keywords_2022-01-01_2023-01-01.json', orient = 'split', compression = 'infer', index = 'true')

## Assess performance: Compare to similar DW keyword - category pairs

In [464]:
# Vectorisation of DW keywords
load = 1

if load: 
    vec_keywords_dw = pd.read_json('../data/interim/vectorised_clean_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')
else:
    # runs during 10 min
    lst_keywords_dw = df_dw['keywordStringsCleanAfterFuzz'].astype(str)
    vec_keywords_dw = get_vectors(lst_keywords_dw, wv)
    vec_keywords_dw.to_json('../data/interim/vectorised_clean_keywords_2022-01-01_2023-01-01.json', orient = 'split', compression = 'infer', index = 'true')

In [465]:
# Vectorisation of DW category
category_dw = focus_category_list
vec_category_dw = get_vectors(category_dw, wv)

In [480]:
## Google topic on DW keywords

# Vectorisation of Google keywords
lst_keywords_ggl = df_prediction['topic_type'].astype(str) # topic_type or topic_title or topic_title_type (best predictor: topic_type)
vec_keywords_ggl = get_vectors(lst_keywords_ggl, wv)

# Compute distances: google keywords vs DW keywords 
ggl_dw_word_distances = cosine_similarity(vec_keywords_ggl, vec_keywords_dw) 

# Get indices of similar (above threshold) DW keywords
distance_threshold = 0.5
ind_ggl_to_dw = [[ind for ind in range(len(ggl_word_list)) if ggl_word_list[ind] > distance_threshold] for ggl_word_list in ggl_dw_word_distances]

# Extract most frequet category for similar DW keywords
most_freq_categories = [df_dw['cleanFocusCategory'][lst_keywords_dw.index[ind_ggl_to_dw[i]]].mode()[0] if len(ind_ggl_to_dw[i]) > 0 else None for i in range(len(ind_ggl_to_dw))]

# Make a new column in df
df_prediction['Category of similar DW keywords'] = most_freq_categories

In [481]:
## Google topic on DW categories 

# Vectorisation of Google keywords
lst_keywords_ggl = df_prediction['topic_type'].astype(str) # topic_type or topic_title or topic_title_type (best predictor: topic_type)
vec_keywords_ggl = get_vectors(lst_keywords_ggl, wv)

# Compute distances: google keywords vs DW categories
ggl_dw_categ_distances = cosine_similarity(vec_keywords_ggl, vec_category_dw) 

# Get indices of similar (above threshold) DW keywords
closest_categ = [[category_dw[dist.argmax()], round(dist.max(),2)] for dist in ggl_dw_categ_distances]

# Make a new column in df
distance_threshold = 0.2
df_prediction['Most similar DW category'] = [row[0] if row[1] > distance_threshold else None for row in closest_categ]

In [482]:
accuracy_sim_dw_keywords = sum(df_prediction['Predicted category'] == df_prediction['Category of similar DW keywords']) / len(df_prediction)
accuracy_sim_dw_categ = sum(df_prediction['Predicted category'] == df_prediction['Most similar DW category']) / len(df_prediction)
print('Accuracy:')
print('Compared to category of similar keywords:', accuracy_sim_dw_keywords) 
print('Compared to similar categories:', accuracy_sim_dw_categ)

Accuracy:
Compared to category of similar keywords: 0.67
Compared to similar categories: 0.7


In [469]:
df_prediction

Unnamed: 0,topic_title,topic_type,topic_title_type,Predicted category,Category of similar DW keywords,Most similar DW category
35,National Football League Playoffs,Championship,"National Football League Playoffs, Championship",Sports,Sports,Sports
42,Yellowstone,Drama series,"Yellowstone, Drama series",Media,,Offbeat
41,Secondary school,School category,"Secondary school, School category",Education,,Education
40,Week,Unit of time,"Week, Unit of time",Offbeat,,History
39,Tampa,City in Florida,"Tampa, City in Florida",Offbeat,Politics,Learning German
...,...,...,...,...,...,...
194,Pittsburgh Steelers,American football team,"Pittsburgh Steelers, American football team",Sports,Sports,Sports
200,National Football League Playoffs,Championship,"National Football League Playoffs, Championship",Sports,Sports,Sports
198,NFL,League,"NFL, League",Sports,Sports,Sports
196,Super Bowl,Championship,"Super Bowl, Championship",Sports,Sports,Sports
