In [2]:
import pandas as pd
from transformers import pipeline
import gensim.downloader as api
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

2023-04-03 11:34:00.191186: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 11:34:01.208858: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-03 11:34:01.208880: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-03 11:34:04.436212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

## Functions 

In [3]:
def get_vectors(lst_keywords, wv):
    
    # Creating the vectorizer 
    vectorizer = CountVectorizer(stop_words='english')

    # Fit the model with our data (each keyword becomes a feature, some are split)
    X = vectorizer.fit_transform(lst_keywords)

    # Make an array and fills it in
    CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    # Words in the vocabulary (some keywords are split)
    WordsVocab=CountVectorizedData.columns

    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()

    # Looping through each row for the data
    for i in range(CountVectorizedData.shape[0]):

        # initiating a sentence with all zeros
        sentence = np.zeros(300)

        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVectorizedData.iloc[i , :] >= 1]:
            if word in wv.index_to_key:
                sentence = sentence + wv[word] 
        # Appending the sentence to the dataframe
        W2Vec_Data = W2Vec_Data.append(pd.DataFrame([sentence]))

    return W2Vec_Data

## Load data and models

In [3]:
# Load DW data
df_dw = pd.read_json('../data/interim/clean_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')

# Load Google data
df_google = pd.read_json('../data/interim/2022-daily-trending-searches.json', orient ='split', compression = 'infer')

In [4]:
df_google = pd.read_json('../data/interim/2022-daily-trending-searches.json', orient ='split', compression = 'infer')

In [5]:
df_google


Unnamed: 0,value,formattedValue,link,topic_mid,topic_title,topic_type,date,location
0,3550,"+3,550%",/trends/explore?q=/m/015cqh&date=2022-01-01+20...,/m/015cqh,Journey,Band,2022-01-01,US
1,1350,"+1,350%",/trends/explore?q=/m/0bdxs5&date=2022-01-01+20...,/m/0bdxs5,Miley Cyrus,American singer-songwriter,2022-01-01,US
2,1000,"+1,000%",/trends/explore?q=/m/02cgn0&date=2022-01-01+20...,/m/02cgn0,Countdown,Topic,2022-01-01,US
3,1000,"+1,000%",/trends/explore?q=/m/01wqvm&date=2022-01-01+20...,/m/01wqvm,Rose Parade,Topic,2022-01-01,US
4,950,+950%,/trends/explore?q=/g/11n7k56n1w&date=2022-01-0...,/g/11n7k56n1w,2021 Times Square Ball Drop,Event,2022-01-01,US
...,...,...,...,...,...,...,...,...
4217,100,+100%,/trends/explore?q=/m/0jm_&date=2022-12-31+2022...,/m/0jm_,American football,Sports,2022-12-31,US
4218,80,+80%,/trends/explore?q=/g/11b77qrp3l&date=2022-12-3...,/g/11b77qrp3l,2022,Topic,2022-12-31,US
4219,70,+70%,/trends/explore?q=/m/02hnnn&date=2022-12-31+20...,/m/02hnnn,Bowl game,Topic,2022-12-31,US
4220,60,+60%,/trends/explore?q=/m/01mtb&date=2022-12-31+202...,/m/01mtb,Cooking,Topic,2022-12-31,US


In [5]:
# Runs for 1min20
# Load pre-trained zero shot learning model
pipe = pipeline(model="facebook/bart-large-mnli") 

# Loads word2vec google model
wv = api.load('word2vec-google-news-300')

## Match datasets in time

In [6]:
# Extract date range in data
start_date_dw = pd.to_datetime(df_dw['Date']).min()
end_date_dw = pd.to_datetime(df_dw['Date']).max()

# Remove rows witn no category
df_dw.dropna(subset=['cleanFocusCategory'], inplace = True)
df_dw.reset_index(drop = True, inplace = True)

# Keeps only google data within DW data date range
df_google.sort_values(by ='date', inplace = True) 
mask = (pd.to_datetime(df_google['date']) > start_date_dw) & (pd.to_datetime(df_google['date']) <= end_date_dw)
df_google_subset = df_google.loc[mask].copy()

## Zero-shot learning model

In [7]:
# Target variable (category)
focus_category_list = list(set(df_dw['cleanFocusCategory']))
print(len(focus_category_list), 'categories: ', focus_category_list)

24 categories:  ['Catastrophe', 'Society', 'Education', 'Innovation', 'Health', 'Science', 'Offbeat', 'Business', 'Media', 'Cars and Transportation', 'Technology', 'Travel', 'Digital World', 'Sports', 'Learning German', 'History', 'Law and Justice', 'Lifestyle', 'Religion', 'Culture', 'Migration', 'Human Rights', 'Politics', 'Nature and Environment']


In [8]:
# Replace 'Topic' by None and remove them for now (TODO)
df_google_subset['topic_type'].replace({'Topic': None}, inplace = True)
df_google_subset.dropna(subset=['topic_type'], inplace = True)

# Make a new column combining those 2
df_google_subset['topic_title_type'] = df_google_subset['topic_title'] + ', ' + df_google_subset['topic_type']

In [488]:
# On Google keywords 
load = 1

if load: 
    df_prediction = pd.read_json('../data/interim/zero_shot_prediction_google_keywords_2022-01-01_2023-01-01_all.json', orient ='split', compression = 'infer')
else:
    # Runs for 14min (for 100 google keyword lines)
    df_prediction = df_google_subset[['topic_title', 'topic_type', 'topic_title_type']].copy()

    # Runs the model
    category_outputs = [pipe(kw, candidate_labels = focus_category_list) for kw in df_prediction['topic_title_type'].to_list()]
    labels = list(map(lambda x: x['labels'][0], category_outputs))

    # Add a column to dataframe
    df_prediction['Predicted category'] = labels
    df_prediction.to_json('../data/interim/zero_shot_prediction_google_keywords_2022-01-01_2023-01-01.json', orient = 'split', compression = 'infer', index = 'true')

In [56]:
# On DW keywords (to assess accuracy)
load = 0

if load: 
    df_dw_prediction = pd.read_json('../data/interim/zero_shot_prediction_dw_keywords_2022-01-01_2023-01-01_200.json', orient ='split', compression = 'infer')
else:
    # 1min for 4 words
    # 55min for 200 articles
    df_dw_prediction = df_dw[0:200].copy()

    # Runs the model
    category_outputs = [pipe(', '.join(kw), candidate_labels = focus_category_list) for kw in df_dw_prediction['keywordStringsCleanAfterFuzz']]
    labels = list(map(lambda x: x['labels'][0], category_outputs))

    # Add a column to dataframe
    df_dw_prediction['Predicted category'] = labels
    df_dw_prediction.to_json('../data/interim/zero_shot_prediction_dw_keywords_2022-01-01_2023-01-01_200.json', orient = 'split', compression = 'infer', index = 'true')

In [62]:
df_dw_prediction

Unnamed: 0,id,lastModifiedDate,Date,keywordStrings,keywordStringsCleanAfterFuzz,cleanFocusCategory,Predicted category
0,60306089,2022-01-01T02:35:51.098Z,2022-01-01,"[Belgium, explosion, apartment building]","[belgium, explosion, apartment building]",Catastrophe,Catastrophe
1,60305852,2022-01-01T03:10:21.161Z,2022-01-01,"[Srebrenica genocide, Bosnian War, Yugoslav Wa...","[srebrenica genocide, bosnian war, yugoslav wa...",Politics,Catastrophe
2,60084276,2022-01-01T07:19:32.181Z,2022-01-01,"[sport, football, Qatar 2022, World Cup, FIFA ...","[sports, football, qatar 2022, world cup, fifa...",Sports,Sports
3,60223754,2022-01-01T10:27:58.617Z,2022-01-01,"[UN, OCHA, aid, Yemen, Afghanistan, Martin Gri...","[un, ocha, aid, yemen, afghanistan, martin gri...",Society,Offbeat
4,60306645,2022-01-01T12:28:59.500Z,2022-01-01,"[irish language, European Union, official eu l...","[irish language, european union, official eu l...",Society,Media
...,...,...,...,...,...,...,...
195,60359581,2022-01-07T16:37:57.195Z,2022-01-07,"[female executives, executive boards, DAX, gen...","[female executives, executive boards, dax, gen...",Business,Business
196,60361354,2022-01-07T16:59:27.766Z,2022-01-07,"[Coronavirus, COVID-19, Austria, Karl Nehammer]","[coronavirus, covid-19, austria, karl nehammer]",Health,Health
197,60360285,2022-01-07T17:16:27.808Z,2022-01-07,"[Russia, Kazakhstan, Almaty, Moscow, unrest, p...","[russia, kazakhstan, almaty, moscow, unrest, p...",Politics,Offbeat
198,60361419,2022-01-07T17:47:16.685Z,2022-01-07,"[US special forces, Albania]","[us special forces, albania]",Politics,Offbeat


In [61]:
df_dw_prediction[['keywordStringsCleanAfterFuzz', 'cleanFocusCategory', 'Predicted category']]

Unnamed: 0,keywordStringsCleanAfterFuzz,cleanFocusCategory,Predicted category
0,"[belgium, explosion, apartment building]",Catastrophe,Catastrophe
1,"[srebrenica genocide, bosnian war, yugoslav wa...",Politics,Catastrophe
2,"[sports, football, qatar 2022, world cup, fifa...",Sports,Sports
3,"[un, ocha, aid, yemen, afghanistan, martin gri...",Society,Offbeat
4,"[irish language, european union, official eu l...",Society,Media
...,...,...,...
195,"[female executives, executive boards, dax, gen...",Business,Business
196,"[coronavirus, covid-19, austria, karl nehammer]",Health,Health
197,"[russia, kazakhstan, almaty, moscow, unrest, p...",Politics,Offbeat
198,"[us special forces, albania]",Politics,Offbeat


In [59]:
# Check prediction
accuracy_sim_dw_categ = sum(df_dw_prediction['Predicted category'] == df_dw_prediction['cleanFocusCategory']) / len(df_dw_prediction)
print('Compared to similar categories:', accuracy_sim_dw_categ)

Compared to similar categories: 0.265


## Assess performance: Compare to similar DW keyword - category pairs

In [489]:
# Vectorisation of DW keywords
load = 1

if load: 
    vec_keywords_dw = pd.read_json('../data/interim/vectorised_clean_keywords_2022-01-01_2023-01-01.json', orient ='split', compression = 'infer')
else:
    # runs during 10 min
    lst_keywords_dw = df_dw['keywordStringsCleanAfterFuzz'].astype(str)
    vec_keywords_dw = get_vectors(lst_keywords_dw, wv)
    vec_keywords_dw.to_json('../data/interim/vectorised_clean_keywords_2022-01-01_2023-01-01.json', orient = 'split', compression = 'infer', index = 'true')

In [490]:
# Vectorisation of DW category
category_dw = focus_category_list
vec_category_dw = get_vectors(category_dw, wv)

In [519]:
## Google topic on DW keywords

# Vectorisation of Google keywords
lst_keywords_ggl = df_prediction['topic_type'].astype(str) # topic_type or topic_title or topic_title_type (best predictor: topic_type)
vec_keywords_ggl = get_vectors(lst_keywords_ggl, wv)

# Compute distances: google keywords vs DW keywords 
ggl_dw_word_distances = cosine_similarity(vec_keywords_ggl, vec_keywords_dw) 

# Get indices of similar (above threshold) DW keywords
distance_threshold = 0.4
ind_ggl_to_dw = [[ind for ind in range(len(ggl_word_list)) if ggl_word_list[ind] > distance_threshold] for ggl_word_list in ggl_dw_word_distances]

# Extract most frequet category for similar DW keywords
most_freq_categories = [df_dw['cleanFocusCategory'][lst_keywords_dw.index[ind_ggl_to_dw[i]]].mode()[0] if len(ind_ggl_to_dw[i]) > 0 else None for i in range(len(ind_ggl_to_dw))]

# Make a new column in df
df_prediction['Category of similar DW keywords'] = most_freq_categories

In [516]:
distance_threshold = 0.2

# Vectorisation of Google keywords
lst_keywords_ggl = df_prediction['topic_type'].astype(str) # topic_type or topic_title or topic_title_type (best predictor: topic_type)
vec_keywords_ggl = get_vectors(lst_keywords_ggl, wv)

# Compute distances: google keywords vs DW categories
ggl_dw_categ_distances = cosine_similarity(vec_keywords_ggl, vec_category_dw) 

# Get indices of similar (above threshold) DW keywords
closest_categ = [[category_dw[dist.argmax()], round(dist.max(),2)] for dist in ggl_dw_categ_distances]

# Make a new column in df
df_prediction['Most similar DW category'] = [row[0] if row[1] > distance_threshold else None for row in closest_categ]

# Check prediction
accuracy_sim_dw_categ = sum(df_prediction['Predicted category'] == df_prediction['Most similar DW category']) / len(df_prediction)
print('Compared to similar categories:', accuracy_sim_dw_categ)

Compared to similar categories: 0.47164948453608246


In [520]:
accuracy_sim_dw_keywords = sum(df_prediction['Predicted category'] == df_prediction['Category of similar DW keywords']) / len(df_prediction)
accuracy_sim_dw_categ = sum(df_prediction['Predicted category'] == df_prediction['Most similar DW category']) / len(df_prediction)
print('Accuracy:')
print('Compared to category of similar keywords:', accuracy_sim_dw_keywords) 
print('Compared to similar categories:', accuracy_sim_dw_categ)


Accuracy:
Compared to category of similar keywords: 0.3711340206185567
Compared to similar categories: 0.47164948453608246


In [496]:
df_prediction

Unnamed: 0,topic_title,topic_type,topic_title_type,Predicted category,Category of similar DW keywords,Most similar DW category
35,National Football League Playoffs,Championship,"National Football League Playoffs, Championship",Sports,Sports,Sports
42,Yellowstone,Drama series,"Yellowstone, Drama series",Media,,Offbeat
41,Secondary school,School category,"Secondary school, School category",Education,,Education
40,Week,Unit of time,"Week, Unit of time",Offbeat,,History
39,Tampa,City in Florida,"Tampa, City in Florida",Offbeat,Politics,Learning German
...,...,...,...,...,...,...
1963,Boris Johnson,Member of Parliament of the United Kingdom,"Boris Johnson, Member of Parliament of the Uni...",Politics,Politics,Society
1967,Thor,Film series,"Thor, Film series",Media,Culture,Offbeat
1968,Thor: Love and Thunder,2022 film,"Thor: Love and Thunder, 2022 film",Media,Culture,Offbeat
1965,Shinzo Abe,Former Prime Minister of Japan,"Shinzo Abe, Former Prime Minister of Japan",Politics,Politics,Learning German
