# Pipeline 1: Google-To-DW pipeline

The aim of this notebook is to answer the question: is DW covering what customers want?

Approach: Extract trending topics on Google and compare to what DW covers

<img src="../reports/illustrations/pipeline1.png" width=800 />

We tried 2 different approaches:

**Approach 1**: we used pre-trained models such as ChatGPT and zero-shot learning. \
This approach was overall less effective. Our attempts can be found in pipeline2_playground_approach1_*.ipynb

**Approach 2**: we trained our own models. \
The best-performing models are summarised here. Our other attempts can be found in pipeline2_playground_approach2.ipynb

<img src="../reports/illustrations/pipeline2_approaches.png" width=800 />

In [1]:
# Import useful libraries
import datetime as d
import os
import os.path as op
import sys

import numpy as np
import pandas as pd

# NOTE(review): later cells also use gensim, CountVectorizer, MinMaxScaler,
# train_test_split, SVC, accuracy_score, precision_recall_fscore_support,
# relativedelta and plt, none of which are imported anywhere in this notebook
# (presumably gensim, scikit-learn, python-dateutil and matplotlib.pyplot).
# They must be imported here for the notebook to survive Restart & Run All.

# Import functions from source folder
sys.path.append('../src/')
from data.preprocess_keywords import make_cleaned_keywords_df
from data.make_datasets import get_data, get_daily_trending_searches

In [2]:
# Date window analysed by this notebook, given as 'YYYY-MM-DD' strings
start_date, end_date = '2019-01-01', '2019-02-01'

# Directory in which the intermediate data files are stored
path_to_data_files = '../data/interim/'

# Extract trending topics from Google

In [9]:
# The daily trending searches are cached on disk in a file named after the date range:
# the cached file is loaded when present, otherwise the topics are extracted from Google.
# If error with the number of requests, change the header in make_datasets.py
# (https://stackoverflow.com/questions/50571317/pytrends-the-request-failed-google-returned-a-response-with-code-429#:~:text=I%20am%20trustworthy.-,Solution,Visit%20the%20Google%20Trend%20page%20and%20perform%20a%20search%20for,-a%20trend%3B%20it)

google_file = f'{path_to_data_files}{start_date}_{end_date}_World_daily_trending_searches.json'

if os.path.isfile(google_file):
    df_google = pd.read_json(google_file, orient='split', compression='infer')
else:
    df_google = get_daily_trending_searches(path_to_data_files, start_date, end_date=end_date)

# Load DW data

In [15]:
# Cleaned-keywords file for the selected date range
clean_data_file = f'../data/interim/clean_keywords_{start_date}_{end_date}.json'

# Only build the cleaned file when it is not on disk yet
if not os.path.isfile(clean_data_file):

    # Path to raw data
    data_file = '../data/raw/CMS_2010_to_June_2022_ENGLISH.json'

    # Load the raw data and keep only the rows within the time range
    df_subset = get_data(data_file, start_date, end_date)

    # Clean the keywords (presumably this also writes clean_data_file - TODO confirm)
    make_cleaned_keywords_df(df_subset, start_date, end_date)


# Load the cleaned file, drop rows with no category and renumber the index
df_dw = (
    pd.read_json(clean_data_file, orient='split', compression='infer')
    .dropna(subset=['cleanFocusCategory'])
    .reset_index(drop=True)
)

# Models: map google keywords to DW category

In [13]:
# Preview of the DW data: one row per article with its dates, keyword lists,
# focus categories and teaser text
df_dw.head()

Unnamed: 0,id,lastModifiedDate,Date,keywordStrings,cleanFocusParentCategory,cleanFocusCategory,teaser,keywordStringsCleanAfterFuzz
0,46912921,2019-01-01T03:57:28.904Z,2019-01-01,"[NASA, OSIRIS-REx, Bennu, asteroid]",Science,Science,The OSIRIS-REx spacecraft had arrived at the l...,"[nasa, osiris-rex, bennu, asteroid]"
1,46911356,2019-01-01T06:11:50.527Z,2019-01-01,"[English Channel, migration, boats, illegal im...",Law and Justice,Law and Justice,The UK is withdrawing patrol ships from overse...,"[english channel, migration, boats, illegal im..."
2,46909694,2019-01-01T06:14:35.563Z,2019-01-01,"[Brazil, Jair Bolsonaro, Chicago economics, Ha...",Politics,Politics,Brazil is inaugurating President Jair Bolsonar...,"[brazil, jair bolsonaro, chicago economics, ha..."
3,46912694,2019-01-01T08:26:11.599Z,2019-01-01,"[Japan, Tokyo, Harajuku, attack]",Law and Justice,Crime,"A man with an ""intent to murder"" has driven a ...","[japan, tokyo, harajuku, attack]"
4,46910092,2019-01-01T09:05:00.736Z,2019-01-01,"[Asia, Bangladesh, elections, Kamal Hossain, S...",Politics,Politics,"In an exclusive interview with DW, Kamal Hossa...","[asia, bangladesh, elections, kamal hossain, s..."


In [16]:
# Preview of the Google trending topics.
# NOTE(review): the index shown in the output is not a clean 0..n-1 range (0, 24, 23, ...),
# so later cells must access rows by position (.iloc) rather than by label.
df_google.head()

Unnamed: 0,value,formattedValue,link,topic_mid,topic_title,topic_type,date,location
0,174300,Breakout,/trends/explore?q=/m/02vxn&date=2019-01-02+201...,/m/02vxn,Film,Topic,2019-01-02,World
24,39500,Breakout,/trends/explore?q=/m/014dgf&date=2019-01-02+20...,/m/014dgf,Sales,Topic,2019-01-02,World
23,39700,Breakout,/trends/explore?q=/m/0jg24&date=2019-01-02+201...,/m/0jg24,Image,Topic,2019-01-02,World
22,39750,Breakout,/trends/explore?q=/m/0mgkg&date=2019-01-02+201...,/m/0mgkg,Amazon.com,E-commerce company,2019-01-02,World
21,39900,Breakout,/trends/explore?q=/m/0glpjll&date=2019-01-02+2...,/m/0glpjll,Instagram,Social networking service,2019-01-02,World


In [None]:
# Load a pre-trained word2vec model from Google - you may need to download this first:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
# GoogleModel is read as a global by FunctionText2Vec to look up word vectors.
# NOTE(review): the path below is an absolute path on the original author's machine;
# point it at your local copy of the .bin file before running.
# NOTE(review): `gensim` is not imported in the visible imports cell - TODO confirm and add it.
pretrained_w2v_model_dir = '/home/marios/local_data_s2ds/GoogleNews-vectors-negative300.bin'
GoogleModel = gensim.models.KeyedVectors.load_word2vec_format(pretrained_w2v_model_dir, binary=True,)

In [None]:
# Build the training corpus: one comma-separated string of keywords per DW article.
# The corpus must come from df_dw (the original referenced an undefined `df`) so that
# its rows line up with the Category column assigned from df_dw below.
corpus = df_dw['keywordStrings'].apply(', '.join).tolist()

# Count vectorization of the text (English stop words removed)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Vocabulary words ordered by their column index in X. Taken from `vocabulary_`
# because `get_feature_names()` no longer exists in recent scikit-learn releases.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# One column per vocabulary word, plus the parent category as the target variable
CountVectorizedData = pd.DataFrame(X.toarray(), columns=feature_names)
CountVectorizedData['Category'] = df_dw['cleanFocusParentCategory'].values
print(CountVectorizedData.shape)

# Every column except the trailing 'Category' target is a vocabulary word
WordsVocab = CountVectorizedData.columns[:-1]
print(f'Number of words after count vectorization: {len(WordsVocab)}')


In [None]:
# Embedding the whole corpus is slow (about an hour on the original author's machine),
# so only rerun this cell if you want to overwrite the saved embeddings.
output_dir = '/home/marios/local_data_s2ds/'
file_name = 'w2v_data_ALL_dirty.npy'
w2v_output_file = op.join(output_dir, file_name)

W2Vec_Data = FunctionText2Vec(corpus)
np.save(w2v_output_file, W2Vec_Data)

In [None]:
# If the embeddings were already computed and saved, load them directly.
# This replaces W2Vec_Data with the saved array wrapped in a DataFrame.
# NOTE(review): absolute path on the original author's machine; it must match the
# location used when the embeddings were saved.
word_embedding_dir = '/home/marios/local_data_s2ds/w2v_data_ALL_dirty.npy'
W2Vec_Data = pd.DataFrame(np.load(word_embedding_dir))

In [None]:
# Attach the target variable to the embeddings (rows are matched on a fresh 0..n-1 index)
W2Vec_Data.reset_index(drop=True, inplace=True)
W2Vec_Data['Category'] = CountVectorizedData['Category']
DataForML = W2Vec_Data.copy()

# Encode the string categories as integers 1..K (alphabetical order) so we can do ML
primary_categories = sorted(np.unique(DataForML.Category).tolist())
prim_cat_dict = {cat: code for code, cat in enumerate(primary_categories, start=1)}
DataForML.Category = DataForML.Category.apply(lambda cat: prim_cat_dict.get(cat, cat))
DataForML.Category = DataForML.Category.astype(int)
np.unique(DataForML.Category)

In [None]:
# Separate the target variable (last column) from the predictor variables (all other columns)
TargetVariable = DataForML.columns[-1]
Predictors = DataForML.columns[:-1]
X = DataForML[Predictors].values
y = DataForML[TargetVariable].values

# Split BEFORE scaling so that the held-out rows never influence the scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)

# Fit the scaler on the training rows only, then apply it to both splits.
# PredictorScalerFit is kept so the same scaling can be applied to new data later.
PredictorScaler = MinMaxScaler()
PredictorScalerFit = PredictorScaler.fit(X_train)
X_train = PredictorScalerFit.transform(X_train)
X_test = PredictorScalerFit.transform(X_test)

# Try SVC: fit and evaluate - OR load the model further down if you already have it
model = SVC(C=20, kernel='rbf')
model.fit(X_train, y_train)
preds = model.predict(X_test)
svc_metrics = evaluate_metrics(y_test, preds)

# Save the fitted model (np.save pickles the estimator object)
# NOTE(review): absolute path on the original author's machine
output_dir = '/home/marios/local_data_s2ds/'
file_name = 'SVC_model_category_classification.npy'
np.save(op.join(output_dir, file_name), model)

# Show the hold-out metrics as the cell output (they were previously computed and discarded)
svc_metrics

In [None]:
# if you'd like to optimize hyperparameters- run this over the weekend maybe
# NOTE(review): this cell is intentionally disabled (everything below is commented out).
# NOTE(review): scoring='f1' is presumably only valid for a binary target; the target
# here has several categories, so something like 'f1_weighted' would be needed - TODO confirm.

# params_grid = {
#     'C': [25, 50, 150],
#     'kernel': ['poly', 'rbf', 'sigmoid']
# }
# model = SVC()
# # Define a GridSearchCV to search the best parameters
# grid_search_balanced = GridSearchCV(estimator = model, 
#                            param_grid = params_grid, 
#                            scoring='f1',
#                            cv = 3, verbose = 1)
# # Search the best parameters with training data
# model_fit_balanced = grid_search_balanced.fit(X_train, y_train)
# best_params_balanced = grid_search_balanced.best_params_

In [None]:
# Load the previously saved model directly if you have it (replaces `model`).
# SECURITY NOTE(review): allow_pickle=True unpickles the file, which can execute
# arbitrary code - only load a file you created yourself.
# NOTE(review): absolute path on the original author's machine.
model = np.load('/home/marios/local_data_s2ds/SVC_model_category_classification.npy', allow_pickle=True).tolist()

In [None]:
# Count vectorization of the Google trending topics.
# NOTE: this deliberately re-fits the global `vectorizer` (and redefines WordsVocab),
# because FunctionText2Vec reads them as globals.
corpus2 = df_google['topic_title'].tolist()
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus2)

# Vocabulary words ordered by their column index in X. Taken from `vocabulary_`
# because `get_feature_names()` no longer exists in recent scikit-learn releases.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# One column per vocabulary word (there is no target column for the Google data)
CountVectorizedData = pd.DataFrame(X.toarray(), columns=feature_names)
print(CountVectorizedData.shape)

WordsVocab = CountVectorizedData.columns
print(f'Number of words after count vectorization: {len(WordsVocab)}')

# Word2vec embedding of every trending topic
W2Vec_Data2 = FunctionText2Vec(corpus2)


In [None]:
# Apply to the Google data the same preparation that was applied to the training data
Predictors = W2Vec_Data2.columns
X_google_test = W2Vec_Data2[Predictors].values

# Reuse the scaler fitted on the DW training data (PredictorScalerFit, from the training
# cell). Fitting a new scaler on the Google data would put the features on a different
# scale from the one the model was trained on.
X_google_test = PredictorScalerFit.transform(X_google_test)

# Now let's predict using the model
preds = model.predict(X_google_test)

# Convert the integer predictions back to the original category names to interpret them
prim_num_dict = {num: cat for cat, num in prim_cat_dict.items()}
df_google['predicted_category'] = preds
df_google['predicted_category'] = (
    df_google['predicted_category']
    .map(lambda code: prim_num_dict.get(code, code))
    .astype(str)
)

In [None]:
# For every Google trend, flag (1.0 / 0.0) whether DW published an article carrying that
# exact keyword within one month after the trend date.
# Fixes relative to the previous version of this cell:
#  - slices df_dw (the previous code referenced an undefined `df`)
#  - iterates by position; df_google's index is not 0..n-1, so label lookups hit wrong rows
#  - builds the column once instead of chained assignment, which may not write through
#  - no longer overwrites the notebook-level start_date / end_date settings
#  - computes the DW keyword set once per distinct trend date rather than once per row
published_keywords_by_date = {}
dw_published = []
for trend_date, topic_title in zip(df_google['date'], df_google['topic_title']):
    if trend_date not in published_keywords_by_date:
        # Window: [trend date, trend date + 1 month)
        window_start = pd.Timestamp(trend_date)
        window_end = window_start + pd.DateOffset(months=1)
        df_slice = truncate_data(df_dw, window_start, window_end)
        published_keywords_by_date[trend_date] = {
            w for sublist in df_slice.keywordStrings for w in sublist
        }
    is_published = str(topic_title) in published_keywords_by_date[trend_date]
    dw_published.append(1.0 if is_published else 0.0)

df_google['dw_published'] = dw_published

In [None]:
# For each predicted category, compute the share of Google trends flagged as published
# by DW (dw_published == 1.0) and draw it as one bar per category
fig, ax = plt.subplots()
for cat, df_sub in df_google.groupby('predicted_category'):
    print(cat)
    ratio = df_sub.dw_published.sum() / len(df_sub)
    print(ratio)
    ax.bar(cat, ratio)
fig.autofmt_xdate(rotation=75)

ax.set_ylabel('Proportion of google trends covered \nby DW within a month after trending')
plt.show()

In [1]:
# functions
def truncate_data(df, start_date, end_date):
    """Return the rows of ``df`` last modified within ``[start_date, end_date)``.

    The day part (first 10 characters, 'YYYY-MM-DD') of the ``lastModifiedDate``
    strings is parsed into a ``dt_lastModifiedDate`` column on a copy of ``df``;
    the caller's frame is left untouched. Rows whose date is missing are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``lastModifiedDate`` starting with 'YYYY-MM-DD'.
    start_date, end_date :
        Lower (inclusive) and upper (exclusive) bounds, comparable with datetimes.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``dt_lastModifiedDate``, including that column.
    """
    # Missing / non-string values become NaT instead of raising
    modified_day = pd.to_datetime(df['lastModifiedDate'].str[:10], format='%Y-%m-%d')

    # assign() returns a new frame, so the input DataFrame is not mutated
    dated = df.assign(dt_lastModifiedDate=modified_day).sort_values(by='dt_lastModifiedDate')

    # NaT compares False on both sides, so undated rows drop out here
    in_window = (dated['dt_lastModifiedDate'] >= start_date) & (dated['dt_lastModifiedDate'] < end_date)
    return dated[in_window]

# This function uses the pretrained Google model to get word embeddings for the vectorized input
def FunctionText2Vec(inpTextData):
    """Embed each text as the sum of the word2vec vectors of its vocabulary words.

    Relies on two notebook globals: ``vectorizer`` (an already fitted count
    vectorizer) and ``GoogleModel`` (the loaded word2vec key-vectors).

    Parameters
    ----------
    inpTextData : iterable of str
        Texts to embed; each one is tokenised by ``vectorizer.transform``.

    Returns
    -------
    pandas.DataFrame
        One row per input text (index 0..n-1) and one column per embedding
        dimension. Words absent from ``GoogleModel`` are skipped, so a text with
        no known word yields an all-zero row. Each distinct word counts once,
        however often it occurs in the text.
    """
    # Sparse document-term counts; row i corresponds to the i-th input text
    counts = vectorizer.transform(inpTextData).tocsr()
    n_texts = counts.shape[0]

    # Column index -> word, read from the fitted vectorizer itself so the lookup can
    # never get out of sync with a separately maintained vocabulary variable
    words_by_column = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

    # Pre-allocate all rows at once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x
    embeddings = np.zeros((n_texts, GoogleModel.vector_size))
    for row_idx in range(n_texts):
        row = counts[row_idx]
        # Columns of the words that occur at least once in this text, in column order
        present_columns = np.sort(row.indices[row.data >= 1])
        for col_idx in present_columns:
            word = words_by_column[col_idx]
            if word in GoogleModel.key_to_index:
                embeddings[row_idx] += GoogleModel[word]
    return pd.DataFrame(embeddings)

def evaluate_metrics(yt, yp):
    """Summarise classification quality of predictions ``yp`` against truth ``yt``.

    Returns a dict with the keys 'accuracy', 'recall', 'precision' and 'f1score';
    recall, precision and F-score are the 'weighted' averages over the classes.
    """
    accuracy = accuracy_score(yt, yp)
    precision, recall, f_beta, _support = precision_recall_fscore_support(yt, yp, average='weighted')
    return {
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'f1score': f_beta,
    }

In [None]:
## Model 2

In [None]:
# Model comparison ?

# Compare trending topics and DW covered categories