# Browsing sequences approach

Note: The browsing history data cannot be shared publicly due to privacy reasons.

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stopwords = stopwords.words('dutch')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('dutch')

In [None]:
df = pd.read_csv("browser2022.csv")
df.shape

In [None]:
df.submission_id.nunique()

In [None]:
df.final_category.value_counts()

#### Add annotations

In [None]:
# file with manual labels.
labels = pd.read_csv("data/annotations.csv")
labels.shape

In [None]:
labels.Q1_checked.value_counts(dropna=False)

In [None]:
def preprocess(x):
    """Normalise a query string for exact matching.

    Lower-cases ``x``, deletes underscores and every character that is
    neither a word character nor whitespace, and strips leading/trailing
    whitespace. Whitespace inside the string is left untouched.
    """
    lowered = x.lower()
    without_punct = re.sub(r'[^\w\s]|_', '', lowered)
    return without_punct.strip()

In [None]:
# Normalised query strings act as the exact-match join key.
labels['q_match'] = labels['q'].map(preprocess)
# Only rows that actually carry a query are normalised; the remaining rows
# get NaN in q_match through index alignment on assignment.
has_query = df['q'].notna()
df['q_match'] = df.loc[has_query, 'q'].map(preprocess)

In [None]:
# Flag every annotated query whose normalised form also occurs in the
# browsing data, then show how many do / do not.
browsing_queries = df['q_match']
labels['exists'] = labels['q_match'].isin(browsing_queries)
labels['exists'].value_counts(dropna=False)

In [None]:
## Look up the batch number of each annotated query in the original
## annotation file.
org = pd.read_csv("annotations_18012023.csv")
print(org.shape)
# Build the normalised join key and parse the recording date, then order
# from oldest to newest so that de-duplication keeps the first coded
# occurrence of every query.
org = (
    org.assign(
        q_match=org['q'].apply(preprocess),
        RecordedDate=pd.to_datetime(org['RecordedDate']),
    )
    .sort_values(['RecordedDate'], ascending=True)
    .drop_duplicates('q_match', keep='first')
)
print(org.q.nunique(), org.shape)
## Add the batch nr to the quality-checked annotations.
batch_lookup = org[['q_match', 'batch']]
labels = labels.merge(batch_lookup, how='left', on='q_match')

In [None]:
labels.shape

In [None]:
labels.batch.value_counts()

In [None]:
labels.groupby('batch')['Q1_checked'].value_counts(normalize=True)

In [None]:
labels.Q1_checked.value_counts()

In [None]:
# Attach the checked label and the batch to every visit through the
# normalised query.
# NOTE(review): a left merge repeats visit rows when `labels` holds the same
# q_match more than once -- compare the shape shown here with the shape
# before the merge.
annotation_cols = labels[["q_match", "Q1_checked", "batch"]]
df = df.merge(annotation_cols, on='q_match', how='left')
df.shape

In [None]:
df.head()

In [None]:
## add predicted political news headlines
# Note that we cannot share this dataset, but the classifier used here and data are publicly available (see paper).
pol_news = pd.read_csv("predicted_headlines12102022.csv")
pol_news.shape

In [None]:
pol_news.head()

In [None]:
# Left-join the headline-level prediction onto the visits by exact title.
# NOTE(review): repeated titles in `pol_news` would repeat visit rows --
# compare the shape shown here with the shape before the merge.
headline_preds = pol_news[['title', 'prediction']]
df = df.merge(headline_preds, how='left', on='title')
df.shape

In [None]:
# Keep the political label only for news visits (final_category 'inst' or
# 'non_inst') with prediction == 1; every other row, including rows without
# a prediction, becomes 0.
is_news_visit = df['final_category'].isin(['inst', 'non_inst'])
is_political = df['prediction'].eq(1)
df['prediction'] = (is_news_visit & is_political).astype(int)

In [None]:
df.groupby('final_category')['prediction'].value_counts(dropna=False)

In [None]:
## able to label 2286 non_inst news visits, 76926 inst news visits as political. 

#### Search to news paths

In [None]:
# make datetime object: `time` is parsed as microseconds since the epoch in
# UTC and converted to Amsterdam local time. The vectorised `.dt` accessor
# converts the whole column at once instead of calling tz_convert per value.
df['date_dt'] = pd.to_datetime(df.time, unit='us', utc=True).dt.tz_convert('Europe/Amsterdam')
# sort records again to be sure (from old to new within each submission);
# the shift/diff steps that follow rely on this order.
df = df.sort_values(['submission_id', 'date_dt'], ascending=True)

In [None]:
# Time elapsed since the previous row of the same submission, in seconds and
# in minutes. The first row of every submission has no predecessor and gets NaN.
timestamps_by_submission = df.groupby(['submission_id'])["date_dt"]
df['diff_secs'] = timestamps_by_submission.diff().dt.total_seconds()
df['diff_mins'] = df['diff_secs'] / 60

In [None]:
# A missing text_search value means that no search was performed: store 0.
no_search_recorded = df['text_search'].isna()
df.loc[no_search_recorded, 'text_search'] = 0

In [None]:
# transition types in general
df.transition.value_counts(dropna=False)

In [None]:
# transition type for searches
df[df.text_search==1].transition.value_counts(dropna=False)

# Make variables

In [None]:
# For every row, copy selected values of the following row within the same
# submission into "<column>_next"; the last row of a submission gets NaN.
shifted_columns = ['domain', 'final_category', 'prediction', 'transition',
                   'q', 'diff_secs', 'text_search']
for source_col in shifted_columns:
    df[source_col + '_next'] = df.groupby('submission_id')[source_col].shift(-1)

In [None]:
df[['domain', 'domain_next', 'final_category', 'final_category_next', 'prediction', 'prediction_next', 'diff_secs', 'diff_secs_next', 'text_search', 'text_search_next']].head(10)

__search-to-search__ --> Next website
1. is a search
2. accessed within 30 seconds
3. is not a reload

__no-visit30__ --> Next website
1. is not another search and is not transition type link, OR
2. is *not* accessed within 30 seconds of the search


In [None]:
# Category groups used when checking the row after a search:
# news only, and news plus background information.
news = ["inst", "non_inst"]
news_bg = news + ["bg_info"]

In [None]:
# search-to-search: a search whose next row is also a search, reached within
# 30 seconds and not through a RELOAD transition.
is_search = df['text_search'].eq(1)
next_is_search = df['text_search_next'].eq(1)
next_within_30s = df['diff_secs_next'].le(30)
next_not_reload = df['transition_next'].ne('RELOAD')
s2s = is_search & next_is_search & next_within_30s & next_not_reload
df['search_to_search'] = s2s.astype(int)
df.loc[is_search, 'search_to_search'].value_counts(dropna=False)

In [None]:
# this makes sense.
df[df['search_to_search']==1].domain_next.value_counts(dropna=False)

In [None]:
# no_visit30, rule 1: the row after a search is not reached through a LINK
# transition and is not itself a search.
from_search = df['text_search'].eq(1)
next_not_link = df['transition_next'].ne('LINK')
next_not_search = df['text_search_next'].eq(0)
nv30_1 = from_search & next_not_link & next_not_search
df['no_visit30'] = nv30_1.astype(int)
df.loc[from_search, 'no_visit30'].value_counts(dropna=False)

In [None]:
# no_visit30, rule 2: a search not flagged so far whose next row follows
# more than 30 seconds later is also labelled as no visit; all other rows
# keep the value they already have.
nv30_2 = df['text_search'].eq(1) & df['diff_secs_next'].gt(30) & df['no_visit30'].eq(0)
df.loc[nv30_2, 'no_visit30'] = 1
df.loc[df['text_search'].eq(1), 'no_visit30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['no_visit30'].value_counts(dropna=False, normalize=True)

In [None]:
# examples of domains that are no visits.
# google is possible, because it can still be accessed outside of 30 seconds
df[(df.text_search==1)&(df['no_visit30']==1)].domain_next.value_counts(dropna=False)

In [None]:
### Searches followed, through a LINK transition within 30 seconds, by a
### non-search row whose category is not in `news_bg` (same operationalisation
### as the paths, with the extra requirement that the next row is not a search).
next_is_other = ~df['final_category_next'].isin(news_bg)
mask = (
    df['text_search'].eq(1)
    & next_is_other
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
    & df['text_search_next'].eq(0)
)
df["s_to_other_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_other_30'].value_counts(dropna=False)

In [None]:
df[df["s_to_other_30"]==1].domain_next.value_counts(dropna=False)

In [None]:
df.groupby('no_visit30').search_to_search.value_counts(dropna=False)

In [None]:
# 26% of all searches lead to another search (and are not simply reloads)
print(df[df.text_search==1].search_to_search.value_counts(dropna=False))
print(df[df.text_search==1].search_to_search.value_counts(normalize=True, dropna=False))

In [None]:
# 24% of searches do not lead to another search nor a visit
print(df[df.text_search==1].no_visit30.value_counts(dropna=False))
print(df[df.text_search==1].no_visit30.value_counts(normalize=True, dropna=False))

In [None]:
# 47% of searches (n = 3660) lead to a website other than news or bg info within 30 seconds.
print(df[df.text_search==1].s_to_other_30.value_counts(dropna=False))
print(df[df.text_search==1].s_to_other_30.value_counts(normalize=True, dropna=False))

In [None]:
df.groupby('no_visit30').s_to_other_30.value_counts(dropna=False)

# Paths 
Search-to-news occurs when a visit to a news-related website is a direct result of the search. In practice this is based on three variables. The row immediately following a text search is...
1. of the transition type LINK (transition type)
2. inst, non-inst, and/or background info (domain category)
3. accessed within 30 seconds or 5 minutes (difference between visit timestamps)

In [None]:
# 30-second window: search followed by an 'inst' row reached through a LINK
# transition within 30 seconds.
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].eq("inst")
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
)
df["s_to_inst_news_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_inst_news_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_inst_news_30'].value_counts(dropna=False, normalize=True)

In [None]:
# Same as s_to_inst_news_30, additionally requiring that the next row is
# predicted political (prediction_next == 1).
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].eq("inst")
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
    & df['prediction_next'].eq(1)
)
df["s_to_pol_inst_news_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_pol_inst_news_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_pol_inst_news_30'].value_counts(dropna=False, normalize=True)

In [None]:
# Search followed by a row whose category is in `news`, reached through a
# LINK transition within 30 seconds.
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].isin(news)
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
)
df["s_to_news_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_news_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_news_30'].value_counts(dropna=False,normalize=True)

In [None]:
# Same as s_to_news_30, additionally requiring that the next row is
# predicted political (prediction_next == 1).
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].isin(news)
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
    & df['prediction_next'].eq(1)
)
df["s_to_pol_news_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_pol_news_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_pol_news_30'].value_counts(dropna=False, normalize=True)

In [None]:
# Search followed by a row whose category is in `news_bg`, reached through a
# LINK transition within 30 seconds.
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].isin(news_bg)
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
)
df["s_to_news_bg_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_news_bg_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_news_bg_30'].value_counts(dropna=False, normalize=True)

In [None]:
# Search followed, through a LINK transition within 30 seconds, by a row
# that is either 'bg_info' or predicted political (prediction_next == 1).
next_is_bg_or_political = df['final_category_next'].eq('bg_info') | df['prediction_next'].eq(1)
mask = (
    df['text_search'].eq(1)
    & next_is_bg_or_political
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(30)
)
df["s_to_pol_news_bg_30"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_pol_news_bg_30'].value_counts(dropna=False)

In [None]:
df[df.text_search==1]['s_to_pol_news_bg_30'].value_counts(dropna=False, normalize=True)

In [None]:
# 5 minutes
mask = (df['text_search']==1)&(df['final_category_next']=="inst")&(df['transition_next']=='LINK')&(df['diff_secs_next']<=300)
df["s_to_inst_news_5"] = np.where(mask, 1, 0)
df[df.text_search==1]['s_to_inst_news_5'].value_counts(dropna=False)

In [None]:
# Search followed by a row whose category is in `news`, reached through a
# LINK transition within 300 seconds.
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].isin(news)
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(300)
)
df["s_to_news_5"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_news_5'].value_counts(dropna=False)

In [None]:
# Search followed by a row whose category is in `news_bg`, reached through a
# LINK transition within 300 seconds.
mask = (
    df['text_search'].eq(1)
    & df['final_category_next'].isin(news_bg)
    & df['transition_next'].eq('LINK')
    & df['diff_secs_next'].le(300)
)
df["s_to_news_bg_5"] = mask.astype(int)
df.loc[df['text_search'].eq(1), 's_to_news_bg_5'].value_counts(dropna=False)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

In [None]:
def make_metrics(lst, df):
    """Collect the positive-class metrics for several prediction columns.

    For every column name in ``lst`` a ``classification_report`` is computed
    with ``df.Q1_checked`` as ground truth and ``df[column]`` as prediction,
    and the report entry of the positive class (label 1) is kept.

    Parameters
    ----------
    lst : list of str
        Names of the prediction columns in ``df`` to evaluate.
    df : pandas.DataFrame
        Frame holding ``Q1_checked`` and every column named in ``lst``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``lst``, indexed by ``model``, with the metrics
        the report provides for the positive class as columns.

    Raises
    ------
    KeyError
        If a report has no entry for the positive class.
    """
    rows = []
    for col in lst:
        report = classification_report(y_true=df.Q1_checked, y_pred=df[col], output_dict=True)
        # The report is keyed by the string form of each label, so the
        # positive class is '1.0' when the labels are floats and '1' when
        # they are integers; accept either instead of hard-coding one.
        positive_key = next((key for key in ('1.0', '1') if key in report), None)
        if positive_key is None:
            raise KeyError("no positive class (label 1) in the classification report for " + repr(col))
        rows.append(dict(report[positive_key], model=col))
    return pd.DataFrame(rows).set_index('model')

### test set

In [None]:
# test dataset: load the saved X_test / y_test arrays and turn them into
# plain Python lists.
# NOTE(review): allow_pickle=True lets np.load unpickle the file contents,
# which can execute arbitrary code -- only load .npy files from a trusted
# source.
X_test = np.load("train_test_split/X_test.npy", allow_pickle=True).tolist()
y_test = np.load("train_test_split/y_test.npy", allow_pickle=True).tolist()
print(len(y_test))

In [None]:
# Normalise the test queries with the same function used for q_match so the
# two can be joined exactly.
X_test = list(map(preprocess, X_test))
df2 = pd.DataFrame({"test_set": X_test})

In [None]:
# Left-join the browsing rows onto the test queries; a test query that
# matches several browsing rows yields one output row per match.
merged = df2.merge(df, left_on='test_set', right_on='q_match', how='left')
merged.shape

In [None]:
# matching worked well
merged.test_set.nunique()

In [None]:
merged.batch.value_counts(dropna=False)

In [None]:
# Export the 30-second news/background sequence flag per test query, together
# with the domain and category of the row that followed the search.
export_columns = ['s_to_news_bg_30', 'test_set', 'domain_next', 'final_category_next']
preds_full = (
    merged[export_columns]
    .rename(columns={"s_to_news_bg_30": "sequences", "test_set": "X_test"})
    .copy()
)
preds_full.to_csv('sequences_preds_full.csv', index=False)
print(preds_full.shape)
preds_full

In [None]:
# Restrict the evaluation to batches b1 and b2, because batch 3 (and 4)
# oversampled these types of queries.
in_first_two_batches = merged['batch'].isin(["b1", "b2"])
b1_2 = merged.loc[in_first_two_batches]
b1_2['batch'].value_counts(dropna=False)

In [None]:
# Keep only the rows that are text searches.
is_text_search = b1_2['text_search'].eq(1)
b1_2 = b1_2.loc[is_text_search]
b1_2.shape

In [None]:
b1_2.Q1_checked.value_counts()

In [None]:
# Evaluate every search-to-news flag (30-second and 5-minute variants)
# against the checked labels on the b1/b2 text searches, write the rounded
# table out in LaTeX form, and display the unrounded table.
lst = ['s_to_inst_news_30', 's_to_pol_inst_news_30', 's_to_news_30', 's_to_pol_news_30', 's_to_news_bg_30',
       "s_to_pol_news_bg_30", 's_to_inst_news_5', 's_to_news_5', 's_to_news_bg_5']
report = make_metrics(lst, b1_2)
report.round(2).to_latex("search_to_news_metrics_13062023.txt")
report