# Data analysis 

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
#plt.set_loglevel(level = 'warning')
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is the supported lookup. The `.copy()` keeps any later
# modification (e.g. `set_bad`) away from the registered colormap.
cmap = plt.get_cmap("coolwarm").copy()
#cmap.set_bad('lightgrey')  # Set NaN values to grey

In [None]:
# Folder prefixes (relative paths, with trailing slash so they can be concatenated).
# PATH is not referenced by any cell visible in this notebook section.
PATH = "climate/backup/"
# Prefix for the CSVs read below and for the CSV/table/figure files written below.
out_path = "climate/"

In [None]:
# Read the results CSV from the output folder and show its (rows, columns).
results_csv = out_path + "result_data_test.csv"
results = pd.read_csv(results_csv)
results.shape

In [None]:
# Read the html CSV from the output folder and show its (rows, columns).
html_csv = out_path + "html_test.csv"
html = pd.read_csv(html_csv)
html.shape

In [None]:
# Attach the run-level columns from `html` to every result row, then keep round 1 only.
# NOTE(review): this overwrites `results`, so re-running the cell without reloading
# the CSV merges a second time (suffixing the columns) — re-run from the load cell.
html_run_cols = ["run_uid", "round_num", "test_num", "search_history", "user_input", "train_test"]
results = results.merge(html[html_run_cols], on="run_uid", how="left")
results = results[results["round_num"] == 1]

In [None]:
# (rows, columns) of `results` at this point in the notebook.
results.shape

In [None]:
# TODO: set this in the cc/im_dataprep scripts instead of patching it here.
# Every wikipedia.org row is categorised as background information.
is_wikipedia = results["domain"].eq("wikipedia.org")
results.loc[is_wikipedia, "final_category"] = "bg_info"

In [None]:
# Share of each final_category among rows that have a domain (NaN counted as its own value).
rows_with_domain = results[results["domain"].notna()]
rows_with_domain["final_category"].value_counts(dropna=False, normalize=True)

### Source similarities

- For these analyses, we exclude rows that do not contain sources, i.e. people also ask, searches related, images, unknown
- Also exclude all rows with missing domains. 

Source similarities between all possible combinations of SERPs are calculated using calculate_source_sim.py

In [None]:
# Component types of the rows that have no domain.
rows_without_domain = results[results["domain"].isna()]
rows_without_domain["type"].value_counts(dropna=False)

In [None]:
# Independent copy of the rows that have a domain; compare sizes before/after.
has_domain = results["domain"].notna()
results_sim = results[has_domain].copy()
results.shape, results_sim.shape

In [None]:
# Write the domain-only rows to CSV without the index column.
# Presumably the input for calculate_source_sim.py — confirm against that script.
results_sim.to_csv(out_path+"cc_results_r1.csv", index=False)

In [None]:
# Number of unordered pairs of runs (n choose 2) among the rows kept for the
# source-similarity analysis. The closed form n * (n - 1) / 2 gives the same count
# as enumerating every pair, without the O(n^2) iteration.
n_source_runs = len(results_sim['run_uid'].unique())
print('Number of combinations', n_source_runs * (n_source_runs - 1) // 2)

### Content similarities

- General: text; if the text is missing, the title is used instead (incl. submenu items)
- knowledge_panel_rhs: text 
- knowledge_featured_snippet: text 
- ad: text
- twitter_cards: text
- top_stories: title
- local_results: title (=name of location)
- videos: title
- people_also_ask: text
- searches_related: text
- scholarly_articles : title
- Components that do not contain any text are excluded, i.e. images, unknown, as well as rows that do not contain text or title

Content similarities calculated via calculate_content_sim.py

In [None]:
# NOTE: despite the `_cols` suffix these lists hold values of the `type` column
# (component types), not column names.
# Component types dropped from the content-similarity data.
excl_cols = ["images", "unknown"]
# Component types whose comparison text is taken from the `text` column.
text_cols = ["knowledge_panel_rhs", "knowledge_featured_snippet", "ad", "twitter_cards", "people_also_ask", "searches_related", "general"]
# Component types whose comparison text is taken from the `title` column.
title_cols = ["top_stories", "local_results", "videos", "scholarly_articles"]

In [None]:
# Drop the component types that carry no text.
results_cont = results[~results["type"].isin(excl_cols)]

# Pick the comparison text per component type:
#   1. `text` for the text-based types,
#   2. `title` for the title-based types,
#   3. `title` as fallback wherever the value is still missing.
uses_text = results_cont["type"].isin(text_cols)
uses_title = results_cont["type"].isin(title_cols)
text_for_sim = results_cont["text"].where(uses_text)
text_for_sim = text_for_sim.mask(uses_title, results_cont["title"])
text_for_sim = text_for_sim.fillna(results_cont["title"])
results_cont = results_cont.assign(text_for_sim=text_for_sim)

# Rows that still have no comparison text are removed.
results_cont = results_cont[results_cont["text_for_sim"].notna()]
results.shape, results_cont.shape

In [None]:
# Write the rows that have a comparison text to CSV without the index column.
# Presumably the input for calculate_content_sim.py — confirm against that script.
results_cont.to_csv(out_path+"cc_results_cont_r1.csv", index=False)

In [None]:
# Number of unordered pairs of runs (n choose 2) among the rows kept for the
# content-similarity analysis. The closed form n * (n - 1) / 2 gives the same count
# as enumerating every pair, without the O(n^2) iteration.
n_content_runs = len(results_cont['run_uid'].unique())
print('Number of combinations', n_content_runs * (n_content_runs - 1) // 2)

### Descriptive statistics

In [None]:
# Restrict the source data to runs without search history
# (search_history == 'none'); report rows/columns and the number of distinct runs.
no_history = results_sim["search_history"] == "none"
results_nh = results_sim[no_history].copy()
results_nh.shape, results_nh["run_uid"].nunique()

In [None]:
# Make sure all ranks go from 0 to X: runs that contain a right-hand-side
# knowledge panel get every cmpt_rank shifted up by one.
# NOTE(review): the shift is applied in place, so running this cell twice shifts twice.
rhs_panels_ids = results_nh.loc[results_nh["type"] == "knowledge_panel_rhs", "run_uid"].unique()
in_rhs_panel_run = results_nh["run_uid"].isin(rhs_panels_ids)
results_nh.loc[in_rhs_panel_run, "cmpt_rank"] += 1

In [None]:
# Position weight: the reciprocal of the one-based component rank, i.e. 1 / (cmpt_rank + 1).
one_based_rank = results_nh["cmpt_rank"] + 1
results_nh["weight"] = 1 / one_based_rank
results_nh.loc[:, ["run_uid", "cmpt_rank", "weight"]]

In [None]:
# Sum of position weights per final_category, normalised so the shares add up to 1.
weighted_frequencies = results_nh.groupby("final_category")["weight"].sum()
total_weight = weighted_frequencies.sum()
normalized_frequencies = weighted_frequencies.div(total_weight)
normalized_frequencies

In [None]:
# Unweighted share of rows per final_category (NaN counted as its own value).
category_counts = results_nh["final_category"].value_counts(dropna=False)
category_counts / len(results_nh)

### Top information sources

In [None]:
# Unweighted share of each domain within every user_input group.
freq = results_nh.groupby('user_input')['domain'].value_counts(normalize=True).reset_index(name='freq')
# Position-weighted share: summed weights per (user_input, domain), divided by the
# summed weights of the whole user_input group.
w_freq = results_nh.groupby(['user_input', 'domain'])['weight'].sum()
norm_freq = (w_freq / w_freq.groupby('user_input').sum()).reset_index(name='w_freq')
freqs = pd.merge(freq, norm_freq, how='left', on=['user_input', 'domain']).sort_values(['user_input', 'w_freq'], ascending=False)
# Top 10 domains per user_input by weighted share.
# The previous `groupby(...).apply(lambda x: x.nlargest(10, 'w_freq'))` relied on apply
# passing the grouping column into the lambda; pandas 2.2 deprecates that, and once the
# column is excluded the following reset_index(drop=True) would discard `user_input`.
# Sorting and taking the first 10 rows of each group keeps all columns and yields the
# same layout: groups in ascending user_input order, rows by descending w_freq.
top_rows = (
    freqs
    .sort_values(['user_input', 'w_freq'], ascending=[True, False])
    .groupby('user_input')
    .head(10)
    .reset_index(drop=True)
    .round(2)
)
# Attach the category of each domain (one row per distinct domain/category pair).
top_rows = pd.merge(top_rows, results_nh[['domain', 'final_category']].drop_duplicates(), how='left', on='domain')
top_rows.to_latex(out_path+"tables/cc_top10_domains.txt")
top_rows

In [None]:
# Interpretation: the 10 information sources with the highest position-weighted frequency among the search results shown in each user-choice condition.

### Type of information sources

In [None]:
# Unweighted share of each final_category within every user_input group.
by_choice = results_nh.groupby('user_input')
freq = by_choice['final_category'].value_counts(normalize=True).reset_index(name='freq')

# Position-weighted share: summed weights per (user_input, final_category),
# divided by the summed weights of the whole user_input group.
w_freq = results_nh.groupby(['user_input', 'final_category'])['weight'].sum()
choice_totals = w_freq.groupby('user_input').sum()
norm_freq = (w_freq / choice_totals).reset_index(name='w_freq')

# Both measures side by side, sorted descending by user_input and weighted share.
merge_keys = ['user_input', 'final_category']
freqs = freq.merge(norm_freq, how='left', on=merge_keys)
freqs = freqs.sort_values(['user_input', 'w_freq'], ascending=False)
freqs

In [None]:
# Figure labels: maps the final_category codes to the display names that the
# pivot tables (and therefore the figure legend) use.
new_column_names = {
    'inst': 'news',
    'bg_info': 'background information',
    'gateway': 'gateway',
    'not_news': 'other'
}

In [None]:
# Row order of the user_input values applied to the pivot tables via reindex.
order = ["low", "neutral", "high"]

In [None]:
def _category_share_pivot(value_col):
    """Return a user_input x category table of `value_col` with display labels and fixed row order."""
    wide = freqs.pivot(index='user_input', columns='final_category', values=value_col)
    return wide.rename(columns=new_column_names).reindex(order)


pivot = _category_share_pivot('freq')      # unweighted shares
w_pivot = _category_share_pivot('w_freq')  # position-weighted shares
print(pivot)

In [None]:
# Stacked horizontal bar chart of the position-weighted category shares (`w_pivot` only).
fig, ax = plt.subplots(figsize=(10, 4))

plot = w_pivot.plot(kind='barh', stacked=True, ax=ax, width=0.8, position=0.5, legend=False)

# Legend centred below the axes, one row with four entries.
ax.legend(title='Information source types', bbox_to_anchor=(0.5, -0.25), loc='upper center', ncol=4)

ax.set_xlim(right=1.0)
ax.set_ylim(bottom=-0.5)

# Label every bar segment exactly once.
# The previous nested loop called `ax.bar_label` once per positive value, which drew
# the same set of labels on top of itself several times per container. `fmt` is
# dropped because Matplotlib ignores it whenever explicit `labels` are supplied.
for container in plot.containers:
    segment_labels = [f'{value.round(2)}' if value.round(2) > 0 else '' for value in container.datavalues]
    # Containers without any visible value stay unlabelled, as before.
    if any(segment_labels):
        ax.bar_label(container, labels=segment_labels, label_type='center', fontsize=12, color='black')

ax.tick_params(axis='y',labelsize=12)
ax.tick_params(axis='x',labelsize=12)
ax.set_ylabel('User choice', fontsize=12)
ax.set_xlabel('Frequency', fontsize=12)

fig.tight_layout(pad=2.0)
plt.savefig(out_path + 'figures/cc_typesource_barh.eps', format='eps', bbox_inches='tight')
plt.show()


### Frequency of component types

In [None]:
# One row per component (first row of every run_uid / cmpt_rank pair),
# then only the runs without search history.
results_cmpts = (
    results
    .drop_duplicates(["run_uid", "cmpt_rank"], keep="first")
    .loc[lambda frame: frame["search_history"] == "none"]
    .copy()
)
results_cmpts.shape

In [None]:
# Make sure all ranks go from 0 to X: runs that contain a right-hand-side
# knowledge panel get every cmpt_rank shifted up by one.
# NOTE(review): the shift is applied in place, so running this cell twice shifts twice.
rhs_panels_ids = results_cmpts.loc[results_cmpts["type"] == "knowledge_panel_rhs", "run_uid"].unique()
in_rhs_panel_run = results_cmpts["run_uid"].isin(rhs_panels_ids)
results_cmpts.loc[in_rhs_panel_run, "cmpt_rank"] += 1

In [None]:
# Position weight: the reciprocal of the one-based component rank, i.e. 1 / (cmpt_rank + 1).
one_based_rank = results_cmpts["cmpt_rank"] + 1
results_cmpts["weight"] = 1 / one_based_rank
results_cmpts.loc[:, ["run_uid", "cmpt_rank", "weight"]]

In [None]:
# Presence (0/1) of every component type per run.
# `dtype=int` makes only the dummy columns integer; the previous frame-wide
# `.astype(int)` also tried to cast `run_uid` and fails for non-numeric ids.
dummy_df = pd.get_dummies(results_cmpts[['type', 'run_uid']], columns=['type'], dtype=int)
# Number of components of each type per run.
grouped = dummy_df.groupby('run_uid').sum()
# Collapse counts to presence flags; the vectorised comparison replaces
# `DataFrame.applymap`, which is deprecated since pandas 2.1.
components = grouped.ge(1).astype(int).reset_index()
components.shape

In [None]:
# Add the user_input of every run (taken from the first row of each run_uid).
# NOTE(review): overwrites `components`; re-running this cell alone merges again.
run_choice = results_cmpts.drop_duplicates('run_uid')[['run_uid', 'user_input']]
components = components.merge(run_choice, on='run_uid', how='left')
components.shape

In [None]:
# Names of the component-presence columns (everything starting with 'type').
features = list(filter(lambda col: col.startswith('type'), components.columns))

In [None]:
# Mean of the 0/1 presence flags per user_input, i.e. the share of runs showing each component type.
components_by_choice = components.groupby('user_input')
freqs = components_by_choice[features].mean()
freqs

In [None]:
# TODO (original note): add scholarly articles for the immigration analysis.
# 'type_scholarly_articles' has a display name in column_rename_dict but is not
# listed in custom_order, so a reindex on custom_order leaves that column out.

# Column order for the component-type table (names as produced by the 'type' dummies).
custom_order = ['type_knowledge_panel_rhs', 'type_local_results', 'type_top_stories', 'type_twitter_cards', 'type_videos', 'type_knowledge_featured_snippet', 'type_searches_related', 'type_images', 'type_ad', 'type_people_also_ask', 'type_general']
# Display names for the component-type columns.
column_rename_dict = {'type_knowledge_panel_rhs':'Knowledge Panel', 'type_local_results':'Local results', 'type_top_stories':'Top stories', 
                'type_twitter_cards':'Twitter', 'type_videos':'Videos', 'type_knowledge_featured_snippet':'Featured snippet',
                'type_scholarly_articles':'Scholarly articles', 'type_searches_related':'Related searches', 
                'type_images':'Images', 'type_ad':'Advertisements', 'type_people_also_ask':'People also ask', 'type_general':'General'}

In [None]:
# Fix the row order (high, neutral, low) and column order, then switch to display names.
freqs = (
    freqs
    .reindex(index=["high", "neutral", "low"], columns=custom_order)
    .rename(columns=column_rename_dict)
)
freqs

In [None]:
# Export the table as LaTeX: transposed (component types as rows), rounded to 2 decimals.
freqs.T.round(2).to_latex(out_path+"tables/cc_freq_type_userinput.txt")

In [None]:
# Keep only the component types that go into the heatmap.
select_features = [
    "Knowledge Panel",
    "Local results",
    "Top stories",
    "Twitter",
    "Videos",
    "Featured snippet",
]
freqs = freqs.loc[:, select_features]

In [None]:
# Heatmap of component-type frequencies per user choice, saved as EPS.
plt.figure(figsize=(6, 3))

heatmap_options = dict(
    cmap=cmap,
    cbar=True,
    annot=True,
    fmt='.2f',
    annot_kws={"size":12},
    cbar_kws={'pad': 0.02},
    vmax=1,
)
sns.heatmap(freqs, **heatmap_options)

# Axis cosmetics (same pyplot calls, in the same order, acting on the current axes).
plt.yticks(va="center")
plt.ylabel('User choice', fontsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.xticks(rotation=45, ha="right", fontsize=12)

heatmap_file = out_path+'figures/cc_type_heatmap.eps'
plt.savefig(heatmap_file, format='eps', bbox_inches='tight')
plt.show()
