# (Dis)similarity analysis
Analysis of the output of the pairwise similarity scores, IRBO and ICS.

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
#plt.set_loglevel(level = 'warning')
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` looks up the same registered colormap on old and new releases.
# Work on a copy so any later tweak (such as the set_bad call below) does not
# modify the registered "coolwarm" colormap.
cmap = plt.get_cmap("coolwarm").copy()
#cmap.set_bad('lightgrey')  # Set NaN values to grey

In [None]:
# Load the html_test tables for both issues. Later cells read their `run_uid`
# and `test_num` columns to attach a test number to each SERP in a pair.
im_html, cc_html = (
    pd.read_csv(csv_path)
    for csv_path in ("immigration/html_test.csv", "climate/html_test.csv")
)
print(im_html.shape, cc_html.shape)

## Source similarity
Source similarities between all possible combinations of SERPs are calculated using calculate_source_sim.py

In [None]:
# Pairwise source-similarity scores for the immigration SERPs
# (per the note above, produced by calculate_source_sim.py).
im_source_sim = pd.read_csv("immigration/im_source_sim.csv")
im_source_sim.shape  # displayed by the notebook as (rows, columns)

In [None]:
# Rename the 'unint' search-history condition to the label used on the figures.
# The dict restricts the replacement to the two search-history columns.
im_source_sim = im_source_sim.replace(
    {'search_history1': 'unint', 'search_history2': 'unint'}, 'unrel.'
)

In [None]:
im_source_sim.head()

In [None]:
# Pairwise source-similarity scores for the climate-change SERPs
# (per the note above, produced by calculate_source_sim.py).
cc_source_sim = pd.read_csv("climate/cc_source_sim.csv")
cc_source_sim.shape  # displayed by the notebook as (rows, columns)

In [None]:
# Rename the 'unint' search-history condition to the label used on the figures.
# The dict restricts the replacement to the two search-history columns.
cc_source_sim = cc_source_sim.replace(
    {'search_history1': 'unint', 'search_history2': 'unint'}, 'unrel.'
)

In [None]:
cc_source_sim.head()

In [None]:
im_source_sim.run_uid1.nunique(), cc_source_sim.run_uid1.nunique()

In [None]:
# Inverted scores: turn each RBO similarity column into a dissimilarity
# (IRBO = 1 - RBO) for both issues and both p settings.
for source_table in (im_source_sim, cc_source_sim):
    for p_label in ('0.95', '0.8'):
        source_table[f'irbo{p_label}_value'] = 1 - source_table[f'rbo{p_label}_value']

### Source -  search history 

In [None]:
# Figure: mean source dissimilarity (IRBO) between search-history conditions.
# Rows are the user-input levels; columns 0-1 show immigration and columns 2-3
# climate change, with one panel per IRBO column in `values`.
fig, axes = plt.subplots(3, 4, figsize=(15, 15))

# Shared plot settings -- later cells read these notebook-level names.
issues = ['Immigration', 'Climate change']
im_user_inputs = ['anti', 'neutral', 'pro']
cc_user_inputs = ['low', 'neutral', 'high']
values = ['irbo0.95_value', 'irbo0.8_value']
value_names = ['IRBO (p = 0.95)', 'IRBO (p = 0.8)']

# One axes below the grid that every heatmap draws its colour bar into.
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])

# Per issue: (pairwise table, row levels, display order of the search histories).
# Looking the issue up in a table makes an unknown label fail with a KeyError
# instead of silently reusing the previous issue's data.
panel_specs = {
    'Immigration': (im_source_sim, im_user_inputs,
                    ['anti', 'neutral', 'pro', 'mixed', 'unrel.', 'none']),
    'Climate change': (cc_source_sim, cc_user_inputs,
                       ['low', 'neutral', 'high', 'mixed', 'unrel.', 'none']),
}

for issue_index, issue in zip([0, 2], issues):
    df, user_input_values, sh_order = panel_specs[issue]

    # rows
    for i, user_input in enumerate(user_input_values):
        # Keep only pairs in which both SERPs share this user-input level.
        same_input = (df['user_input1'] == user_input) & (df['user_input2'] == user_input)
        df_filtered = df[same_input]

        # columns
        for j, value in enumerate(values):
            ax = axes[i, issue_index + j]

            source_history = (
                df_filtered
                .groupby(['search_history1', 'search_history2'])[value]
                .mean()
                .unstack()
                .reindex(index=sh_order, columns=sh_order)
            )
            # Mask the upper triangle, excluding the diagonal.
            mask = np.triu(np.ones_like(source_history, dtype=bool), k=1)
            sns.heatmap(source_history, cmap=cmap, cbar=True, annot=True, fmt='.2f', ax=ax,
                        vmin=0, vmax=1, cbar_ax=cbar_ax, mask=mask)
            ax.set_ylabel('')
            ax.set_xlabel('')

            if i == 0:
                ax.set_title(value_names[j])  # column titles on the top row only

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

# Row titles: the first column of each issue's pair of columns names the level.
for row_axes, im_label, cc_label in zip(axes, im_user_inputs, cc_user_inputs):
    row_axes[0].set_ylabel(im_label, fontsize=14)
    row_axes[2].set_ylabel(cc_label, fontsize=14)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=14, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=14, ha='center')

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/sourcesim_hist_matrix.eps', format='eps', bbox_inches='tight')
plt.show()

## Content similarity

In [None]:
# Load the pairwise content-similarity tables for both issues.
im_content_sim, cc_content_sim = (
    pd.read_csv(csv_path)
    for csv_path in ("immigration/im_content_sim.csv", "climate/cc_content_sim.csv")
)
print(im_content_sim.shape, cc_content_sim.shape)

In [None]:
# Rename the 'unint' search-history condition to the label used on the figures.
# The dict restricts the replacement to the two search-history columns.
im_content_sim = im_content_sim.replace(
    {'search_history1': 'unint', 'search_history2': 'unint'}, 'unrel.'
)

In [None]:
# Rename the 'unint' search-history condition to the label used on the figures.
# The dict restricts the replacement to the two search-history columns.
cc_content_sim = cc_content_sim.replace(
    {'search_history1': 'unint', 'search_history2': 'unint'}, 'unrel.'
)

In [None]:
im_content_sim.head()

In [None]:
cc_content_sim.head()

In [None]:
# Inverted scores: express content similarity as a dissimilarity
# (icos_value = 1 - cos_value) for both issues.
for content_table in (im_content_sim, cc_content_sim):
    content_table['icos_value'] = 1 - content_table['cos_value']

### Content -  search history

In [None]:
# Figure: mean content dissimilarity (icos_value) between search-history
# conditions; one column per issue, one row per user-input level.
fig, axes = plt.subplots(3, 2, figsize=(10, 15))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes under the grid

for col, issue in enumerate(issues):
    if issue == 'Immigration':
        pairs, row_levels = im_content_sim, im_user_inputs
        history_order = ['anti', 'neutral', 'pro', 'mixed', 'unrel.', 'none']
    elif issue == 'Climate change':
        pairs, row_levels = cc_content_sim, cc_user_inputs
        history_order = ['low', 'neutral', 'high', 'mixed', 'unrel.', 'none']

    for row, level in enumerate(row_levels):
        ax = axes[row, col]

        # Pairs in which both SERPs come from the same user-input level.
        both_at_level = pairs['user_input1'].eq(level) & pairs['user_input2'].eq(level)
        grid = (
            pairs[both_at_level]
            .groupby(['search_history1', 'search_history2'])['icos_value']
            .mean()
            .unstack()
            .reindex(index=history_order, columns=history_order)
        )
        # Hide everything above the diagonal; the diagonal itself stays visible.
        upper = np.triu(np.ones_like(grid, dtype=bool), k=1)
        sns.heatmap(grid, mask=upper, cmap=cmap, vmin=0, vmax=1,
                    annot=True, fmt='.2f', cbar=True, cbar_ax=cbar_ax, ax=ax)

        ax.set_xlabel('')
        ax.set_ylabel(level, fontsize=14)  # row label = user-input level
        if row == 0:
            ax.set_title(issue, fontsize=14)  # column titles on the top row only

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)
plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/contentsim_hist_matrix.eps', format='eps', bbox_inches='tight')
plt.show()

### Source -  user input

In [None]:
# Restrict to comparisons where both agents have no browser history
# (clean slates), i.e. both search-history columns equal 'none'.
im_source_sim_none = im_source_sim.loc[
    im_source_sim['search_history1'].eq('none') & im_source_sim['search_history2'].eq('none')
]
cc_source_sim_none = cc_source_sim.loc[
    cc_source_sim['search_history1'].eq('none') & cc_source_sim['search_history2'].eq('none')
]
print(im_source_sim_none['run_uid1'].nunique(), cc_source_sim_none['run_uid1'].nunique())

In [None]:
# Figure: mean source dissimilarity (IRBO) between user-input levels, using
# only the no-search-history pairs. Columns 0-1 show immigration, 2-3 climate.
fig, axes = plt.subplots(1, 4, figsize=(15, 5))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes below the panels

# Pinned locally: the notebook-level `values` / `value_names` are rebound to
# other columns by later cells, which would break a re-run of this cell.
irbo_columns = ['irbo0.95_value', 'irbo0.8_value']
irbo_titles = ['IRBO (p = 0.95)', 'IRBO (p = 0.8)']

# Per issue: (no-history pairwise table, display order of the user-input levels).
input_specs = {
    'Immigration': (im_source_sim_none, ["anti", "neutral", "pro"]),
    'Climate change': (cc_source_sim_none, ["low", "neutral", "high"]),
}

for issue_index, issue in zip([0, 2], issues):
    df, order = input_specs[issue]

    # columns
    for j, value in enumerate(irbo_columns):
        ax = axes[issue_index + j]

        source_input = (
            df.groupby(['user_input1', 'user_input2'])[value]
            .mean()
            .unstack()
            .reindex(index=order, columns=order)
        )
        # Mask the upper triangle, excluding the diagonal.
        mask = np.triu(np.ones_like(source_input, dtype=bool), k=1)
        sns.heatmap(source_input, cmap=cmap, cbar=True, annot=True, fmt='.2f', ax=ax,
                    vmin=0, vmax=1, cbar_ax=cbar_ax, mask=mask)
        ax.set_ylabel('')
        ax.set_xlabel('')
        ax.set_title(irbo_titles[j])

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=14, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=14, ha='center')

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/sourcesim_input_matrix.eps', format='eps', bbox_inches='tight')
plt.show()


### Content -  user input

In [None]:
# Restrict to comparisons where both agents have no browser history
# (clean slates), i.e. both search-history columns equal 'none'.
im_content_sim_none = im_content_sim.loc[
    im_content_sim['search_history1'].eq('none') & im_content_sim['search_history2'].eq('none')
]
cc_content_sim_none = cc_content_sim.loc[
    cc_content_sim['search_history1'].eq('none') & cc_content_sim['search_history2'].eq('none')
]
print(im_content_sim_none['run_uid1'].nunique(), cc_content_sim_none['run_uid1'].nunique())

In [None]:
# Figure: mean content dissimilarity (icos_value) between user-input levels,
# using only the no-search-history pairs; one panel per issue.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes below the panels

for col, issue in enumerate(issues):
    if issue == 'Immigration':
        pairs, level_order = im_content_sim_none, ["anti", "neutral", "pro"]
    elif issue == 'Climate change':
        pairs, level_order = cc_content_sim_none, ["low", "neutral", "high"]

    ax = axes[col]
    grid = (
        pairs.groupby(['user_input1', 'user_input2'])['icos_value']
        .mean()
        .unstack()
        .reindex(index=level_order, columns=level_order)
    )
    # Hide everything above the diagonal; the diagonal itself stays visible.
    upper = np.triu(np.ones_like(grid, dtype=bool), k=1)
    sns.heatmap(grid, mask=upper, cmap=cmap, vmin=0, vmax=1,
                annot=True, fmt='.2f', cbar=True, cbar_ax=cbar_ax, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(issue, fontsize=14)

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)
plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/contentsim_input_matrix.eps', format='eps', bbox_inches='tight')
plt.show()

## Content + Source - User input (+ SDs)

Values are the mean scores for each user_input pairing (across algorithmic curation conditions). Values in brackets are the standard deviations of the means of the algorithmic curation conditions. The SDs round to zero, indicating barely any variation in mean dissimilarity scores; therefore, none of the variation can be attributed to algorithmic curation.

In [None]:
# Figure: mean dissimilarity between user-input levels with, in brackets, the
# standard deviation of the per-search-history-pair means. For each issue the
# first panel shows IRBO (p = 0.8) and the second ICS.
fig, axes = plt.subplots(1, 4, figsize=(15, 5))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes below the panels
value_names = ['IRBO (p = 0.8)', 'ICS']

# Per issue: the (score column, pairwise table) behind each of its two panels
# and the display order of the user-input levels.
std_specs = {
    'Immigration': (
        [('irbo0.8_value', im_source_sim), ('icos_value', im_content_sim)],
        ["anti", "neutral", "pro"],
    ),
    'Climate change': (
        [('irbo0.8_value', cc_source_sim), ('icos_value', cc_content_sim)],
        ["low", "neutral", "high"],
    ),
}

for issue_index, issue in zip([0, 2], issues):
    panels, order = std_specs[issue]

    # columns
    for j, (value, df) in enumerate(panels):
        ax = axes[issue_index + j]

        # Cell value: mean over all pairs with this user-input combination.
        source_input = (
            df.groupby(['user_input1', 'user_input2'])[value]
            .mean()
            .unstack()
            .reindex(index=order, columns=order)
        )
        # Bracketed value: first average within every search-history pairing,
        # then take the SD of those means per user-input combination.
        condition_means = (
            df.groupby(['user_input1', 'user_input2', 'search_history1', 'search_history2'])[value]
            .mean()
            .reset_index(name='mean')
        )
        std_input = (
            condition_means.groupby(['user_input1', 'user_input2'])['mean']
            .std()
            .unstack()
            .reindex(index=order, columns=order)
        )

        # Mask the upper triangle, excluding the diagonal.
        mask = np.triu(np.ones_like(source_input, dtype=bool), k=1)
        sns.heatmap(source_input, cmap=cmap, cbar=True, ax=ax,
                    vmin=0, vmax=1, cbar_ax=cbar_ax, mask=mask)

        # Manual annotation so each visible cell can show "mean\n(SD)".
        for i, index in enumerate(source_input.index):
            for j2, column in enumerate(source_input.columns):
                if mask[i, j2]:
                    continue  # do not annotate masked cells
                val = source_input.loc[index, column]
                std_val = std_input.loc[index, column]
                ax.text(j2 + 0.5, i + 0.5, f"{val:.2f}\n({std_val:.2f})",
                        ha='center', va='center', fontsize=14)

        ax.set_ylabel('')
        ax.set_xlabel('')
        ax.tick_params(axis='y', labelsize=14)
        ax.tick_params(axis='x', labelsize=14)
        ax.set_title(value_names[j], fontsize=15)

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=15, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=15, ha='center')

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/sourcesim_input_matrix_std.eps', format='eps', bbox_inches='tight')
plt.show()


In [None]:
## Appendix figure: IRBO (p = 0.95) scores
# One panel per issue. Each visible cell shows the mean score for a user-input
# combination and, in brackets, the SD of the per-search-history-pair means.
fig, ax = plt.subplots(1, 2, figsize=(8, 5))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # colour-bar axes below the panels

value = 'irbo0.95_value'  # the single score column shown in this figure
input_keys = ['user_input1', 'user_input2']

for issue_index, issue in zip(range(2), issues):
    if issue == 'Immigration':
        pairs, order = im_source_sim, ["anti", "neutral", "pro"]
    elif issue == 'Climate change':
        pairs, order = cc_source_sim, ["low", "neutral", "high"]

    panel = ax[issue_index]

    source_input = (
        pairs.groupby(input_keys)[value]
        .mean()
        .unstack()
        .reindex(index=order, columns=order)
    )
    # Average within every search-history pairing first, then take the SD of
    # those means per user-input combination.
    per_history_means = (
        pairs.groupby(input_keys + ['search_history1', 'search_history2'])[value]
        .mean()
        .reset_index(name='mean')
    )
    std_input = (
        per_history_means.groupby(input_keys)['mean']
        .std()
        .unstack()
        .reindex(index=order, columns=order)
    )

    # Hide everything above the diagonal; the diagonal itself stays visible.
    mask = np.triu(np.ones_like(source_input, dtype=bool), k=1)
    sns.heatmap(source_input, mask=mask, cmap='coolwarm', vmin=0, vmax=1,
                cbar=True, cbar_ax=cbar_ax, ax=panel)

    # Annotate the unmasked cells only, in row-major order.
    for i, j2 in np.argwhere(~mask):
        val = source_input.iloc[i, j2]
        std_val = std_input.iloc[i, j2]
        panel.text(j2 + 0.5, i + 0.5, f"{val:.2f}\n({std_val:.2f})",
                   ha='center', va='center', fontsize=14)

    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.tick_params(axis='x', labelsize=14)
    panel.tick_params(axis='y', labelsize=14)

# Section headers above the two panels.
fig.text(0.25, 1, 'Immigration', fontsize=15, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=15, ha='center')

# Redraw the shared colour bar horizontally, taking the mappable from the first panel.
cbar = plt.colorbar(ax[0].collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.savefig('sim_figures/sourcesim_input_matrix_std_irbo95.eps', format='eps', bbox_inches='tight')
plt.show()


### Appendix figure: Source + Content -- search history

In [None]:
# Appendix figure: mean dissimilarity between search-history conditions.
# Rows are the user-input levels; for each issue the first column shows
# IRBO (p = 0.8) from the source table and the second ICS from the content table.
fig, axes = plt.subplots(3, 4, figsize=(15, 15))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes under the grid

values = ['irbo0.8_value', 'icos_value']
value_names = ['IRBO (p = 0.8)', 'ICS']

# Per issue: (score column -> pairwise table, row levels, search-history order).
history_specs = {
    'Immigration': (
        {'irbo0.8_value': im_source_sim, 'icos_value': im_content_sim},
        im_user_inputs,
        ['anti', 'neutral', 'pro', 'mixed', 'unrel.', 'none'],
    ),
    'Climate change': (
        {'irbo0.8_value': cc_source_sim, 'icos_value': cc_content_sim},
        cc_user_inputs,
        ['low', 'neutral', 'high', 'mixed', 'unrel.', 'none'],
    ),
}

for issue_index, issue in zip([0, 2], issues):
    tables, user_input_values, sh_order = history_specs[issue]

    # rows
    for i, user_input in enumerate(user_input_values):
        # columns
        for j, value in enumerate(values):
            tmp = tables[value]
            # Keep only pairs in which both SERPs share this user-input level.
            same_input = (tmp['user_input1'] == user_input) & (tmp['user_input2'] == user_input)
            df_filtered = tmp[same_input]
            ax = axes[i, issue_index + j]

            source_history = (
                df_filtered
                .groupby(['search_history1', 'search_history2'])[value]
                .mean()
                .unstack()
                .reindex(index=sh_order, columns=sh_order)
            )
            # Mask the upper triangle, excluding the diagonal.
            mask = np.triu(np.ones_like(source_history, dtype=bool), k=1)
            sns.heatmap(source_history, cmap=cmap, cbar=True, annot=True, fmt='.2f',
                        annot_kws={"size": 12}, ax=ax,
                        vmin=0, vmax=1, cbar_ax=cbar_ax, mask=mask)
            ax.set_ylabel('')
            ax.set_xlabel('')
            ax.tick_params(axis='y', labelsize=12)
            ax.tick_params(axis='x', labelsize=12)

            if i == 0:
                ax.set_title(value_names[j], fontsize=15)  # column titles on the top row only

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

# Row titles: the first column of each issue's pair of columns names the level.
for row_axes, im_label, cc_label in zip(axes, im_user_inputs, cc_user_inputs):
    row_axes[0].set_ylabel(im_label, fontsize=15)
    row_axes[2].set_ylabel(cc_label, fontsize=15)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=15, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=15, ha='center')

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/sourcesim_hist_matrix_irbo_content.eps', format='eps', bbox_inches='tight')
plt.show()

## Query selection
Compare the query selection amongst each other, per user input group. Sample = no search history.

### Source similarity

In [None]:
# Attach the test number of each SERP in a pair: look up run_uid1 and run_uid2
# in im_html and store the matches as test_num1 and test_num2. Left joins keep
# every pair, including those without a match.
im_source_sim = (
    im_source_sim
    .merge(im_html[['run_uid', 'test_num']], left_on='run_uid1', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num1'})
    .merge(im_html[['run_uid', 'test_num']], left_on='run_uid2', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num2'})
)

In [None]:
im_source_sim.head()

In [None]:
# Attach the test number of each SERP in a pair: look up run_uid1 and run_uid2
# in cc_html and store the matches as test_num1 and test_num2. Left joins keep
# every pair, including those without a match.
cc_source_sim = (
    cc_source_sim
    .merge(cc_html[['run_uid', 'test_num']], left_on='run_uid1', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num1'})
    .merge(cc_html[['run_uid', 'test_num']], left_on='run_uid2', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num2'})
)

In [None]:
cc_source_sim.head()

In [None]:
# Re-derive the no-browser-history (clean slate) subsets from the merged
# tables so that they carry the test_num1 / test_num2 columns.
im_source_sim_none = im_source_sim.loc[
    im_source_sim['search_history1'].eq('none') & im_source_sim['search_history2'].eq('none')
]
cc_source_sim_none = cc_source_sim.loc[
    cc_source_sim['search_history1'].eq('none') & cc_source_sim['search_history2'].eq('none')
]
print(im_source_sim_none['run_uid1'].nunique(), cc_source_sim_none['run_uid1'].nunique())

In [None]:
# Plot settings for the query-selection figures below. These restore the
# source-similarity values after earlier cells rebound `values`/`value_names`.
issues = ['Immigration', 'Climate change']
# Row order of the user-input levels; the commented lines are the reversed alternative.
#im_user_inputs = ['pro', 'neutral', 'anti']
im_user_inputs = ['anti', 'neutral', 'pro']
#cc_user_inputs = ['high', 'neutral', 'low']
cc_user_inputs = ['low', 'neutral', 'high']
# Score columns to plot and the matching panel titles (same order).
values = ['irbo0.95_value', 'irbo0.8_value']
value_names = ['IRBO (p = 0.95)', 'IRBO (p = 0.8)']

In [None]:
# Figure: mean source dissimilarity (IRBO) between test numbers, using only
# the no-search-history pairs. Rows are the user-input levels; columns 0-1
# show immigration and columns 2-3 climate change, one panel per score column.
fig, axes = plt.subplots(3, 4, figsize=(15, 15))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes under the grid

# Per issue: (no-history pairwise table, row levels).
query_specs = {
    'Immigration': (im_source_sim_none, im_user_inputs),
    'Climate change': (cc_source_sim_none, cc_user_inputs),
}

for issue_index, issue in zip([0, 2], issues):
    df, user_input_values = query_specs[issue]

    # rows
    for i, user_input in enumerate(user_input_values):
        # Keep only pairs in which both SERPs share this user-input level.
        same_input = (df['user_input1'] == user_input) & (df['user_input2'] == user_input)
        df_filtered = df[same_input]

        # columns
        for j, value in enumerate(values):
            ax = axes[i, issue_index + j]

            source_history = df_filtered.groupby(['test_num1', 'test_num2'])[value].mean().unstack()
            # Mask the upper triangle, excluding the diagonal.
            # NOTE(review): the pivot is not reindexed, so this assumes rows and
            # columns carry the same test_num levels in the same order -- confirm.
            mask = np.triu(np.ones_like(source_history, dtype=bool), k=1)
            sns.heatmap(source_history, cmap=cmap, cbar=True, annot=True, fmt='.2f', ax=ax,
                        vmin=0, vmax=1, cbar_ax=cbar_ax, mask=mask)
            ax.set_ylabel('')
            ax.set_xlabel('')

            if i == 0:
                ax.set_title(value_names[j])  # column titles on the top row only

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)

# Row titles: the first column of each issue's pair of columns names the level.
for row_axes, im_label, cc_label in zip(axes, im_user_inputs, cc_user_inputs):
    row_axes[0].set_ylabel(im_label, fontsize=14)
    row_axes[2].set_ylabel(cc_label, fontsize=14)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=14, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=14, ha='center')

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/sourcesim_query_matrix.eps', format='eps', bbox_inches='tight')
plt.show()

### Content similarity

In [None]:
# Attach the test number of each SERP in a pair: look up run_uid1 and run_uid2
# in im_html and store the matches as test_num1 and test_num2. Left joins keep
# every pair, including those without a match.
im_content_sim = (
    im_content_sim
    .merge(im_html[['run_uid', 'test_num']], left_on='run_uid1', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num1'})
    .merge(im_html[['run_uid', 'test_num']], left_on='run_uid2', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num2'})
)

In [None]:
im_content_sim.head()

In [None]:
# Attach the test number of each SERP in a pair: look up run_uid1 and run_uid2
# in cc_html and store the matches as test_num1 and test_num2. Left joins keep
# every pair, including those without a match.
cc_content_sim = (
    cc_content_sim
    .merge(cc_html[['run_uid', 'test_num']], left_on='run_uid1', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num1'})
    .merge(cc_html[['run_uid', 'test_num']], left_on='run_uid2', right_on='run_uid', how='left')
    .rename(columns={'test_num': 'test_num2'})
)

In [None]:
cc_content_sim.head()

In [None]:
# Re-derive the no-browser-history (clean slate) subsets from the merged
# tables so that they carry the test_num1 / test_num2 columns.
im_content_sim_none = im_content_sim.loc[
    im_content_sim['search_history1'].eq('none') & im_content_sim['search_history2'].eq('none')
]
cc_content_sim_none = cc_content_sim.loc[
    cc_content_sim['search_history1'].eq('none') & cc_content_sim['search_history2'].eq('none')
]
print(im_content_sim_none['run_uid1'].nunique(), cc_content_sim_none['run_uid1'].nunique())

In [None]:
# Figure: mean content dissimilarity (icos_value) between test numbers, using
# only the no-search-history pairs; one column per issue, one row per
# user-input level.
fig, axes = plt.subplots(3, 2, figsize=(10, 15))
cbar_ax = fig.add_axes([0.15, 0.07, 0.7, 0.02])  # shared colour-bar axes under the grid

for col, issue in enumerate(issues):
    if issue == 'Immigration':
        pairs, row_levels = im_content_sim_none, im_user_inputs
    elif issue == 'Climate change':
        pairs, row_levels = cc_content_sim_none, cc_user_inputs

    for row, level in enumerate(row_levels):
        ax = axes[row, col]

        # Pairs in which both SERPs come from the same user-input level.
        both_at_level = pairs['user_input1'].eq(level) & pairs['user_input2'].eq(level)
        grid = (
            pairs[both_at_level]
            .groupby(['test_num1', 'test_num2'])['icos_value']
            .mean()
            .unstack()
        )
        # Hide everything above the diagonal; the diagonal itself stays visible.
        upper = np.triu(np.ones_like(grid, dtype=bool), k=1)
        sns.heatmap(grid, mask=upper, cmap=cmap, vmin=0, vmax=1,
                    annot=True, fmt='.2f', cbar=True, cbar_ax=cbar_ax, ax=ax)

        ax.set_xlabel('')
        ax.set_ylabel(level, fontsize=14)  # row label = user-input level
        if row == 0:
            ax.set_title(issue, fontsize=14)  # column titles on the top row only

# Redraw the shared colour bar horizontally from the last panel drawn.
cbar = plt.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
cbar.set_label('', fontsize=14)
plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the colour bar
plt.savefig('sim_figures/contentsim_query_matrix.eps', format='eps', bbox_inches='tight')
plt.show()

## Query selection -- Content + User -- Dot plot
Compare the dissimilarity among SERPs from the same query with the dissimilarity between SERPs from different queries.


In [None]:
# Build one small summary table per (issue, user-input level, score column):
# the mean dissimilarity of no-history pairs that share a test number
# ('Same query') versus pairs that do not ('Different query').
# `results_df` is left as a list of frames; the next cell concatenates it.
values = ['irbo0.8_value', 'icos_value']

# (issue label, user-input levels, score column -> no-history pairwise table)
query_sources = [
    ('Immigration', im_user_inputs,
     {'irbo0.8_value': im_source_sim_none, 'icos_value': im_content_sim_none}),
    ('Climate Change', cc_user_inputs,
     {'irbo0.8_value': cc_source_sim_none, 'icos_value': cc_content_sim_none}),
]

results_df = []
for issue_label, user_inputs, tables in query_sources:
    for user_input in user_inputs:
        for value in values:
            pairs = tables[value]
            # Keep only pairs in which both SERPs share this user-input level.
            same_input = (pairs['user_input1'] == user_input) & (pairs['user_input2'] == user_input)
            df_filtered = pairs[same_input].copy()

            # NOTE(review): a missing test_num compares unequal, so such rows
            # are counted as 'Different query' -- confirm none are missing.
            same_query = df_filtered['test_num1'] == df_filtered['test_num2']
            df_filtered['group'] = np.where(same_query, 'Same query', 'Different query')

            query_selection = df_filtered.groupby('group')[value].mean().reset_index(name='value')
            query_selection['issue'] = issue_label
            query_selection['value_type'] = value
            query_selection['user_input'] = user_input

            results_df.append(query_selection)

In [None]:
# Stack the per-condition summaries into one long table. Every input frame is
# indexed 0..1, so ignore_index gives each row a unique label instead of
# repeating 0/1 throughout the result.
results_df = pd.concat(results_df, ignore_index=True)

In [None]:
# Dot plot: mean dissimilarity for 'Same query' vs 'Different query' pairs per
# user-input level. Columns 0-1 show immigration, 2-3 climate change; within
# each issue the panels follow the order of `values`.
fig, axes = plt.subplots(1, 4, figsize=(15, 5), sharey=True)
value_names = ['IRBO (p = 0.8)', 'ICS']

# Local list on purpose: results_df labels the second issue 'Climate Change',
# whereas the notebook-level `issues` uses 'Climate change', which earlier
# cells match on. Rebinding `issues` here would break re-runs of those cells.
dot_issues = ['Immigration', 'Climate Change']

for issue_index, issue in zip([0, 2], dot_issues):

    # columns
    for j, value in enumerate(values):
        column_index = issue_index + j
        ax = axes[column_index]

        panel = results_df[(results_df['value_type'] == value) & (results_df['issue'] == issue)]
        # `kind` is not a pointplot parameter, and `legend=False` is dropped so
        # the per-axes legend exists and its handles can be reused below.
        # hue_order pins the colour assignment so it is the same on every panel.
        # NOTE(review): `ci`/`join` are kept from the original call; newer
        # seaborn releases prefer errorbar=None / linestyle='none' -- confirm
        # against the installed version.
        sns.pointplot(data=panel, x='user_input', y='value', hue='group',
                      hue_order=['Different query', 'Same query'],
                      ci=None, join=False, ax=ax)

        # Take the legend entries from the first panel for the shared legend.
        if column_index == 0:
            handles, labels = ax.get_legend_handles_labels()
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

        ax.set_ylabel('')
        ax.set_xlabel('')
        ax.tick_params(axis='y', labelsize=12)
        ax.tick_params(axis='x', labelsize=12)
        ax.set_title(value_names[j], fontsize=15)
        ax.set_ylim(0, 1)

# Section headers centred over each issue's pair of columns.
fig.text(0.25, 1, 'Immigration', fontsize=15, ha='center')
fig.text(0.75, 1, 'Climate Change', fontsize=15, ha='center')
fig.legend(handles, labels, title='Query selection', loc='upper center',
           bbox_to_anchor=(0.5, 0.05), ncol=2, fontsize=12, title_fontsize=12)

plt.tight_layout(rect=[0, 0.1, 1, 1])  # leave room at the bottom for the legend
plt.savefig('sim_figures/query_dotplot.eps', format='eps', bbox_inches='tight')
plt.show()
