# Exploratory Data Analysis

**Section 2, Appendix B:** Language, country, prompt-length, turn-count, and intent distributions of Search Arena prompts.

In [None]:
import numpy as np
import pandas as pd
import pycountry

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import shutil
from datasets import load_dataset
from huggingface_hub import hf_hub_download

### Loading Datasets

In [None]:
# Read the Search Arena conversations from a JSON-lines file (one record per line).
search_arena = pd.read_json(
    "../data/search_arena_24k.jsonl",
    lines=True,
    orient="records",
)
print(f"Number of rows: {len(search_arena)}")
search_arena.head()

In [None]:
# Read the SimpleQA test set directly from the public simple-evals CSV.
simpleqa_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
simpleqa = pd.read_csv(simpleqa_url)
print(f"Number of rows: {len(simpleqa)}")
simpleqa.head()

In [None]:
import base64
import hashlib

def derive_key(password: str, length: int) -> bytes:
    """Build a ``length``-byte key by cycling the SHA-256 digest of ``password``.

    The digest is repeated as many whole times as fit into ``length`` and a
    prefix of it is appended to fill the remainder.
    """
    digest = hashlib.sha256(password.encode()).digest()
    whole_repeats, remainder = divmod(length, len(digest))
    return digest * whole_repeats + digest[:remainder]

def decrypt(ciphertext_b64: str, password: str) -> str:
    """Decode a base64 ciphertext and undo its XOR mask, returning text.

    The mask is the key produced by ``derive_key`` for ``password``,
    stretched to the length of the decoded ciphertext.
    """
    cipher_bytes = base64.b64decode(ciphertext_b64)
    mask = derive_key(password, len(cipher_bytes))
    plain = bytearray(len(cipher_bytes))
    for position, (cipher_byte, mask_byte) in enumerate(zip(cipher_bytes, mask)):
        plain[position] = cipher_byte ^ mask_byte
    return plain.decode()

# Download the BrowseComp test set. Its "problem" and "answer" columns are
# stored encrypted, and each row's "canary" value is the password for that row.
browsecomp = pd.read_csv("https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv")

# Decrypt whole columns at once. This avoids feeding index labels to `.iloc`
# (only correct for a default RangeIndex) and avoids one `.loc` write per cell.
for column in ("problem", "answer"):
    browsecomp[column] = [
        decrypt(ciphertext, canary)
        for ciphertext, canary in zip(browsecomp[column], browsecomp["canary"])
    ]
print(f"Number of rows: {len(browsecomp)}")
browsecomp.head()

In [None]:
# Fetch both CORAL splits (test first, then train). Each file is downloaded
# under ../data/<split>/, moved to ../data/coral_<split>.json, and the
# download directory is then removed.
for split in ("test", "train"):
    path = hf_hub_download(
        repo_id="ariya2357/CORAL",
        filename=f"{split}/new_{split}_conversation.json",
        repo_type="dataset",
        local_dir="../data/",
    )
    shutil.move(path, f"../data/coral_{split}.json")
    shutil.rmtree(f"../data/{split}/")

coral_train = pd.read_json("../data/coral_train.json")
coral_test = pd.read_json("../data/coral_test.json")

# Stack the two splits and record the number of turns in each conversation.
coral = pd.concat([coral_train, coral_test]).assign(
    turn=lambda frame: frame["turns"].str.len()
)
coral.head()

In [None]:
# Load WildChat, keep only the columns used below, and draw a fixed-seed
# sample of 100,000 conversations as a pandas DataFrame.
wildchat = (
    load_dataset("allenai/WildChat-4.8M", split="train", cache_dir="/data/mmiroyan/search-arena/hf_data/")
    .select_columns(["conversation", "language", "model", "turn"])
    .shuffle(seed=42)
    .select(range(100000))
    .to_pandas()
)
wildchat.head()

### Language distribution

In [None]:
# Share of conversations per primary language: the first entry of each row's
# `languages` list, or None when that list is empty.
first_language = search_arena['languages'].apply(
    lambda langs: None if len(langs) == 0 else langs[0]
)
first_language.value_counts(normalize=True)

In [None]:
# Primary language of each conversation: the first entry of its `languages`
# list, or None when the list is empty.
languages = search_arena['languages'].apply(
    lambda langs: None if len(langs) == 0 else langs[0]
)

# One row per language with its number of conversations.
language_counts = languages.value_counts().reset_index()
language_counts.columns = ['Language', 'Count']
language_counts['LogCount'] = np.log10(language_counts['Count'])

# Taken before trimming, so the percentage labels are relative to every
# counted language rather than to the 15 that are plotted.
total_count = language_counts['Count'].sum()

# Keep the 15 most frequent languages, ordered from least to most frequent.
language_counts = language_counts.sort_values(by='Count', ascending=True).tail(15)

language_counts['Percentage'] = 100 * (language_counts['Count'] / total_count)
language_counts['Label'] = [f"{share:.1f}%" for share in language_counts['Percentage']]

nice_blue = "#268bd2"

fig = px.bar(
    language_counts,
    y='Language',
    x='Count',
    text=language_counts['Label'],
    orientation='h',
    opacity=0.85,
    color_discrete_sequence=[nice_blue],
)

fig.update_traces(
    cliponaxis=False,
    textangle=0,
    textfont={'size': 15},
    textposition='outside',
)

# NOTE(review): the decade ticks computed here are not used by the layout
# below, which sets a fixed tick list instead.
smallest_count = language_counts['Count'].min()
largest_count = language_counts['Count'].max()
min_tick = int(np.floor(np.log10(smallest_count)))
max_tick = int(np.ceil(np.log10(largest_count)))
tick_vals = [10**exponent for exponent in range(min_tick, max_tick + 1)]

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title={
        'text': 'Count (log scale)',
        'font': {'size': 18},
    },
    yaxis_title="",
    xaxis_type='log',
    xaxis={
        'tickvals': [100, 1000, 10000],
        'ticktext': ["100", "1000", "10000"],
        # On a log axis the range is given in log10 units.
        'range': [1.8, np.log10(24000)],
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': True,
        'gridcolor': 'lightgray',
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    yaxis={
        'tickfont': {'size': 18, 'color': 'black'},
        'showgrid': False,
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    font={'size': 14, 'color': 'black'},
    width=800,
    height=500,
    margin={'l': 0, 'r': 20, 't': 20, 'b': 60},
)

fig.show()
fig.write_image('plots/language_dist.pdf')

### Country distribution

In [None]:
# Client country of each conversation, read from system A's metadata; rows
# whose metadata is not a dict yield None.
country_info = search_arena['system_a_metadata'].apply(
    lambda meta: meta.get('client_country') if isinstance(meta, dict) else None
)
# One row per country code with its number of conversations.
country_stats = (
    country_info.value_counts()
    .reset_index()
    .set_axis(['country_alpha2', 'count'], axis=1)
)

def alpha2_to_alpha3(code):
    """Convert an ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Returns ``None`` when ``pycountry`` has no country for ``code``.
    """
    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # Some pycountry versions raise (KeyError) for an unknown code rather
        # than returning None; treat both outcomes as "no match".
        return None
    return None if country is None else country.alpha_3
    
# Map every alpha-2 code to alpha-3 (the format the choropleth is given) and
# drop codes that could not be converted.
country_stats['country'] = [alpha2_to_alpha3(code) for code in country_stats['country_alpha2']]
country_stats = country_stats.dropna(subset=['country'])
num_countries = country_stats['country'].nunique()
print(f"Number of unique countries: {num_countries}")

total_count = country_stats['count'].sum()
# Colour by log10 of the conversation count.
country_stats['log_count'] = np.log10(country_stats['count'])

fig = px.choropleth(
    country_stats,
    locationmode="ISO-3",
    locations="country",
    color=country_stats['log_count'],
    color_continuous_scale="YlGnBu",
    hover_data=["count"],
    labels={'log_count': 'log₁₀(#Convs)'},
)

fig.update_layout(
    width=1000,
    height=470,
    margin=dict(r=10, t=10, l=10, b=10),
    coloraxis_colorbar={
        'title': {'text': "#Convs", 'font': {'size': 12}},
        # Tick positions are log10 values; the text shows the matching raw counts.
        'tickvals': [0, 1, 2, 3, 4],
        'ticktext': ["1", "10", "10²", "10³", "10⁴"],
        'tickfont': {'size': 10},
        'len': 0.95,
        'yanchor': "middle",
        'y': 0.5,
        'xanchor': "left",
    },
)

fig.show()
fig.write_image("plots/country_dist.pdf")

### Prompt length distribution

In [None]:
def get_lengths(prompt):
    """Return the number of whitespace-separated words in ``prompt``."""
    words = prompt.split()
    return len(words)

# Plot colour per dataset.
colors = {
    "Search Arena": "#268bd2",
    "SimpleQA": "#2ca02c",
    "BrowseComp": "#9467bd",
}

# Word count of the first message of every Search Arena conversation and of
# every BrowseComp / SimpleQA problem.
sa_lengths = [get_lengths(messages[0]['content']) for messages in search_arena['messages_a']]
bc_lengths = [get_lengths(problem) for problem in browsecomp['problem']]
sq_lengths = [get_lengths(problem) for problem in simpleqa['problem']]

plt.figure(figsize=(7, 5))
sns.set_theme(style="whitegrid")

# Cut the x-axis at the 99.5th percentile of the pooled lengths.
max_x = np.percentile(sa_lengths + bc_lengths + sq_lengths, 99.5)
use_smoothing = True
show_stats = True

# (legend label, word counts, colour), in drawing order.
series = [
    ('Search Arena', sa_lengths, colors['Search Arena']),
    ('Browse Comp', bc_lengths, colors['BrowseComp']),
    ('SimpleQA', sq_lengths, colors['SimpleQA']),
]

for label, values, shade in series:
    if use_smoothing:
        sns.kdeplot(values, label=label, color=shade, fill=True, alpha=0.25, linewidth=2, clip=(0, max_x))
    else:
        plt.hist(values, bins=50, label=label, color=shade, alpha=0.25, density=True)

if show_stats:
    for name, lengths, color in series:
        mean = np.mean(lengths)
        median = np.median(lengths)  # computed but not drawn

        # Dashed vertical line at each dataset's mean length.
        plt.axvline(
            mean,
            color=color,
            linestyle='--',
            linewidth=2,
            alpha=0.8,
            label=f'{name} mean: {mean:.1f}',
        )

plt.yscale('log')
plt.xlim(1, max_x)
plt.xlabel('Prompt Length (words)', fontsize=16, labelpad=10, color='black')
plt.ylabel('Density', fontsize=16, labelpad=10, color='black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(
    fontsize=12,
    frameon=True,
    fancybox=True,
    framealpha=0.9,
    facecolor='white',
    edgecolor='black',
    labelcolor='black',
)
plt.tight_layout()
plt.savefig('plots/prompt_length_dist.pdf', dpi=300, bbox_inches='tight')
plt.show()

def stats(name, arr):
    """Print one table row with the mean, std, min and max of ``arr``.

    Min and max are formatted as integers.
    """
    mean, std = np.mean(arr), np.std(arr)
    smallest, largest = np.min(arr), np.max(arr)
    print(f"{name:12} | Mean: {mean:7.2f} | Std: {std:7.2f} | Min: {smallest:4d} | Max: {largest:5d}")

# Summary table: one row per dataset.
print("Prompt Length Statistics")
print("-" * 60)
for dataset_name, dataset_lengths in (
    ("Search Arena", sa_lengths),
    ("Browse Comp", bc_lengths),
    ("SimpleQA", sq_lengths),
):
    stats(dataset_name, dataset_lengths)

In [None]:
def get_lengths(prompt):
    """Count the whitespace-separated words in ``prompt``."""
    tokens = prompt.split()
    return len(tokens)

# Plot colour per dataset.
colors = {
    "Search Arena": "#268bd2",
    "CORAL": "#fe9194",
}

# Word count of the first message of every Search Arena conversation and of
# the first question of every CORAL conversation.
sa_lengths = [get_lengths(messages[0]['content']) for messages in search_arena['messages_a']]
coral_lengths = [get_lengths(turns[0]['question']) for turns in coral["turns"]]

plt.figure(figsize=(7, 5))
sns.set_theme(style="whitegrid")

# Cut the x-axis at the 99.5th percentile of the pooled lengths.
max_x = np.percentile(sa_lengths + coral_lengths, 99.5)
use_smoothing = True
show_stats = True

# (legend label, word counts, colour), in drawing order.
series = [
    ('Search Arena', sa_lengths, colors['Search Arena']),
    ('CORAL', coral_lengths, colors['CORAL']),
]

for label, values, shade in series:
    if use_smoothing:
        sns.kdeplot(values, label=label, color=shade, fill=True, alpha=0.25, linewidth=2, clip=(0, max_x))
    else:
        plt.hist(values, bins=50, label=label, color=shade, alpha=0.25, density=True)

if show_stats:
    for name, lengths, color in series:
        mean = np.mean(lengths)
        median = np.median(lengths)  # computed but not drawn

        # Dashed vertical line at each dataset's mean length.
        plt.axvline(
            mean,
            color=color,
            linestyle='--',
            linewidth=2,
            alpha=0.8,
            label=f'{name} mean: {mean:.1f}',
        )

plt.yscale('log')
plt.xlim(1, max_x)
plt.xlabel('Prompt Length (words)', fontsize=16, labelpad=10, color='black')
plt.ylabel('Density', fontsize=16, labelpad=10, color='black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(
    fontsize=12,
    frameon=True,
    fancybox=True,
    framealpha=0.9,
    facecolor='white',
    edgecolor='black',
    labelcolor='black',
)
plt.tight_layout()
plt.savefig('plots/coral_prompt_length_dist.pdf', dpi=300, bbox_inches='tight')
plt.show()

def stats(name, arr):
    """Print a single summary row (mean, std, min, max) for ``arr``.

    Min and max are formatted as integers.
    """
    average, spread = np.mean(arr), np.std(arr)
    low, high = np.min(arr), np.max(arr)
    print(f"{name:12} | Mean: {average:7.2f} | Std: {spread:7.2f} | Min: {low:4d} | Max: {high:5d}")

# Summary table: one row per dataset.
print("Prompt Length Statistics")
print("-" * 60)
for dataset_name, dataset_lengths in (
    ("Search Arena", sa_lengths),
    ("CORAL", coral_lengths),
):
    stats(dataset_name, dataset_lengths)

In [None]:
def get_lengths(prompt):
    """Return how many whitespace-separated words ``prompt`` contains."""
    pieces = prompt.split()
    return len(pieces)

# Plot colour per dataset.
colors = {
    'Search Arena': '#268bd2',
    'WildChat': '#f0529c'
}

# Word count of the first message of every Search Arena / WildChat conversation.
sa_lengths = [get_lengths(messages[0]['content']) for messages in search_arena['messages_a']]
wildchat_lengths = [get_lengths(conversation[0]['content']) for conversation in wildchat["conversation"]]

plt.figure(figsize=(7, 5))
sns.set_theme(
    style="whitegrid",
)

# Cut the x-axis at the 99.5th percentile of the two datasets shown in this
# figure. The cutoff must be computed from the WildChat lengths, not from the
# CORAL lengths left over from another cell.
max_x = np.percentile(sa_lengths + wildchat_lengths, 99.5)
use_smoothing = True
show_stats = True

if use_smoothing:
    sns.kdeplot(sa_lengths, label='Search Arena', color=colors['Search Arena'], fill=True, alpha=0.25, linewidth=2, clip=(0, max_x))
    sns.kdeplot(wildchat_lengths, label='WildChat', color=colors['WildChat'], fill=True, alpha=0.25, linewidth=2, clip=(0, max_x))
else:
    plt.hist(sa_lengths, bins=50, label='Search Arena', color=colors['Search Arena'], alpha=0.25, density=True)
    plt.hist(wildchat_lengths, bins=50, label='WildChat', color=colors['WildChat'], alpha=0.25, density=True)

if show_stats:
    for lengths, color, name in [(sa_lengths, colors['Search Arena'], 'Search Arena'),
                                (wildchat_lengths, colors['WildChat'], 'WildChat')]:
        mean = np.mean(lengths)

        # Dashed vertical line at each dataset's mean length.
        plt.axvline(mean, color=color, linestyle='--', linewidth=2, alpha=0.8,
                   label=f'{name} mean: {mean:.1f}')

plt.yscale('log')
plt.xlim(1, max_x)
plt.xlabel('Prompt Length (words)', fontsize=16, labelpad=10, color='black')
plt.xticks(fontsize=12)
plt.ylabel('Density', fontsize=16, labelpad=10, color='black')
plt.yticks(fontsize=12)
plt.legend(fontsize=12, frameon=True, fancybox=True, framealpha=0.9, facecolor='white', edgecolor='black', labelcolor='black')
plt.tight_layout()
plt.savefig('plots/wildchat_prompt_length_dist.pdf', dpi=300, bbox_inches='tight')
plt.show()

def stats(name, arr):
    """Print the mean, std, min and max of ``arr`` as one formatted row.

    Min and max are formatted as integers.
    """
    center, deviation = np.mean(arr), np.std(arr)
    minimum, maximum = np.min(arr), np.max(arr)
    print(f"{name:12} | Mean: {center:7.2f} | Std: {deviation:7.2f} | Min: {minimum:4d} | Max: {maximum:5d}")

# Summary table: one row per dataset.
print("Prompt Length Statistics")
print("-" * 60)
for dataset_name, dataset_lengths in (
    ("Search Arena", sa_lengths),
    ("WildChat", wildchat_lengths),
):
    stats(dataset_name, dataset_lengths)

In [None]:
# Prompts whose length is at or outside these quantiles are dropped as outliers.
UPPER_PERCENTILE = 0.99
LOWER_PERCENTILE = 0.01

intent_order = [
    "Factual Lookup",
    "Info Synthesis",
    "Analysis",
    "Recommendation",
    "Explanation",
    "Creative Generation",
    "Guidance",
    "Text Processing",
]
custom_colors = [
    "#bebada",
    "#ffffb3",
    "#8dd3c7",
    "#80b1d3",
    "#fb8072",
    "#fdb462",
    "#b3de69",
    "#e78ac3",
]
# Pair each intent with the colour at the same position.
intent_color_map = {intent: shade for intent, shade in zip(intent_order, custom_colors)}

# Word count of the first message of every conversation.
search_arena['prompt_length'] = search_arena['messages_a'].apply(
    lambda messages: len(messages[0]['content'].split())
)

# Filter out outliers: keep lengths strictly between the two quantiles.
prompt_lengths = search_arena['prompt_length']
lower_cut = prompt_lengths.quantile(LOWER_PERCENTILE)
upper_cut = prompt_lengths.quantile(UPPER_PERCENTILE)
filtered = search_arena[(prompt_lengths > lower_cut) & (prompt_lengths < upper_cut)].copy()

# Keep only rows whose intent is one of the plotted categories.
filtered = filtered[filtered['primary_intent'].isin(intent_order)]

df = filtered[['prompt_length', 'primary_intent']].rename(columns={'prompt_length': 'length'})

# Order the boxes by mean prompt length, longest first.
intent_means = df.groupby('primary_intent')['length'].mean().sort_values(ascending=False)
display(intent_means)
sorted_intents = list(intent_means.index)

fig = px.box(
    df,
    x='primary_intent',
    y='length',
    color='primary_intent',
    color_discrete_map=intent_color_map,
    category_orders={'primary_intent': sorted_intents},
    points=False,
    log_y=True,
)

# Hide the legend entry of every trace.
fig.update_traces(showlegend=False)

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title='',
    yaxis={
        'type': 'log',
        'tickfont': {'size': 16, 'color': 'black'},
        'tickvals': [1, 10, 100, 1000],
        'ticktext': ['1', '10', '100', '1000'],
        'showgrid': True,
        'gridcolor': 'lightgray',
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
        'title': {'text': 'Log(num words)', 'font': {'size': 18, 'color': 'black'}},
    },
    xaxis={
        'tickfont': {'size': 14, 'color': 'black'},
        'showgrid': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    font={'size': 13, 'color': 'black'},
    width=800,
    height=600,
    margin={'l': 60, 'r': 10, 't': 40, 'b': 80},
)

fig.show()
fig.write_image("plots/prompt_length_dist_intent.pdf")

### Number of turns distribution

In [None]:
# Bar colour for the Search Arena turn plot.
nice_blue = "#268bd2"

# Step 1: Bin the 'turn' column
def bin_turns(turn):
    """Return the bin label for a turn count.

    Counts above 6 share the label "6+"; any other count is its own label.
    """
    # NOTE(review): "6+" is applied to counts strictly greater than 6, while
    # exactly 6 keeps its own "6" label — confirm this labelling is intended.
    return "6+" if turn > 6 else str(turn)

turn_bins = search_arena['turn'].apply(bin_turns)

# Step 2: Count frequencies
turn_counts = turn_bins.value_counts().reset_index()
turn_counts.columns = ['Turns', 'Count']
# Ordered categories so the bars sort as 1..6 followed by "6+".
turn_labels = [*map(str, range(1, 7)), '6+']
turn_counts['Turns'] = pd.Categorical(turn_counts['Turns'], categories=turn_labels, ordered=True)
turn_counts = turn_counts.sort_values('Turns')
display(turn_counts)

# Step 3: Compute percentages
total = turn_counts['Count'].sum()
turn_counts['Percentage'] = 100 * (turn_counts['Count'] / total)
turn_counts['Label'] = [f"{share:.1f}%" for share in turn_counts['Percentage']]

# Step 4: Plot
fig = px.bar(
    turn_counts,
    x='Turns',
    y='Count',
    text=turn_counts['Label'],
    opacity=0.85,
    color_discrete_sequence=[nice_blue],
)

fig.update_traces(
    cliponaxis=False,
    textfont={'size': 14},
    textposition='outside',
)

# Step 5: Layout styling
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title={
        'text': 'Turns per Conversation',
        'font': {'size': 18},
    },
    yaxis_title={
        'text': 'Count (log scale)',
        'font': {'size': 18},
    },
    xaxis={
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    yaxis={
        'type': 'log',
        'tickvals': [10, 100, 1000, 10000],
        'ticktext': ["10", "100", "1k", "10k"],
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': True,
        'gridcolor': 'lightgray',
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    font={'size': 14, 'color': 'black'},
    width=800,
    height=500,
    margin={'l': 40, 'r': 20, 't': 30, 'b': 60},
)

fig.show()
fig.write_image('plots/turn_distribution.pdf')

In [None]:
# Bar colour for the CORAL turn plot.
color = "#fe9194"

# Step 1: Bin the 'turn' column
def bin_turns(turn):
    """Return the bin label for a turn count.

    Counts above 6 share the label "6+"; any other count is its own label.
    """
    # NOTE(review): "6+" is applied to counts strictly greater than 6, while
    # exactly 6 keeps its own "6" label — confirm this labelling is intended.
    return "6+" if turn > 6 else str(turn)

turn_bins = coral['turn'].apply(bin_turns)

# Step 2: Count frequencies
turn_counts = turn_bins.value_counts().reset_index()
turn_counts.columns = ['Turns', 'Count']
# Ordered categories so the bars sort as 1..6 followed by "6+".
turn_labels = [*map(str, range(1, 7)), '6+']
turn_counts['Turns'] = pd.Categorical(turn_counts['Turns'], categories=turn_labels, ordered=True)
turn_counts = turn_counts.sort_values('Turns')
display(turn_counts)

# Step 3: Compute percentages
total = turn_counts['Count'].sum()
turn_counts['Percentage'] = 100 * (turn_counts['Count'] / total)
turn_counts['Label'] = [f"{share:.1f}%" for share in turn_counts['Percentage']]

# Step 4: Plot
fig = px.bar(
    turn_counts,
    x='Turns',
    y='Count',
    text=turn_counts['Label'],
    opacity=0.85,
    color_discrete_sequence=[color],
)

fig.update_traces(
    cliponaxis=False,
    textfont={'size': 14},
    textposition='outside',
)

# Step 5: Layout styling
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title={
        'text': 'Turns per Conversation',
        'font': {'size': 18},
    },
    yaxis_title={
        'text': 'Count (log scale)',
        'font': {'size': 18},
    },
    xaxis={
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    yaxis={
        'type': 'log',
        'tickvals': [10, 100, 1000, 10000],
        'ticktext': ["10", "100", "1k", "10k"],
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': True,
        'gridcolor': 'lightgray',
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    font={'size': 14, 'color': 'black'},
    width=800,
    height=500,
    margin={'l': 40, 'r': 20, 't': 30, 'b': 60},
)

fig.show()
fig.write_image('plots/coral_turn_distribution.pdf')

In [None]:
# Bar colour for the WildChat turn plot.
color = "#f0529c"

# Step 1: Bin the 'turn' column
def bin_turns(turn):
    """Return the bin label for a turn count.

    Counts above 6 share the label "6+"; any other count is its own label.
    """
    # NOTE(review): "6+" is applied to counts strictly greater than 6, while
    # exactly 6 keeps its own "6" label — confirm this labelling is intended.
    return "6+" if turn > 6 else str(turn)

turn_bins = wildchat['turn'].apply(bin_turns)

# Step 2: Count frequencies
turn_counts = turn_bins.value_counts().reset_index()
turn_counts.columns = ['Turns', 'Count']
# Ordered categories so the bars sort as 1..6 followed by "6+".
turn_labels = [*map(str, range(1, 7)), '6+']
turn_counts['Turns'] = pd.Categorical(turn_counts['Turns'], categories=turn_labels, ordered=True)
turn_counts = turn_counts.sort_values('Turns')
display(turn_counts)

# Step 3: Compute percentages
total = turn_counts['Count'].sum()
turn_counts['Percentage'] = 100 * (turn_counts['Count'] / total)
turn_counts['Label'] = [f"{share:.1f}%" for share in turn_counts['Percentage']]

# Step 4: Plot
fig = px.bar(
    turn_counts,
    x='Turns',
    y='Count',
    text=turn_counts['Label'],
    opacity=0.85,
    color_discrete_sequence=[color],
)

fig.update_traces(
    cliponaxis=False,
    textfont={'size': 14},
    textposition='outside',
)

# Step 5: Layout styling
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title={
        'text': 'Turns per Conversation',
        'font': {'size': 18},
    },
    yaxis_title={
        'text': 'Count (log scale)',
        'font': {'size': 18},
    },
    xaxis={
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    yaxis={
        'type': 'log',
        'tickvals': [10, 100, 1000, 10000],
        'ticktext': ["10", "100", "1k", "10k"],
        'tickfont': {'size': 16, 'color': 'black'},
        'showgrid': True,
        'gridcolor': 'lightgray',
        'zeroline': False,
        'linecolor': 'black',
        'linewidth': 1.0,
    },
    font={'size': 14, 'color': 'black'},
    width=800,
    height=500,
    margin={'l': 40, 'r': 20, 't': 30, 'b': 60},
)

fig.show()
fig.write_image('plots/wildchat_turn_distribution.pdf')

### Intent distribution

In [None]:
# Conversations per primary intent. Percentages are taken over ALL intents
# (including 'Other'); 'Other' is then removed from what gets plotted.
# errors='ignore' keeps the cell working when no row is labelled 'Other'.
intent_counts = search_arena['primary_intent'].value_counts()
intent_percent = (intent_counts / intent_counts.sum() * 100).round(1).drop('Other', errors='ignore')
intent_counts = intent_counts.drop('Other', errors='ignore')

# Both series come from the same value_counts() result, so their rows line up.
df_plot = pd.DataFrame({
    'Intent': intent_counts.index,
    'Count': intent_counts.values,
    'Percent': intent_percent.values
})

# Bar colours, assigned to intents in the order they appear in df_plot.
custom_colors = [
    "#bebada",
    "#ffffb3",
    "#8dd3c7",
    "#80b1d3",
    "#fb8072",
    "#fdb462",
    "#b3de69",
    "#e78ac3",
]

fig = px.bar(
    df_plot,
    x='Intent',
    y='Count',
    color='Intent',
    text=df_plot['Percent'].astype(str) + '%',
    color_discrete_sequence=custom_colors
)

fig.update_traces(
    textposition='outside',
    marker_line_color='black',
    marker_line_width=1.5,
    textfont=dict(size=20, color='black')
)

fig.update_layout(
    plot_bgcolor='white',
    title=None,
    xaxis_title='',
    xaxis=dict(
        tickfont=dict(size=20, color='black'),
        tickangle=-45
    ),
    yaxis=dict(
        title=dict(text='Count', font=dict(size=22, color='black')),
        tickfont=dict(size=14, color='black'),
        # Fixed ticks at 0, 1000, ..., 4000.
        tickvals=list(range(0, 4500, 1000))
    ),
    showlegend=False,
    margin=dict(t=10, l=20, r=20, b=0),
    width=700,
    height=800
)

fig.show()
fig.write_image('plots/intent_dist.pdf')