In [20]:
import pandas as pd
import numpy as np
import altair as alt
from datetime import datetime

alt.themes.enable('default')
alt.data_transformers.disable_max_rows()

alt.themes.register('professional', lambda: {
    'config': {
        'view': {'continuousWidth': 600, 'continuousHeight': 400},
        'mark': {'tooltip': True},
        'axis': {
            'labelFontSize': 12,
            'titleFontSize': 14,
            'grid': True,
            'gridOpacity': 0.3
        },
        'legend': {
            'labelFontSize': 12,
            'titleFontSize': 14
        },
        'title': {
            'fontSize': 16,
            'anchor': 'start'
        }
    }
})
alt.themes.enable('professional')

print("Setup complete. Altair configured for professional visualizations.")

Setup complete. Altair configured for professional visualizations.


In [2]:
df = pd.read_csv('dataset_doc_quant_playground.csv')

print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\nColumn names and types:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()

Dataset shape: (60, 14)
Number of rows: 60
Number of columns: 14

Column names and types:
title                                                                        object
authors                                                                      object
year                                                                          int64
prior_user_research                                                          object
If study was used, type of study used                                        object
Unnamed: 5                                                                  float64
audiences                                                                    object
stakeholders_mentioned                                                       object
stakeholders_involved                                                        object
What is the precise context of use of the tool as presented by authors\n     object
tool_evaluation                                                       

Unnamed: 0,title,authors,year,prior_user_research,"If study was used, type of study used",Unnamed: 5,audiences,stakeholders_mentioned,stakeholders_involved,What is the precise context of use of the tool as presented by authors\n,tool_evaluation,degree_automation,tool_integration,tool_description
0,Data Statements | Tech Policy Lab,"McMillan-Major, Angelina; Bender, Emily M.",2023,No,This paper is about Data Statements version 3\...,,"dataset creators, dataset users",Yes,No,linguistic contexts primarily but could be bro...,No,Manual,No,toolkit
1,The Dataset Nutrition Label: A Framework To Dr...,"Holland, Sarah; Hosny, Ahmed; Newman, Sarah; J...",2018,Yes,I am saying yes even though the study was not ...,,dataset creators,Yes,Yes,Their intention overall is to be context-indep...,No,Hybrid,No,framework
2,Towards accountability for machine learning da...,"Hutchinson, Ben; Smart, Andrew; Hanna, Alex; D...",2021,No,,,dataset creators,Yes,No,"Artificial Ingeligence, \nMachine Learning dat...",No,Manual,No,framework
3,Understanding machine learning practitioners' ...,"Heger, Amy K.; Marquis, Liz B.; Vorvoreanu, Mi...",2022,,This is a n empirical study of existing datase...,,,No,No,,,,,
4,A generative benchmark creation framework for ...,"Fox, Daniel C.; Khatiwada, Aamod; Shraga, Roee",2024,No,,,dataset creators,No,No,"No precise context is described, there are onl...",No,Automated,No,framework


In [3]:
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

print("Missing Data Summary:")
print(missing_summary)
print(f"\nTotal rows in dataset: {len(df)}")

# Clean copy for analysis
df_clean = df.copy()

# Parse audiences column comma-separated values
def parse_audiences(audiences_str):
    if pd.isna(audiences_str):
        return []
    return [aud.strip() for aud in str(audiences_str).split(',') if aud.strip()]

df_clean['audiences_parsed'] = df_clean['audiences'].apply(parse_audiences)

audiences_flat = []
for idx, row in df_clean.iterrows():
    for audience in row['audiences_parsed']:
        audiences_flat.append({
            'index': idx,
            'audience': audience,
            'stakeholders_mentioned': row['stakeholders_mentioned'],
            'stakeholders_involved': row['stakeholders_involved'],
            'year': row['year']
        })

df_audiences = pd.DataFrame(audiences_flat)

print(f"\nUnique audience types found: {df_audiences['audience'].nunique()}")
print("\nAudience types:")
print(df_audiences['audience'].value_counts())

Missing Data Summary:
                                                                                               Column  \
Unnamed: 5                                                                                 Unnamed: 5   
If study was used, type of study used                           If study was used, type of study used   
tool_description                                                                     tool_description   
What is the precise context of use of the tool ...  What is the precise context of use of the tool...   
audiences                                                                                   audiences   
tool_integration                                                                     tool_integration   
tool_evaluation                                                                       tool_evaluation   
degree_automation                                                                   degree_automation   
authors                          

In [4]:
fig1_data = df_clean[['year', 'degree_automation', 'tool_description']].dropna()

fig1_agg = fig1_data.groupby(['year', 'degree_automation', 'tool_description']).size().reset_index(name='count')

print("Unique degree_automation values:", fig1_agg['degree_automation'].unique())

automation_colors = {
    'Manual': '#1f77b4',
    'Hybrid': '#ff7f0e',
    'Automated': '#2ca02c'
}

fig1_option_a = alt.Chart(fig1_agg).mark_bar().encode(
    x=alt.X('year:O', title='Year'),
    y=alt.Y('sum(count):Q', title='Number of Tools'),
    color=alt.Color('degree_automation:N', 
                    title='Degree of Automation',
                    scale=alt.Scale(domain=list(automation_colors.keys()), 
                                  range=list(automation_colors.values()))),
    column=alt.Column('tool_description:N', 
                      title='Tool Description',
                      header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
    tooltip=['year:O', 'degree_automation:N', 'tool_description:N', 'count:Q']
).properties(
    width=150,
    height=400,
    title='Temporal Distribution of Degree of Automation by Tool Type'
)

fig1_option_a

Unique degree_automation values: ['Hybrid' 'Manual' 'Automated']


In [5]:
fig1_option_b = alt.Chart(fig1_agg).mark_bar().encode(
    x=alt.X('year:O', title='Year'),
    y=alt.Y('count:Q', title='Number of Tools'),
    color=alt.Color('degree_automation:N', 
                    title='Degree of Automation',
                    scale=alt.Scale(domain=list(automation_colors.keys()), 
                                  range=list(automation_colors.values()))),
    xOffset='degree_automation:N',
    facet=alt.Facet('tool_description:N', 
                    title='Tool Description',
                    columns=4),
    tooltip=['year:O', 'degree_automation:N', 'tool_description:N', 'count:Q']
).properties(
    width=200,
    height=250,
    title='Figure 1B: Temporal Distribution of Tool Automation Degree by Tool Type'
)

fig1_option_b

In [6]:
pivot_data = fig1_data.groupby(['year', 'degree_automation', 'tool_description']).size().reset_index(name='count')

fig1_option_c = alt.Chart(pivot_data).mark_rect().encode(
    x=alt.X('year:O', title='Year'),
    y=alt.Y('degree_automation:N', title='Degree of Automation'),
    color=alt.Color('count:Q', 
                    title='Count',
                    scale=alt.Scale(scheme='blues')),
    facet=alt.Facet('tool_description:N', 
                    title='Tool Description',
                    columns=3),
    tooltip=['year:O', 'degree_automation:N', 'tool_description:N', 'count:Q']
).properties(
    width=200,
    height=150,
    title='Figure 1C: Temporal Heatmap of Tool Automation Degree by Tool Type'
)

fig1_option_c

In [7]:
fig2_data = df_clean[['stakeholders_mentioned', 'stakeholders_involved', 'audiences']].dropna()
df_audiences_clean = df_audiences.dropna(subset=['stakeholders_mentioned', 'stakeholders_involved'])

stakeholder_data = []
for audience in df_audiences_clean['audience'].unique():
    audience_subset = df_audiences_clean[df_audiences_clean['audience'] == audience]
    
    mentioned_yes = len(audience_subset[audience_subset['stakeholders_mentioned'] == 'Yes'])
    
    involved_yes = len(audience_subset[audience_subset['stakeholders_involved'] == 'Yes'])
    
    stakeholder_data.append({
        'audience': audience,
        'category': 'Mentioned',
        'count': mentioned_yes
    })
    stakeholder_data.append({
        'audience': audience,
        'category': 'Involved',
        'count': involved_yes
    })

df_stakeholder = pd.DataFrame(stakeholder_data)

# Sort audiences by total count
audience_order = df_stakeholder.groupby('audience')['count'].sum().sort_values(ascending=False).index.tolist()

stakeholder_colors = {
    'Mentioned': '#4c78a8',
    'Involved': '#e45756'
}

fig2_option_a = alt.Chart(df_stakeholder).mark_bar().encode(
    x=alt.X('audience:N', 
            title='Target Audience',
            sort=audience_order,
            axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count:Q', title='Number of Papers'),
    color=alt.Color('category:N', 
                    title='Stakeholder Status',
                    scale=alt.Scale(domain=list(stakeholder_colors.keys()), 
                                  range=list(stakeholder_colors.values()))),
    xOffset='category:N',
    tooltip=['audience:N', 'category:N', 'count:Q']
).properties(
    width=800,
    height=400,
    title='Figure 2A: Stakeholder Mention vs. Involvement by Target Audience'
)

fig2_option_a

In [8]:
fig2_option_b = alt.Chart(df_stakeholder).mark_bar().encode(
    x=alt.X('count:Q', title='Number of Papers'),
    y=alt.Y('audience:N', 
            title='Target Audience',
            sort=audience_order),
    color=alt.Color('category:N', 
                    title='Stakeholder Status',
                    scale=alt.Scale(domain=list(stakeholder_colors.keys()), 
                                  range=list(stakeholder_colors.values()))),
    facet=alt.Facet('category:N', 
                    title='Stakeholder Status',
                    columns=2),
    tooltip=['audience:N', 'category:N', 'count:Q']
).properties(
    width=350,
    height=400,
    title='Figure 2B: Stakeholder Mention vs. Involvement by Target Audience (Separated Views)'
)

fig2_option_b

In [9]:
gap_data = []
for audience in df_audiences_clean['audience'].unique():
    audience_subset = df_audiences_clean[df_audiences_clean['audience'] == audience]
    
    mentioned_yes = len(audience_subset[audience_subset['stakeholders_mentioned'] == 'Yes'])
    involved_yes = len(audience_subset[audience_subset['stakeholders_involved'] == 'Yes'])
    
    gap_data.append({
        'audience': audience,
        'mentioned': mentioned_yes,
        'involved': involved_yes,
        'gap': mentioned_yes - involved_yes,
        'involved_percentage': (involved_yes / mentioned_yes * 100) if mentioned_yes > 0 else 0
    })

df_gap = pd.DataFrame(gap_data)
df_gap = df_gap.sort_values('mentioned', ascending=True)

chart_data = df_gap.copy()

mentioned_df = chart_data[['audience', 'mentioned']].copy()
mentioned_df['category'] = 'Papers that Mentioned Stakeholders'
mentioned_df['value'] = mentioned_df['mentioned']

involved_df = chart_data[['audience', 'involved']].copy()
involved_df['category'] = 'Papers that Involved Stakeholders'  
involved_df['value'] = involved_df['involved']

legend_data = pd.concat([mentioned_df[['audience', 'category', 'value']], 
                        involved_df[['audience', 'category', 'value']]], 
                       ignore_index=True)

color_scale = alt.Scale(
    domain=['Papers that Mentioned Stakeholders', 'Papers that Involved Stakeholders'],
    range=['#4c78a8', '#e45756']
)

fig2_option_c = alt.Chart(legend_data).mark_bar().encode(
    x=alt.X('value:Q', 
            title='Number of Papers', 
            scale=alt.Scale(domain=[0, chart_data['mentioned'].max() + 2])),
    y=alt.Y('audience:N', 
            title='Target Audience',
            sort=alt.EncodingSortField(field='value', op='max', order='descending')),
    color=alt.Color('category:N',
                    title='Stakeholder Status',
                    scale=color_scale,
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=11,
                        padding=10
                    )),
    opacity=alt.condition(
        alt.datum.category == 'Papers that Mentioned Stakeholders',
        alt.value(1.0),
        alt.value(1.0)
    ),
    tooltip=['audience:N', 'category:N', 'value:Q']
).properties(
    width=600,
    height=400,
    title='Figure 2C: Gap Between Stakeholder Mention and Involvement by Audience'
)

text_labels = alt.Chart(chart_data).mark_text(
    align='left', 
    dx=3, 
    fontSize=10,
    color='black'
).encode(
    x=alt.X('mentioned:Q'),
    y=alt.Y('audience:N', sort=alt.EncodingSortField(field='mentioned', order='descending')),
    text=alt.Text('mentioned:Q')
)

text_labels_involved = alt.Chart(chart_data).mark_text(
    align='left', 
    dx=3, 
    fontSize=10,
    color='white',
    fontWeight='bold'
).encode(
    x=alt.X('involved:Q'),
    y=alt.Y('audience:N', sort=alt.EncodingSortField(field='mentioned', order='descending')),
    text=alt.condition(alt.datum.involved > 0, alt.Text('involved:Q'), alt.value(''))
)

fig2_option_c = fig2_option_c + text_labels + text_labels_involved

fig2_option_c

In [10]:

stakeholder_data = df_clean[['stakeholders_mentioned', 'stakeholders_involved']].dropna()

comparison_data = []

mentioned_yes = len(stakeholder_data[stakeholder_data['stakeholders_mentioned'] == 'Yes'])
mentioned_no = len(stakeholder_data[stakeholder_data['stakeholders_mentioned'] == 'No'])

involved_yes = len(stakeholder_data[stakeholder_data['stakeholders_involved'] == 'Yes'])
involved_no = len(stakeholder_data[stakeholder_data['stakeholders_involved'] == 'No'])

comparison_data.extend([
    {'category': 'Stakeholders Mentioned', 'response': 'Yes', 'count': mentioned_yes},
    {'category': 'Stakeholders Mentioned', 'response': 'No', 'count': mentioned_no},
    {'category': 'Stakeholders Involved', 'response': 'Yes', 'count': involved_yes},
    {'category': 'Stakeholders Involved', 'response': 'No', 'count': involved_no}
])

df_comparison = pd.DataFrame(comparison_data)

df_comparison['total'] = df_comparison.groupby('category')['count'].transform('sum')
df_comparison['percentage'] = (df_comparison['count'] / df_comparison['total'] * 100).round(1)

print(f"\nComparison Summary:")
print(f"Papers that mentioned stakeholders: {mentioned_yes} out of {len(stakeholder_data)} ({mentioned_yes/len(stakeholder_data)*100:.1f}%)")
print(f"Papers that involved stakeholders: {involved_yes} out of {len(stakeholder_data)} ({involved_yes/len(stakeholder_data)*100:.1f}%)")
print(f"Gap (mentioned but not involved): {mentioned_yes - involved_yes} papers")

response_colors = {
    'Yes':  '#1f77b4',
    'No': '#d62728'
}

fig4_contrast = alt.Chart(df_comparison).mark_bar().encode(
    x=alt.X('category:N',
            title='Stakeholder Engagement',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q',
            title='Number of Papers'),
    color=alt.Color('response:N',
                    title='Response',
                    scale=alt.Scale(domain=list(response_colors.keys()),
                                range=list(response_colors.values())),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=11,
                        padding=10
                    )),
    xOffset='response:N',
    tooltip=[
        alt.Tooltip('category:N', title='Category'),
        alt.Tooltip('response:N', title='Response'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=400,
    title='Gap Between Stakeholder Mention and Involvement'
)

text_labels = alt.Chart(df_comparison).mark_text(
    dy=-5,
    fontSize=11,
    fontWeight='bold'
).encode(
    x=alt.X('category:N'),
    y=alt.Y('count:Q'),
    text=alt.Text('count:Q'),
    xOffset='response:N',
    color=alt.value('black')
)

fig4_contrast = fig4_contrast + text_labels

fig4_contrast


Comparison Summary:
Papers that mentioned stakeholders: 15 out of 60 (25.0%)
Papers that involved stakeholders: 16 out of 60 (26.7%)
Gap (mentioned but not involved): -1 papers


In [11]:
fig3_data = df_clean[['prior_user_research', 'tool_evaluation', 'tool_integration']].dropna()
fig3_data['research_eval_combo'] = fig3_data['prior_user_research'] + ' Research, ' + fig3_data['tool_evaluation'] + ' Evaluation'

combination_data = []
for idx, row in fig3_data.iterrows():
    # Create a label for the combination
    combo_parts = []
    if row['prior_user_research'] == 'Yes':
        combo_parts.append('Research')
    if row['tool_evaluation'] == 'Yes':
        combo_parts.append('Evaluation')
    if row['tool_integration'] == 'Yes':
        combo_parts.append('Integration')
    
    if not combo_parts:
        combo_label = 'None'
    else:
        combo_label = ' + '.join(combo_parts)
    
    combination_data.append({
        'combination': combo_label,
        'count': 1,
        'has_research': row['prior_user_research'] == 'Yes',
        'has_evaluation': row['tool_evaluation'] == 'Yes',
        'has_integration': row['tool_integration'] == 'Yes'
    })

df_combo = pd.DataFrame(combination_data)
df_combo_agg = df_combo.groupby('combination').agg({
    'count': 'sum',
    'has_research': 'first',
    'has_evaluation': 'first',
    'has_integration': 'first'
}).reset_index()

# Sort by count
df_combo_agg = df_combo_agg.sort_values('count', ascending=True)

def get_color(row):
    aspects = sum([row['has_research'], row['has_evaluation'], row['has_integration']])
    if aspects == 0:
        return '#d3d3d3'
    elif aspects == 1:
        return '#9ecae1'
    elif aspects == 2:
        return '#4292c6'
    else:
        return '#084594'

df_combo_agg['color'] = df_combo_agg.apply(get_color, axis=1)
df_combo_agg['aspects_count'] = df_combo_agg.apply(lambda x: sum([x['has_research'], x['has_evaluation'], x['has_integration']]), axis=1)

fig3_option_a = alt.Chart(df_combo_agg).mark_bar().encode(
    y=alt.Y('combination:N', 
            title='Combination of Methods',
            sort=alt.EncodingSortField(field='count', order='ascending')),
    x=alt.X('count:Q', title='Number of Papers'),
    color=alt.Color('aspects_count:O', 
                    title='Number of Methods Used',
                    scale=alt.Scale(domain=[0, 1, 2, 3], 
                                  range=['#d3d3d3', '#9ecae1', '#4292c6', '#084594'])),
    tooltip=['combination:N', 'count:Q']
).properties(
    width=600,
    height=400,
    title='Research, Evaluation, and Integration Methods'
)

text = alt.Chart(df_combo_agg).mark_text(align='left', dx=3).encode(
    y=alt.Y('combination:N', sort=alt.EncodingSortField(field='count', order='ascending')),
    x='count:Q',
    text='count:Q'
)

fig3_option_a = fig3_option_a + text
fig3_option_a

In [12]:
combination_data = []
for idx, row in fig3_data.iterrows():
    combo_parts = []
    if row['prior_user_research'] == 'Yes':
        combo_parts.append('Research')
    if row['tool_evaluation'] == 'Yes':
        combo_parts.append('Evaluation')
    if row['tool_integration'] == 'Yes':
        combo_parts.append('Integration')

    if not combo_parts:
        combo_label = 'None'
    else:
        combo_label = ' + '.join(combo_parts)

    combination_data.append({
        'combination': combo_label,
        'count': 1,
        'has_research': row['prior_user_research'] == 'Yes',
        'has_evaluation': row['tool_evaluation'] == 'Yes',
        'has_integration': row['tool_integration'] == 'Yes'
    })

df_combo = pd.DataFrame(combination_data)
df_combo_agg = df_combo.groupby('combination').agg({
    'count': 'sum',
    'has_research': 'first',
    'has_evaluation': 'first',
    'has_integration': 'first'
}).reset_index()

# Sort by count (descending for vertical bars)
df_combo_agg = df_combo_agg.sort_values('count', ascending=False)

# Calculate number of aspects for each combination
df_combo_agg['aspects_count'] = df_combo_agg.apply(
    lambda x: sum([x['has_research'], x['has_evaluation'], x['has_integration']]),

    axis=1
)
aspect_colors = {
    0: '#d62728',
    1: '#ff7f0e',
    2: '#2ca02c',
    3: '#1f77b4'
}

fig3_option_a = alt.Chart(df_combo_agg).mark_bar().encode(
    x=alt.X('combination:N',
            title='Combination of Methods',
            sort=alt.EncodingSortField(field='count', order='descending'),
            axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count:Q',
            title='Number of Papers'),
    color=alt.Color('aspects_count:O',
                    title='Number of Methods Used',
                    scale=alt.Scale(
                        domain=[0, 1, 2, 3],
                        range=[aspect_colors[0], aspect_colors[1],
aspect_colors[2], aspect_colors[3]]
                    ),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=11
                    )),
    tooltip=[
        alt.Tooltip('combination:N', title='Method Combination'),
        alt.Tooltip('count:Q', title='Number of Papers'),
        alt.Tooltip('aspects_count:O', title='Number of Methods Used')
    ]
).properties(
    width=600,
    height=400,
    title='Research, Evaluation, and Integration Methods'
)

text = alt.Chart(df_combo_agg).mark_text(
    align='center',
    baseline='bottom',
    dy=-5,
    fontSize=12,
    fontWeight='bold'
).encode(
    x=alt.X('combination:N', sort=alt.EncodingSortField(field='count',
order='descending')),
    y=alt.Y('count:Q'),
    text='count:Q',
    color=alt.value('black')
)

fig3_option_a = fig3_option_a + text
fig3_option_a

In [13]:
separate_data = []

for aspect, column in [('Prior User Research', 'prior_user_research'), 
                       ('Tool Evaluation', 'tool_evaluation'), 
                       ('Tool Integration', 'tool_integration')]:
    yes_count = (fig3_data[column] == 'Yes').sum()
    no_count = (fig3_data[column] == 'No').sum()
    
    separate_data.append({
        'aspect': aspect,
        'response': 'Yes',
        'count': yes_count
    })
    separate_data.append({
        'aspect': aspect,
        'response': 'No',
        'count': no_count
    })

df_separate = pd.DataFrame(separate_data)

response_colors = {
    'Yes': '#1f77b4',
    'No': '#d62728'
}


fig3_option_b = alt.Chart(df_separate).mark_bar().encode(
    x=alt.X('aspect:N', 
            title='Method Type',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q', title='Number of Papers'),
    color=alt.Color('response:N', 
                    title='Used Method?',
                    scale=alt.Scale(domain=list(response_colors.keys()), 
                                  range=list(response_colors.values()))),
    xOffset='response:N',
    tooltip=['aspect:N', 'response:N', 'count:Q']
).properties(
    width=500,
    height=400,
    title='Adoption of Research, Evaluation, and Integration Methods'
)

df_separate_pct = df_separate.copy()
df_separate_pct['total'] = df_separate_pct.groupby('aspect')['count'].transform('sum')
df_separate_pct['percentage'] = (df_separate_pct['count'] / df_separate_pct['total'] * 100).round(1)
df_separate_pct['label'] = df_separate_pct['count'].astype(str) + ' (' + df_separate_pct['percentage'].astype(str) + '%)'

text = alt.Chart(df_separate_pct).mark_text(dy=-5).encode(
    x=alt.X('aspect:N', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q'),
    text='label:N',
    xOffset='response:N'
)

fig3_option_b = fig3_option_b + text
fig3_option_b

In [14]:

matrix_data = fig3_data.groupby(['prior_user_research', 'tool_evaluation',
'tool_integration']).size().reset_index(name='count')

base = alt.Chart(matrix_data).properties(
    width=250,
    height=250
)

# Create rectangle layer
rect = base.mark_rect().encode(
    x=alt.X('prior_user_research:N', title='Prior User Research'),
    y=alt.Y('tool_evaluation:N', title='Tool Evaluation'),
    color=alt.Color('count:Q',
                    title='Number of Papers',
                    scale=alt.Scale(scheme='viridis')),
    tooltip=['prior_user_research:N', 'tool_evaluation:N', 'tool_integration:N', 'count:Q']
)

# Create text layer
text = base.mark_text(color='white', fontSize=16, fontWeight='bold').encode(
    x=alt.X('prior_user_research:N'),
    y=alt.Y('tool_evaluation:N'),
    text='count:Q'
)

# Layer first, then facet
fig3_option_c = alt.layer(rect, text).facet(
    facet='tool_integration:N',
    columns=2,
    title='Tool Integration'
).resolve_scale(
    color='independent'
).properties(
    title={
        'text': 'Figure 3C: Research vs. Evaluation Matrix by Integration Status',
        'anchor': 'start'
    }
).configure_facet(
    spacing=20
).configure_view(
    stroke=None
)

fig3_option_c

In [15]:
tool_temporal_data = df_clean[['year', 'tool_description']].dropna()
print(f"Rows excluded due to missing data in tool temporal analysis: {len(df_clean) - 
len(tool_temporal_data)}")

# Aggregate data by year and tool category
temporal_tool_agg = tool_temporal_data.groupby(['year',
'tool_description']).size().reset_index(name='count')

tool_colors = {
    'framework': '#1f77b4',
    'app': '#ff7f0e',
    'datasheet': '#2ca02c',
    'questionnaire': '#d62728',
    'markup format': '#9467bd', 
    'toolkit': '#8c564b'
}

fig5_area = alt.Chart(temporal_tool_agg).mark_area().encode(
    x=alt.X('year:O',
            title='Publication Year',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q',
            title='Number of Tools',
            stack='zero'),
    color=alt.Color('tool_description:N',
                    title='Tool Category',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10,
                        padding=10
                    )),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count')
    ]
).properties(
    width=800,
    height=400,
    title='Figure 5A: Tool Categories Development Over Time (Stacked Area)'
)

# Alternative: Grouped bar chart
fig5_bars = alt.Chart(temporal_tool_agg).mark_bar().encode(
    x=alt.X('year:O',
            title='Publication Year',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q',
            title='Number of Tools'),
    color=alt.Color('tool_description:N',
                    title='Tool Category',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10,
                        padding=10
                    )),
    xOffset='tool_description:N',
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count')
    ]
).properties(
    width=800,
    height=400,
    title='Development of Tools Over Time'
)

# Alternative: Line chart showing trends
fig5_lines = alt.Chart(temporal_tool_agg).mark_line(
    point=alt.OverlayMarkDef(size=60)
).encode(
    x=alt.X('year:O',
            title='Publication Year',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q',
            title='Number of Tools'),
    color=alt.Color('tool_description:N',
                    title='Tool Category',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10,
                        padding=10
                    )),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count')
    ]
).properties(
    width=800,
    height=400,
    title='Figure 5C: Tool Categories Development Over Time (Line Trends)'
)

print("\n=== Tool Categories Over Time Analysis ===")
print("Choose your preferred visualization:")
print("- fig5_area: Stacked area chart showing cumulative development")
print("- fig5_bars: Grouped bar chart for direct comparison by year")
print("- fig5_lines: Line chart showing trend patterns")

fig5_bars

Rows excluded due to missing data in tool temporal analysis: 9

=== Tool Categories Over Time Analysis ===
Choose your preferred visualization:
- fig5_area: Stacked area chart showing cumulative development
- fig5_bars: Grouped bar chart for direct comparison by year
- fig5_lines: Line chart showing trend patterns


In [16]:
tool_dist_data = df_clean[['tool_description']].dropna()

# Calculate distribution
tool_counts = tool_dist_data['tool_description'].value_counts().reset_index()
tool_counts.columns = ['tool_description', 'count']

tool_counts['percentage'] = (tool_counts['count'] / tool_counts['count'].sum() * 100).round(1)
tool_counts['label'] = tool_counts['count'].astype(str) + ' (' + tool_counts['percentage'].astype(str) + '%)'

print(f"\nTool Distribution Summary:")
print(f"Total tools analyzed: {tool_counts['count'].sum()}")
print("\nBreakdown by category:")
for _, row in tool_counts.iterrows():
    print(f"- {row['tool_description']}: {row['count']} ({row['percentage']}%)")

tool_colors = {
    'framework': '#1f77b4',
    'app': '#ff7f0e',
    'datasheet': '#2ca02c',
    'questionnaire': '#d62728',
    'markup format': '#9467bd',
    'toolkit': '#8c564b',
    'recommendations': '#e377c2'
}

fig6_bars = alt.Chart(tool_counts).mark_bar().encode(
    x=alt.X('count:Q',
            title='Number of Tools'),
    y=alt.Y('tool_description:N',
            title='Tool Category',
            sort='-x'),
    color=alt.Color('tool_description:N',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=None),
    tooltip=[
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=400,
    title='Figure 6A: Distribution of Tool Types (Horizontal Bars)'
)

text_labels = alt.Chart(tool_counts).mark_text(
    align='left',
    dx=5,
    fontSize=12
).encode(
    x=alt.X('count:Q'),
    y=alt.Y('tool_description:N', sort='-x'),
    text='label:N',
    color=alt.value('black')
)

fig6_bars = fig6_bars + text_labels

# Pie/Donut chart
fig6_pie = alt.Chart(tool_counts).mark_arc(innerRadius=50).encode(
    theta=alt.Theta('count:Q', stack=True),
    color=alt.Color('tool_description:N',
                    title='Tool Category',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10,
                        padding=10
                    )),
    tooltip=[
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=500,
    height=400,
    title='Figure 6B: Distribution of Tool Types (Donut Chart)'
)

pie_text = alt.Chart(tool_counts).mark_text(radius=140, fontSize=12).encode(
    theta=alt.Theta('count:Q', stack=True),
    text='percentage:Q',
    color=alt.value('white')
)

fig6_pie = fig6_pie + pie_text

# Vertical bar chart with percentages
fig6_vertical = alt.Chart(tool_counts).mark_bar().encode(
    x=alt.X('tool_description:N',
            title='Tool Category',
            sort='-y',
            axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count:Q',
            title='Number of Tools'),
    color=alt.Color('tool_description:N',
                    scale=alt.Scale(domain=list(tool_colors.keys()),
                                range=list(tool_colors.values())),
                    legend=None),
    tooltip=[
        alt.Tooltip('tool_description:N', title='Tool Category'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=400,
    title='Distribution of Tool Types'
)

vertical_text = alt.Chart(tool_counts).mark_text(
    dy=-5,
    fontSize=11,
    fontWeight='bold'
).encode(
    x=alt.X('tool_description:N', sort='-y'),
    y=alt.Y('count:Q'),
    text='count:Q',
    color=alt.value('black')
)

vertical_pct = alt.Chart(tool_counts).mark_text(
    dy=-18,
    fontSize=10
).encode(
    x=alt.X('tool_description:N', sort='-y'),
    y=alt.Y('count:Q'),
    text=alt.Text('percentage:Q', format='.1f'),
    color=alt.value('gray')
)

fig6_vertical = fig6_vertical + vertical_text + vertical_pct

print("\n=== Tool Distribution Visualizations ===")
print("Choose your preferred visualization:")
print("- fig6_bars: Horizontal bar chart (best for readability)")
print("- fig6_pie: Donut chart (shows proportions clearly)")
print("- fig6_vertical: Vertical bar chart with count and percentage labels")

fig6_vertical


Tool Distribution Summary:
Total tools analyzed: 51

Breakdown by category:
- framework: 15 (29.4%)
- app: 13 (25.5%)
- datasheet: 8 (15.7%)
- questionnaire: 8 (15.7%)
- markup format: 5 (9.8%)
- toolkit: 2 (3.9%)

=== Tool Distribution Visualizations ===
Choose your preferred visualization:
- fig6_bars: Horizontal bar chart (best for readability)
- fig6_pie: Donut chart (shows proportions clearly)
- fig6_vertical: Vertical bar chart with count and percentage labels


In [17]:
audience_counts = df_audiences['audience'].value_counts().reset_index()
audience_counts.columns = ['audience', 'count']

# Calculate percentages
audience_counts['percentage'] = (audience_counts['count'] / audience_counts['count'].sum() * 100).round(1)
audience_counts['label'] = audience_counts['count'].astype(str) + ' (' + audience_counts['percentage'].astype(str) + '%)'

audience_color_mapping = {
    'dataset creators': '#1f77b4',
    'dataset users': '#ff7f0e',
    'data curators': '#2ca02c',
    'dataset practitioners': '#d62728',
    'data experts': '#9467bd',
    'dataset auditors': '#8c564b',
    'data practitioners': '#e377c2',
    'data organizations': '#7f7f7f',
    'data experts. dataset researchers': '#bcbd22',
    'dataset researchers': '#17becf'
}

audience_counts['color'] = audience_counts['audience'].map(audience_color_mapping)

# Horizontal bar chart 
fig7_bars = alt.Chart(audience_counts).mark_bar().encode(
    x=alt.X('count:Q',
            title='Number of Mentions'),
    y=alt.Y('audience:N',
            title='Target Audience',
            sort='-x'),
    color=alt.Color('audience:N',
                    scale=alt.Scale(
                        domain=list(audience_color_mapping.keys()),
                        range=list(audience_color_mapping.values())
                    ),
                    legend=None),  # No legend needed since audiences are on Y axis
    tooltip=[
        alt.Tooltip('audience:N', title='Audience Type'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=700,
    height=400,
    title='Target Audiences for Tools'
)

text_labels = alt.Chart(audience_counts).mark_text(
    align='left',
    dx=5,
    fontSize=11
).encode(
    x=alt.X('count:Q'),
    y=alt.Y('audience:N', sort='-x'),
    text='label:N',
    color=alt.value('black')
)

fig7_bars = fig7_bars + text_labels
fig7_bars

# Treemap visualization 
fig7_treemap = alt.Chart(audience_counts).mark_rect().encode(
    x=alt.X('percentage:Q',
            scale=alt.Scale(domain=[0, 100]),
            axis=None),
    y=alt.Y('audience:N',
            sort='-x',
            axis=alt.Axis(title='Target Audience', labelLimit=200)),
    color=alt.Color('percentage:Q',
                    scale=alt.Scale(scheme='blues'),
                    title='Percentage',
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10
                    )),
    tooltip=[
        alt.Tooltip('audience:N', title='Audience Type'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=400,
    title='Figure 7B: Proportion of Target Audiences (Heatmap Style)'
)

treemap_text = alt.Chart(audience_counts).mark_text(
    align='center',
    baseline='middle',
    fontSize=12,
    fontWeight='bold'
).encode(
    x=alt.X('percentage:Q', scale=alt.Scale(domain=[0, 100])),
    y=alt.Y('audience:N', sort='-x'),
    text='count:Q',
    color=alt.condition(
        alt.datum.percentage > 20,
        alt.value('white'),
        alt.value('black')
    )
)

fig7_treemap = fig7_treemap + treemap_text

# Bubble chart
import numpy as np
n_audiences = len(audience_counts)
angles = np.linspace(0, 2*np.pi, n_audiences, endpoint=False)
radius_scale = 5

audience_counts['x'] = np.cos(angles) * radius_scale
audience_counts['y'] = np.sin(angles) * radius_scale

fig7_bubble = alt.Chart(audience_counts).mark_circle().encode(
    x=alt.X('x:Q',
            scale=alt.Scale(domain=[-8, 8]),
            axis=None),
    y=alt.Y('y:Q',
            scale=alt.Scale(domain=[-8, 8]),
            axis=None),
    size=alt.Size('count:Q',
                scale=alt.Scale(range=[100, 3000]),
                title='Number of Mentions',
                legend=alt.Legend(
                    orient='bottom',
                    titleFontSize=12,
                    labelFontSize=10
                )),
    color=alt.Color('audience:N',
                    scale=alt.Scale(scheme='category20'),
                    title='Audience Type',
                    legend=alt.Legend(
                        orient='right',
                        titleFontSize=12,
                        labelFontSize=10,
                        columns=1
                    )),
    tooltip=[
        alt.Tooltip('audience:N', title='Audience Type'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=600,
    title='Figure 7C: Target Audiences Bubble Chart'
)

bubble_text = alt.Chart(audience_counts[audience_counts['count'] > 2]).mark_text(
    fontSize=10,
    fontWeight='bold'
).encode(
    x='x:Q',
    y='y:Q',
    text='audience:N',
    color=alt.value('black')
)

fig7_bubble = fig7_bubble + bubble_text

fig7_bars

In [18]:
audience_counts = df_audiences['audience'].value_counts().reset_index()
audience_counts.columns = ['audience', 'count']

audience_counts['percentage'] = (audience_counts['count'] /
audience_counts['count'].sum() * 100).round(1)

audience_color_mapping = {
    'dataset creators': '#1f77b4', 
    'dataset users': '#ff7f0e',  
    'data curators': '#2ca02c',
    'dataset practitioners': '#d62728',
    'data experts': '#9467bd',
    'dataset auditors': '#8c564b',
    'data practitioners': '#e377c2',
    'data organizations': '#7f7f7f',
    'data experts. dataset researchers': '#bcbd22',
    'dataset researchers': '#17becf'
}

audience_counts['color'] = audience_counts['audience'].map(audience_color_mapping)

fig7_bars = alt.Chart(audience_counts).mark_bar().encode(
    x=alt.X('audience:N',
            title='Target Audience',
            sort='-y',
            axis=alt.Axis(labelAngle=-45, labelLimit=200)),
    y=alt.Y('count:Q',
            title='Number of Mentions'),
    color=alt.Color('audience:N',
                    scale=alt.Scale(
                        domain=list(audience_color_mapping.keys()),
                        range=list(audience_color_mapping.values())
                    ),
                    legend=None),  # No legend needed since audiences are on X axis
    tooltip=[
        alt.Tooltip('audience:N', title='Audience Type'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=600,
    height=400,
    title='Target Audiences for Tools'
)

text_labels = alt.Chart(audience_counts).mark_text(
    align='center',
    baseline='bottom',
    dy=-5,
    fontSize=11,
    fontWeight='bold'
).encode(
    x=alt.X('audience:N', sort='-y'),
    y=alt.Y('count:Q'),
    text='count:Q',
    color=alt.value('black')
)

fig7_bars = fig7_bars + text_labels
fig7_bars