In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import json
import numpy as np
from matplotlib.ticker import PercentFormatter
import matplotlib.ticker as ticker
from matplotlib.patches import Rectangle
from matplotlib.ticker import MaxNLocator

plt.rcParams['font.size'] = 20
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Helvetica Neue']
plt.rcParams['font.weight'] = 'light'
plt.rcParams['font.stretch'] = 'condensed'
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['axes.linewidth'] = 2


In [101]:
## Helpers ##

def track_to_pretty_name(track):
    if track == "Phonology, Morphology and Word Segmentation":
        return "Phonology, Morphology and\nWord Segmentation"
    if track == "Linguistic Theories and Psycholinguistics":
        return "Linguistic Theories and\nPsycholinguistics"
    if track == "Machine Translation and Multilinguality":
        return "Machine Translation and\nMultilinguality"
    if track == "Multimodality, Speech and Grounding":
        return "Multimodality, Speech and\nGrounding"


    
    return track


track_to_color = {
    'Wildcard': 'gray',
    'Question Answering': '#ffd92f',
    "Dialogue": "#66c2a5",
    "Resources and Evaluation": "#e78ac3",
    "Multimodality, Speech and Grounding": "#fc8d62",
    "Information Extraction/Retrieval": "#8da0cb",
    "Interpretability and Analysis": "#FF0000",
    "Machine Translation and Multilinguality": "#a6d854",
    "Question Answering": "#ffd92f",
    "Semantics": "#e5c494",
    "Syntax": "#b3b3b3",
    "Applications": "#b15928",
    "Machine Learning": "#980e59",
    "Sentiment Analysis": "#ff7f00",
    "Generation": "#6a3d9a",
    "Other": "#999999",
    "Large Language Models": "#33a02c",
    "Summarization": "#33b02c",
    "Industry": "#11b02d",
    "Social Science": "#f3d121",
    "Theme": "#2618b4",
    "Ethics": "#1f77b4",
    "Linguistic Theories and Psycholinguistics": "#aec7e8",
    "Efficient Methods": "#ffbb78",
    "Discourse and Pragmatics": "#2ca02c",
    "Phonology, Morphology and Word Segmentation": "#98df8a",
    "Commonsense Reasoning": "#d62728",
    "Human-Centered NLP": "#ff9896",
    "Unsupervised and Weakly-Supervised Methods in NLP": "#9467bd",
    "Theory and Formalism in NLP": "#c5b0d5",
}

track_to_marker = {
    'Interpretability and Analysis': 's',         
    'Wildcard': 'o',                             
    "Dialogue": "H",                               
    "Resources and Evaluation": "X",               
    "Multimodality, Speech and Grounding": "D",   
    "Information Extraction/Retrieval": 10,  
    "Machine Translation and Multilinguality": "^", 
    "Question Answering": "v", 
    "Semantics": "*",     
    "Syntax": "h",        
    "Applications": "p",  
    "Machine Learning": "<",
    "Sentiment Analysis": "d",      
    "Generation": 6,          
    "Other": 'o',
    "Large Language Models": ">",
    "Summarization": ">",        
    "Industry": ">",
    "Social Science": ">",
    "Theme": "^",        
    "Ethics": "v",       
    "Linguistic Theories and Psycholinguistics": "o",
    "Efficient Methods": "s",     
    "Discourse and Pragmatics": "D",
    "Phonology, Morphology and Word Segmentation": "+",
    "Commonsense Reasoning": "x",                 
    "Human-Centered NLP": "h",                    
    "Unsupervised and Weakly-Supervised Methods in NLP": "*",
    "Theory and Formalism in NLP": "p",
}

track_to_hatches = {
    'Wildcard': '///',
    "Dialogue": "++",
    "Resources and Evaluation": "xx",
    "Multimodality, Speech and Grounding": "--",
    "Information Extraction/Retrieval": "oo",
    "Interpretability and Analysis": "",
    "Machine Translation and Multilinguality": "//",
    "Question Answering": "|",
    "Semantics": "++",
    "Syntax": "xx",
    "Applications": "--",
    "Machine Learning": "..",
    "Sentiment Analysis": "oo",
    "Generation": "///",
    "Other": "|||",
    "Large Language Models": "++",
    "Summarization": "/o",
    "Industry": "--",
    "Social Science": "..",
    "Theme": "oo",
    "Ethics": "//",
    "Linguistic Theories and Psycholinguistics": "||",
    "Efficient Methods": "++",
    "Discourse and Pragmatics": "\\|",
    "Phonology, Morphology and Word Segmentation": "--",
    "Commonsense Reasoning": "..",
    "Human-Centered NLP": "oo",
    "Unsupervised and Weakly-Supervised Methods in NLP": "///",
    "Theory and Formalism in NLP": "|||"
}


In [135]:
## Number of papers per year in each track ##

df = pd.read_csv('count_by_track_per_year.csv')
print(df)
track_counts = df.drop(columns=['year']).sum()

top_5_tracks = track_counts.nlargest(5)

print(top_5_tracks)
df_melted = df.melt(id_vars='year', var_name='Track', value_name='Count')
df_melted['Year'] = df_melted['year'].astype(str)
plt.figure(figsize=(10, 4))

palette = {track: 'gray' for track in df_melted['Track'].unique()}
palette['Interpretability and Analysis'] = 'white'
ia_data = df_melted[df_melted['Track'] == 'Interpretability and Analysis']

ia_data = ia_data.sort_values(by='Year').reset_index(drop=True)

sns.boxplot(data=df_melted, x='Year', y='Count', color='white', width=0.5, showcaps=False, boxprops={'facecolor':'None'}, showfliers=False, whiskerprops={'linewidth':0},  legend=False)

sns.swarmplot(data=df_melted, x='Year', y='Count', hue='Track', palette=palette, size=6, marker='o', edgecolor='k', dodge=False,  legend=False)
plt.plot(ia_data['Year'], ia_data['Count'], color='red', linestyle='-', linewidth=1, marker='o', zorder=3, label='IA')
plt.legend(loc='upper left', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(False)
#plt.legend(['IA'], ['red'], frameon=True)

plt.xlabel('Year', fontsize=18)
plt.ylabel('Number of Papers', fontsize=18)
plt.tight_layout()
with PdfPages('n-papers-per-year.pdf') as pdf:
    pdf.savefig()
    plt.close()
    
plt.show()

   year  Applications  Dialogue  Discourse and Pragmatics  Generation  \
0  2020         115.0     125.0                      25.0       103.0   
1  2021         111.0     126.0                      22.0        74.0   
2  2022         112.0      94.0                      13.0        76.0   
3  2023         178.0     142.0                      24.0       108.0   

   Information Extraction/Retrieval  Interpretability and Analysis  \
0                             159.0                           90.0   
1                             184.0                           85.0   
2                             135.0                          142.0   
3                             196.0                          160.0   

   Linguistic Theories and Psycholinguistics  Machine Learning  \
0                                       25.0             147.0   
1                                       18.0             146.0   
2                                       33.0             131.0   
3                  

In [84]:
## CSI scores ##

with open('csi_by_track_year.json', 'r') as file:
    csi_by_track_year = json.load(file)


years = sorted(csi_by_track_year.keys())
all_tracks = sorted({track for year in years for track in csi_by_track_year[year].keys()})

avg_csi = {track: np.nanmean([csi_by_track_year[year].get(track, np.nan) for year in years]) for track in all_tracks}
sorted_tracks = sorted(all_tracks, key=lambda track: avg_csi[track], reverse=True)

z = np.array([
    [csi_by_track_year[year].get(track, np.nan) for track in sorted_tracks]
    for year in years
])

annotation_text = [
    [f'{csi_by_track_year[year].get(track, np.nan) * 100:.1f}%' if track in csi_by_track_year[year] else 'N/A' for track in sorted_tracks]
    for year in years
]

fig, ax = plt.subplots(figsize=(25, 9))
cax = ax.matshow(z, cmap='plasma', vmin=np.nanmin(z), vmax=np.nanmax(z))

ax.set_xticks(np.arange(len(sorted_tracks)))
ax.set_xticklabels([track_to_pretty_name(track) for track in sorted_tracks], rotation=45, ha='left')
ax.set_yticks(np.arange(len(years)))
ax.set_yticklabels(years)

threshold = 0.5
for i in range(len(years)):
    for j in range(len(sorted_tracks)):
        text = annotation_text[i][j]
        if z[i, j] < threshold:
            color = 'white'
        else:
            color = 'black'
        ax.text(j, i, text, ha='center', va='center', color=color)
        rect = Rectangle((j - 0.5, i - 0.5), 1, 1, fill=False, edgecolor='black', linewidth=1.5)
        ax.add_patch(rect)

plt.tight_layout()
with PdfPages('csi-scores.pdf') as pdf:
    pdf.savefig()
    plt.close()
plt.show()

In [109]:
## Citations coming from Interp vs Non-Interp ##

df = pd.read_csv('interp_vs_non_interp_citations.csv', index_col=0)
ia_label = 'from IA'
non_ia_label = 'from non IA'
type_rename_dict = {'Interp': ia_label, 'Non-Interp': non_ia_label}
df['Type'] = df['Type'].map(type_rename_dict)
pivot_df = df.pivot(index='Year', columns='Type', values='Citations')

fig, ax = plt.subplots(figsize=(9, 6))
pivot_df[ia_label].plot(
    kind='line', marker=track_to_marker['Interpretability and Analysis'], ax=ax,
    color=track_to_color['Interpretability and Analysis'], label=ia_label, linewidth=3, markersize=10)
pivot_df[non_ia_label].plot(kind='line', marker=track_to_marker['Wildcard'],
                                     ax=ax, color=track_to_color['Wildcard'], label=non_ia_label, linewidth=3,  markersize=10)

plt.xticks(fontsize=24)
plt.yticks(fontsize=18)
plt.ylabel('Citations', fontsize=24)
plt.legend(fontsize=24)

ax.set_xticks(pivot_df.index)
ax.set_xticks(pivot_df.index, minor=True)
ax.grid(which='both',linewidth=0.5)

with PdfPages('citations-interp-vs-non-interp.pdf') as pdf:
    pdf.savefig()
    plt.close()
plt.show()

In [72]:
## Intra-track citations ##

with open('intra_track_citations_by_track.json', 'r') as file:
    track_citations = json.load(file)

data = []
for track, years in track_citations.items():
    years_list = sorted(years.keys())
    positive_ratios = [
        years[year]['positive'] / (years[year]['positive'] + years[year]['negative']) 
        if (years[year]['positive'] + years[year]['negative']) > 0 else 0
        for year in years_list
    ]
    for year, ratio in zip(years_list, positive_ratios):
        data.append({'Year': year, 'Ratio': ratio, 'Track': track})

df = pd.DataFrame(data)

unique_tracks = df['Track'].unique()
colors = [track_to_color[track] for track in unique_tracks]

plt.figure(figsize=(8, 5))
for track in [track for track in unique_tracks if track != 'Interpretability and Analysis'] + ['Interpretability and Analysis']:
    track_data = df[df['Track'] == track]
    marker = track_to_marker[track]
    sns.lineplot(data=track_data, x='Year', y='Ratio', hue='Track', palette=[track_to_color[track]], marker=marker, markersize=10, legend='full')

plt.xlabel('Year', fontsize=16)
plt.ylabel('Ratio of Intra-track Citations', fontsize=16, labelpad=1)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1, decimals=0))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
plt.subplots_adjust(left=0.15, right=0.2) 

handles, labels = plt.gca().get_legend_handles_labels()
pretty_labels = [track_to_pretty_name(label) for label in labels]
plt.legend(handles, pretty_labels, fontsize=14, title_fontsize=16, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()

with PdfPages('intra-track-citations.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()

In [73]:
## Citation intents distribution ##

with open('centralities_per_track.json', 'r') as file:
    centralities_per_track = json.load(file)

data = []
for track, track_centralities in centralities_per_track.items():
    if len(track_centralities) < 30:
        continue
    if track == 'Theme':
        continue
    for centrality in track_centralities:
        data.append({'Track': track, 'Centralities': centrality})

df = pd.DataFrame(data)

medians = df.groupby('Track')['Centralities'].median().reset_index(name='Median')
percentile_75 = df.groupby('Track')['Centralities'].quantile(0.75).reset_index(name='75th Percentile')

stats = pd.merge(medians, percentile_75, on='Track')

stats = stats.sort_values(by=['Median', '75th Percentile'], ascending=False)
sorted_tracks = stats['Track']

df['Track'] = pd.Categorical(df['Track'], categories=sorted_tracks, ordered=True)
df = df.sort_values('Track')

sns.set(style="whitegrid")

palette = {track: track_to_color['Wildcard'] for track in sorted_tracks}
palette['Interpretability and Analysis'] = track_to_color['Interpretability and Analysis']

plt.figure(figsize=(12, 7))
box_plot = sns.boxplot(y='Track', x='Centralities', data=df, palette=palette)

plt.xscale('log')

plt.yticks(rotation=0, fontsize=14)
plt.xticks(fontsize=14)

plt.xlabel('Centralities', fontsize=16)
plt.ylabel('')
plt.xlim(10**-8, 10**-2)


with PdfPages('centralities.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  box_plot = sns.boxplot(y='Track', x='Centralities', data=df, palette=palette)


In [74]:
df = pd.read_csv('intents.csv')

colors = {
    'Questionnaire\nPapers': 'blue',
    'Most Cited\nI/A Papers': 'green',
    'Most Cited\nPapers': 'red'
}

df['Source'] = df['Source'].replace({
    'Survey': 'Questionnaire\nPapers',
    'Top_50': 'Most Cited\nPapers',
    'Interpretability': 'Most Cited\nI/A Papers'
})

title_fontsize = 20
label_fontsize = 16
tick_fontsize = 16
legend_fontsize = 14

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

sns.boxplot(ax=axes[0], data=df[df['Intent'] == 'Background'], x='Source', y='Value', hue='Source', palette=colors, fliersize=0)
sns.stripplot(ax=axes[0], data=df[df['Intent'] == 'Background'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)
axes[0].set_title('Background Information', fontsize=title_fontsize)
axes[0].set_ylim(-0.00, 1)
axes[0].yaxis.set_major_formatter(PercentFormatter(1))
axes[0].set_xlabel('', fontsize=label_fontsize)
axes[0].set_ylabel('', fontsize=label_fontsize)
axes[0].tick_params(axis='both', which='major', labelsize=tick_fontsize)

sns.boxplot(ax=axes[1], data=df[df['Intent'] == 'Methodology'], x='Source', y='Value', hue='Source', palette=colors, fliersize=0)
sns.stripplot(ax=axes[1], data=df[df['Intent'] == 'Methodology'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)
axes[1].set_title('Use of Methods', fontsize=title_fontsize)
axes[1].set_ylim(-0.00, 0.80)
axes[1].yaxis.set_major_formatter(PercentFormatter(1))
axes[1].set_xlabel('', fontsize=label_fontsize)
axes[1].set_ylabel('', fontsize=label_fontsize)
axes[1].tick_params(axis='both', which='major', labelsize=tick_fontsize)

sns.boxplot(ax=axes[2], data=df[df['Intent'] == 'Results'], x='Source', y='Value', hue='Source', palette=colors, fliersize=0)
sns.stripplot(ax=axes[2], data=df[df['Intent'] == 'Results'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)
axes[2].set_title('Comparing Results', fontsize=title_fontsize)
axes[2].set_ylim(-0.00, 0.20)
axes[2].yaxis.set_major_formatter(PercentFormatter(1, decimals=0))
axes[2].set_yticks([i/100 for i in range(0, 21, 5)])  # Set ticks to 0, 5, 10, 15, 20%
axes[2].set_xlabel('', fontsize=label_fontsize)
axes[2].set_ylabel('', fontsize=label_fontsize)
axes[2].tick_params(axis='both', which='major', labelsize=tick_fontsize)

plt.tight_layout(rect=[0, 0, 1, 0.96])

handles, labels = axes[0].get_legend_handles_labels()
#fig.legend(handles[:len(colors)], labels[:len(colors)], loc='upper center', ncol=3, fontsize=legend_fontsize)
with PdfPages('intents.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()

  sns.stripplot(ax=axes[0], data=df[df['Intent'] == 'Background'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)
  sns.stripplot(ax=axes[1], data=df[df['Intent'] == 'Methodology'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)
  sns.stripplot(ax=axes[2], data=df[df['Intent'] == 'Results'], x='Source', y='Value', hue='Source', palette=colors, jitter=True, dodge=False, linewidth=1, edgecolor='gray', size=6)


In [75]:
## Citation Count vs Centralities ##
df = pd.read_csv('citations_vs_centralities.csv', index_col=0)
df['Centrality Values'] = df['Centrality Values'] * 1000
df['Citation Counts'] = df['Citation Counts'] / 1000

plt.figure(figsize=(6, 6))
sns.scatterplot(x='Citation Counts', y='Centrality Values', data=df, s=100)
plt.xlabel('Citation Counts ($10^{3}$)', fontsize=16)
plt.ylabel('Centrality ($10^{-3}$)', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
with PdfPages('centralities-vs-citations.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()

In [160]:
## Percentage of Citations vs Percentage of Papers ##

df = pd.read_csv('percentage_citations_and_papers_by_track.csv', index_col=0)
df['source_year'] = df['source'] + ' ' + df['year'].astype(str)

def plot_percentages(df, track, ax):
    data = df[df['track'] == track]

    bar_width = 0.4
    indices = range(len(data['source_year']))
    offset = 0.2

    paper_bars = ax.bar([i + offset for i in indices], data['paper_percentage'], label='Papers', color='#ff7f0e', width=bar_width, hatch='//')
    citation_bars = ax.bar([i - offset for i in indices], data['citation_percentage'], label='Citations', color='#1f77b4', width=bar_width)

    ax.set_title(track, fontsize=20)
    ax.set_ylim(0, 25)
    ax.set_xticks(indices)
    ax.set_xticklabels(data['source_year'], rotation=-90, fontsize=16)
    ax.set_yticklabels([f'{int(label)}%' for label in ax.get_yticks()], fontsize=16)
    return citation_bars, paper_bars

tracks = [
    'Interpretability and Analysis',
    'Information Extraction/Retrieval', # its the track with the most papers
    'Machine Learning' # its the track with the most citations
]
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

citation_bars = None
paper_bars = None
for i, track in enumerate(tracks):
    citation_bars, paper_bars = plot_percentages(df, track, axes[i])

fig.legend(handles=[citation_bars, paper_bars], labels=['Citations', 'Papers'], loc='upper center', bbox_to_anchor=(0.5, 0.05), ncol=2, fontsize=18)

plt.tight_layout()
plt.subplots_adjust(bottom=0.3)
with PdfPages('citations-vs-number-percentages.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()

  ax.set_yticklabels([f'{int(label)}%' for label in ax.get_yticks()], fontsize=16)
  ax.set_yticklabels([f'{int(label)}%' for label in ax.get_yticks()], fontsize=16)
  ax.set_yticklabels([f'{int(label)}%' for label in ax.get_yticks()], fontsize=16)


In [77]:
## References to IA ##

with open('percentages_of_references_to_interp.json', 'r') as file:
    percentages_of_references_to_interp = json.load(file)

data = []
for track, values in percentages_of_references_to_interp.items():
    if len(values) < 30:
        # it would be misleading to plot tracks with not many papers imo
        continue
    for value in values:
        if track != 'null' and track != 'Theme':
            data.append({'Track': track, 'Value': value / 100})

df = pd.DataFrame(data)
medians = df.groupby('Track')['Value'].median().sort_values(ascending=True)
palette = {track: track_to_color[track] for track in medians.index}

pretty_labels = [track_to_pretty_name(track) for track in medians.index]

plt.figure(figsize=(25, 8))
sns.boxplot(x='Track', y='Value', data=df, order=medians.index, palette=palette)

plt.gca().set_xticklabels(pretty_labels)

plt.gca().xaxis.tick_top()
plt.xlabel('')
plt.ylabel('Percentage of references to IA', fontsize=22)
plt.xticks(rotation=45, ha='left', fontsize=18)
plt.yticks(fontsize=16)
plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
plt.tight_layout()

with PdfPages('references-to-ia.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()

plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Track', y='Value', data=df, order=medians.index, palette=palette)
  plt.gca().set_xticklabels(pretty_labels)


In [78]:
## References to IA ##

with open('citation_intents_distributions.json', 'r') as file:
    citation_intents_distributions = json.load(file)

data_to_plot = []
tracks = []
matrics = []

for track, metrics in citation_intents_distributions.items():
    for metric, values in metrics.items():
        if metric != 'background':
            continue
        if track == 'Theme':
            continue
        if len(values) < 20:
            continue
        data_to_plot.extend(values)
        tracks.extend([track] * len(values))
        matrics.extend([metric] * len(values))

df = pd.DataFrame({
    'Track': tracks,
    'Metric': matrics,
    'Percentage': data_to_plot
})

median_values = df.groupby(['Track', 'Metric'])['Percentage'].median().reset_index()
median_values = median_values.rename(columns={'Percentage': 'Median'})
df = pd.merge(df, median_values, on=['Track', 'Metric'])

df = df.sort_values(by='Median', ascending=False).reset_index(drop=True)

plt.figure(figsize=(22, 8))
order = df['Track'].unique()
palette = {track: track_to_color['Wildcard'] for track in sorted_tracks}
palette['Interpretability and Analysis'] = track_to_color['Interpretability and Analysis']

sns.boxplot(x='Track', y='Percentage', data=df[df['Track'] != 'Interpretability and Analysis'], order=order, palette=palette)
sns.boxplot(x='Track', y='Percentage', data=df[df['Track'] == 'Interpretability and Analysis'], 
            order=order, palette=[track_to_color['Interpretability and Analysis']], fliersize=0)


sns.stripplot(x='Track', y='Percentage', data=df[df['Track'] == 'Interpretability and Analysis'], 
              order=order, color=track_to_color['Interpretability and Analysis'], jitter=True, size=4, linewidth=1, edgecolor='gray')

plt.gca().xaxis.tick_top()
plt.xticks(ticks=range(len(df['Track'].unique())), 
           labels=[track_to_pretty_name(track) for track in df['Track'].unique()], 
           rotation=45, ha='left', fontsize=18)
plt.yticks(fontsize=16)
plt.xlabel('')
plt.ylabel('Percentage of background citations', fontsize=20)
plt.ylim(0, 1)
plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))

plt.tight_layout()
with PdfPages('background-intents.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Track', y='Percentage', data=df[df['Track'] != 'Interpretability and Analysis'], order=order, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Track', y='Percentage', data=df[df['Track'] == 'Interpretability and Analysis'],
The palette list has fewer values (1) than needed (21) and will cycle, which may produce an uninterpretable plot.
  sns.boxplot(x='Track', y='Percentage', data=df[df['Track'] == 'Interpretability and Analysis'],
  sns.stripplot(x='Track', y='Percentage', data=df[df['Track'] == 'Interpretability and Analysis'],


In [79]:
df = pd.read_csv('growth_rate_by_track.csv', index_col=0)
df = df.sort_values(by='0', ascending=False)

pretty_labels = [track_to_pretty_name(track) for track in df.index]
plt.figure(figsize=(10, 16))
palette = {track: track_to_color['Wildcard'] for track in pretty_labels}
palette['Interpretability and Analysis'] = track_to_color['Interpretability and Analysis']

sns.barplot(x=df['0'], y=pretty_labels, palette=palette)

plt.xlabel('Growth Percentage', fontsize=30)
plt.yticks(fontsize=26)
plt.xticks(fontsize=20)
plt.ylabel('')
plt.grid(axis='x', linestyle='--', linewidth=1.5, color='black')
plt.axvline(x=0, color='black', linewidth=1.5)

plt.subplots_adjust(left=0.2, top=0.9)

with PdfPages('growth-percentages.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=df['0'], y=pretty_labels, palette=palette)


In [110]:
## Citations proportion through time ##

df = pd.read_csv('citations_year_by_track.csv', index_col=0)


percentage_data = df.div(df.sum(axis=1), axis=0) * 100
normalized_data = percentage_data.div(percentage_data.sum(axis=1), axis=0) * 100
tracks = normalized_data.columns

sorted_tracks = sorted(tracks)
sorted_tracks.remove('Interpretability and Analysis')
sorted_tracks.insert(0, 'Interpretability and Analysis')

x = normalized_data.index
y = [normalized_data[track].values for track in sorted_tracks]
colors = [track_to_color[track] for track in sorted_tracks]
labels = [track_to_pretty_name(track) for track in sorted_tracks]
hatches = [track_to_hatches[track] for track in sorted_tracks]

fig, ax = plt.subplots(figsize=(10, 8))

stack = ax.stackplot(x, y, labels=labels, colors=colors, alpha=0.7)
for patch, hatch in zip(stack, hatches):
    patch.set_hatch(hatch)
    
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_ylabel('Percentage of Citations', fontsize=20)
ax.set_ylim(0, 100)
ax.set_xlim(2020, 2023)
plt.yticks(fontsize=18)
plt.xticks(fontsize=18)

handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], legend_labels[::-1], loc='upper left', bbox_to_anchor=(1.05, 1.05), frameon=False, fontsize=18)

ax.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
with PdfPages('citation-areas.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
plt.show()

In [159]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('citations_track_year.csv', index_col=0)

track_totals = df.groupby('Track')['Count'].sum().sort_values(ascending=False)

top_tracks = track_totals.head(5).index #+ ['Interpretability and Analysis']
top_tracks = list(top_tracks) + ['Interpretability and Analysis']
# next is top 5 biggest tracks
#top_tracks = ['Information Extraction/Retrieval', 'Machine Translation and Multilinguality', 'Machine Learning', 'Applications', 'Dialogue','Interpretability and Analysis']
top_tracks = ['Information Extraction/Retrieval', 'Machine Translation and Multilinguality', 'Applications', 'Dialogue','Interpretability and Analysis', 'Semantics']
print(top_tracks)
filtered_df = df[df['Track'].isin(top_tracks)]

plt.figure(figsize=(8, 3))

for track in filtered_df['Track'].unique():
    track_data = filtered_df[filtered_df['Track'] == track]
    plt.plot(track_data.index, track_data['Count'],color=track_to_color[track], marker=track_to_marker[track], label=track_to_pretty_name(track), markersize=10)

plt.ylabel('Citations', fontsize=15)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)
plt.legend(bbox_to_anchor=(1.00, 1), loc='upper left', fontsize=15)
plt.grid(True)

plt.xlim(right=2023, left=2020)
plt.xticks(range(int(filtered_df.index.min()), 2024))
plt.tight_layout()
with PdfPages('citations-tracks.pdf') as pdf:
    pdf.savefig(bbox_inches='tight')
    plt.close()
# Show plot
plt.show()


['Information Extraction/Retrieval', 'Machine Translation and Multilinguality', 'Applications', 'Dialogue', 'Interpretability and Analysis', 'Semantics']
