In [None]:
import json
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns

from utils import get_year, get_category

%matplotlib inline

# Switch on seaborn's default styling for the matplotlib figure drawn below.
sns.set()

In [None]:
# Parallel per-article lists: years[k] and categories[k] describe the same article.
years = []
categories = []

# The file is read line by line, each non-empty line being parsed as one JSON
# document. The encoding is pinned to UTF-8 so that parsing does not depend on
# the platform's default text encoding.
with open('data/arxiv.json', encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        data = json.loads(line)
        # get_year comes from utils; presumably it derives a year from the
        # record's version history -- confirm against utils.
        years.append(get_year(data["versions"]))
        # Stored as the raw string; it is split into individual categories later.
        categories.append(data["categories"])

print("Number of articles:", len(years))

In [None]:
# Build year_and_cat_table: year -> (category -> number of articles listing it).
# Years are ordered most recent first; inside each year the categories are
# ordered from the most to the least frequent.

# `years` and `categories` must be the parallel per-article lists produced by
# the loading cell. A later cell rebinds `years` to the short list of plotted
# years, so fail loudly up front if this cell is re-run out of order.
if len(years) != len(categories):
    raise ValueError(
        "years and categories differ in length "
        f"({len(years)} vs {len(categories)}); re-run the data loading cell first"
    )

year_and_cat_table = OrderedDict()
for y in sorted(set(years), reverse=True):
    year_and_cat_table[y] = {}

for y, article_categories in zip(years, categories):
    counts = year_and_cat_table[y]
    # split() with no argument ignores repeated, leading and trailing
    # whitespace, so an empty string is never counted as a category.
    for c in article_categories.split():
        counts[c] = counts.get(c, 0) + 1

# Re-order every per-year mapping by descending count. Only existing keys are
# reassigned, so the outer table can safely be iterated while doing so.
for y in year_and_cat_table:
    ranked = sorted(year_and_cat_table[y].items(), key=lambda x: x[1], reverse=True)
    year_and_cat_table[y] = OrderedDict(ranked)

In [None]:
# How many of the most recent years to show, and how many categories per year.
last_n_years = 5
top_n_categories = 5

# NOTE: `years` is rebound here. Before this cell it is the per-article list
# built by the loading cell; afterwards it is the short list of plotted years.
# Re-run the loading cell before re-running the counting cell.
years = []
cs = []    # bar labels: get_category(...) results, "" for a spacer
nums = []  # bar heights: article counts, 0 for a spacer

# year_and_cat_table iterates from the most recent year backwards, and each
# per-year mapping from the most to the least frequent category, so plain
# slicing selects the latest years and their top categories. Slicing also
# copes with fewer years/categories than requested.
recent_years = list(year_and_cat_table)[:last_n_years]
for i, y in enumerate(recent_years):
    if i > 0:
        # Empty, zero-height entry that leaves a gap between two year groups.
        # Inserted *between* groups so there is never a leading/trailing gap.
        cs.append("")
        nums.append(0)
    years.append(y)
    top_counts = list(year_and_cat_table[y].items())[:top_n_categories]
    for c, n in top_counts:
        # get_category comes from utils; its result is used as the bar label.
        cs.append(get_category(c))
        nums.append(n)

# Everything was collected newest-first; flip it so that consumers indexing
# these lists from 0 see the oldest year first and, within a year, the
# smallest of the selected counts first.
years.reverse()
cs.reverse()
nums.reverse()

In [None]:
def _assign_styles(labels, styles, blank_style):
    """Give every distinct label a style and return the per-label styles.

    Non-empty labels receive entries of ``styles`` in order of first
    appearance; once ``styles`` is exhausted it is reused from the start
    instead of raising. Empty labels (the spacer entries) always receive
    ``blank_style`` and do not consume an entry of ``styles``.

    Args:
        labels: sequence of label strings, possibly with repeats and "".
        styles: non-empty list of styles to hand out.
        blank_style: style used for empty labels.

    Returns:
        (mapping, assigned): ``mapping`` is label -> style, ``assigned`` is the
        list of styles aligned with ``labels``.
    """
    mapping = {}
    assigned = []
    n_named = 0  # number of non-empty labels that already own a style
    for label in labels:
        if label not in mapping:
            if len(label) > 0:
                mapping[label] = styles[n_named % len(styles)]
                n_named += 1
            else:
                mapping[label] = blank_style
        assigned.append(mapping[label])
    return mapping, assigned


cs_colors = [
    "#95B8D1",
    "#68A37B",
    "#E66A6C",
    "#F1D302",
    "#EDAFB8",
    "#FA9F42",
    "#DBF4A7",
    "#72CF53",
    ]
# One colour per distinct label, so a category keeps its colour across years.
cmap, colors = _assign_styles(cs, cs_colors, "#000000")

# Set to True to additionally distinguish categories by hatch pattern.
use_textures = False

if use_textures:
    cs_textures = [ "|" , "/" , "+" , "-", ".", "*","x", "O" ]
    texture_map, textures = _assign_styles(cs, cs_textures, "")
else:
    # No hatching: one empty pattern per bar.
    textures = ["" for _ in cs]

In [None]:
# Bar chart of the selected categories: one bar per entry of cs/nums, grouped
# by year, with the count above every bar and the year above every group.
fig, ax = plt.subplots(figsize=(25, 5))
title = f"Most active research fields in the last {last_n_years} years"
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_ylabel("Number of papers on arXiv", fontsize=20)

# Bars are drawn one at a time so each can carry its own colour and hatch.
bars = []
for i in range(len(cs)):
    bar = ax.bar(
        i,
        nums[i],
        width=0.7,
        color=colors[i],
        hatch=textures[i],
        edgecolor="black"
    )
    bars.append(bar[0])

# Tick styling is passed together with the labels, so it is attached to the
# labels that are actually drawn.
ax.set_xticks(range(len(cs)))
ax.set_xticklabels(cs, fontsize=14, rotation=45, ha="right")
ax.tick_params(axis="y", labelsize=14)
# Head-room (in paper counts) above the tallest bar for the value labels.
ax.set_ylim(0, max(nums, default=0) + 2000)

# Value label on top of every real bar; spacer entries have a count of 0.
for i in range(len(bars)):
    if nums[i] > 0:
        x = bars[i].get_x() + bars[i].get_width() / 2.0
        y = bars[i].get_height()
        ax.text(x, y, nums[i], ha='center', va='bottom', fontsize=14)

# Year label above the middle bar of every group. A group occupies
# top_n_categories bars followed by one spacer, so both the offset of the
# middle bar and the stride follow from top_n_categories.
group_stride = top_n_categories + 1
middle_offset = top_n_categories // 2
for year, i in zip(years, range(middle_offset, len(bars), group_stride)):
    x = bars[i].get_x() + bars[i].get_width() / 2.0
    # Lifted above the bar's own value label (offset is in paper counts).
    y = bars[i].get_height() + 3000
    ax.text(x, y, year, ha='center', va='bottom', fontsize=32)