## Setup

In [None]:
import numpy as np
import pandas as pd
import plotnine as pn

from collections import Counter

from sciterra.mapping.atlas import Atlas

In [None]:
# Directory handed to Atlas.load below. The commented-out lines are
# alternative inputs kept so they can be switched back in quickly.
# bibtex_fp = "hafenLowredshiftLymanLimit2017.bib"
# atlas_dir = "outputs/atlas_s2/"
# atlas_dir = "outputs/atlas_s2-7-19-23"
# atlas_dir = "outputs/atlas_s2-7-21-23_phil/"
atlas_dir = "outputs/atlas_s2-7-20-23_uncentered_imeletal/"

In [None]:
# Load the Atlas stored at `atlas_dir`.
atl = Atlas.load(atlas_dir)

In [None]:
# Size of the loaded atlas (presumably its number of publications --
# confirm against Atlas.__len__).
len(atl)

## Exploratory data analysis

In [None]:
# Can we get information by year? Set up the per-year accumulators.
years = {}  # year -> Counter of words
papers_per_year = {}  # year -> number of publications seen
words = Counter()

def get_binary_occurences(abstract: str):
    """Return a Counter marking each distinct word of an abstract with 1.

    The abstract is split on whitespace with ``str.split()``; no case
    folding or punctuation stripping is applied, so ``"Word"``, ``"word"``
    and ``"word."`` are distinct keys.

    Args:
        abstract: Text of the abstract. ``None`` or an empty string is
            tolerated and yields an empty Counter instead of raising.

    Returns:
        A ``Counter`` whose keys are the distinct tokens, in order of first
        appearance, each with a count of exactly 1.
    """
    if not abstract:
        return Counter()
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    return Counter(dict.fromkeys(abstract.split(), 1))

# Tally, for each publication year, how many papers mention each word and how
# many papers were seen. The loop variable is `pub_id` rather than `id` so the
# builtin `id` is not shadowed at module scope.
for pub_id in atl.publications:
    pub = atl[pub_id]
    # TODO: Atlas should offer a 'filter if None' option that removes all
    # publications / embeddings for which certain attributes are None.
    if pub.publication_date is None:
        continue
    year = pub.publication_date.year
    if year not in years:
        years[year] = Counter()
        papers_per_year[year] = 0
    # Each word counts at most once per paper, so the tally is the number of
    # papers mentioning the word, not the total number of occurrences.
    years[year] += get_binary_occurences(pub.abstract)
    papers_per_year[year] += 1


In [None]:
# Show how many dated publications were counted for each year.
papers_per_year

In [None]:
# Turn each year's per-word paper counts into fractions of that year's papers.
for year, n_papers in papers_per_year.items():
    year_counts = years[year]
    for word in year_counts:
        # Only existing keys are reassigned, so updating while iterating is safe.
        year_counts[word] = year_counts[word] / n_papers


In [None]:
# Keep only the years that are backed by more than 30 papers.
years = {
    year: word_counts
    for year, word_counts in years.items()
    if papers_per_year[year] > 30
}

Check not normalized

In [None]:
# Reshape the per-year counters into one long-format table with a row per
# (word, year) pair, so the changing distribution over words can be plotted.
dfs = [
    pd.DataFrame(
        [(word, value, year) for word, value in counter.items()],
        columns=["word", "fraction of papers mentioning", "year"],
    )
    for year, counter in sorted(years.items())
]

data = pd.concat(dfs)

# Let's just look at a few words for now.
# An earlier word list, kept for reference:
# words = [
#     "learning",
#     "hydrodynamic",
#     "simulations",
#     "neutrino",
#     "exoplanet",
#     "dark",
# ]

words = [
    "probability",
    "language",
    "communication",
    "vision",
    # "deep",
    "learning",
    "network",
    # "evolution",
    "symbolic",
    # "efficiency",
    # "machine",
]

# NOTE: the results are pretty weird / counterintuitive for philosophy. We may
# need to implement a restriction to specific fields, though this will further
# limit our sample size. It feels likely that biomedical + computer science +
# physics + astronomy are the best represented in Semantic Scholar.
# words = [
#     "philosophy",
#     "evidence",
#     "theory",
#     "epistemic",
#     "semantic",
#     # confounds
#     "medical",
#     "prescription",
# ]

# Keep only the rows whose word is in the selected list.
is_selected = data["word"].isin(words)
data = data[is_selected]
# NOTE: this is not the last expression of the cell, so a notebook will not
# display it.
data

# Per-word trend of the fraction of papers mentioning it: faint raw points and
# connecting lines, plus a thick smoothed curve, all coloured by word.
raw_points = pn.geom_point(mapping=pn.aes(color="word"), alpha=0.4)
raw_lines = pn.geom_line(mapping=pn.aes(color="word"), alpha=0.4)
smoothed = pn.geom_smooth(mapping=pn.aes(color="word"), size=3)

(
    pn.ggplot(data, pn.aes(x="year", y="fraction of papers mentioning"))
    + raw_points
    + raw_lines
    + smoothed
)

In [None]:

# Undo the per-year normalisation in place so `years` holds paper counts again
# (quick fix instead of recomputing the counts from the atlas).
# Multiplying the stored fractions back is not exact in floating point
# (e.g. 1 / 49 * 49 == 0.9999999999999999), so round to the nearest integer to
# recover whole counts.
# NOTE(review): assumes the normalisation cell has been run exactly once.
for year, year_counts in years.items():
    n_papers = papers_per_year[year]
    for word in year_counts:
        year_counts[word] = round(year_counts[word] * n_papers)

# Long-format table of the per-year values in `years`, one row per
# (word, year) pair, restricted to the selected words.
dfs = [
    pd.DataFrame(
        [(word, count, year) for word, count in counter.items()],
        columns=["word", "number of papers mentioning", "year"],
    )
    for year, counter in sorted(years.items())
]

data = pd.concat(dfs)

is_selected = data["word"].isin(words)
data = data[is_selected]
# NOTE: this is not the last expression of the cell, so a notebook will not
# display it.
data

# Number of papers mentioning each selected word per year, as points joined
# by lines and coloured by word.
count_points = pn.geom_point(mapping=pn.aes(color="word"))
count_lines = pn.geom_line(mapping=pn.aes(color="word"))

(
    pn.ggplot(data, pn.aes(x="year", y="number of papers mentioning"))
    + count_points
    + count_lines
)

In [None]:
# Total number of papers across the years that have more than 30 papers.
sum(n_papers for n_papers in papers_per_year.values() if n_papers > 30)

In [None]:
# Peek at the first ten (word, value) entries recorded for 2010.
entries_2010 = list(years[2010].items())
entries_2010[:10]