In [38]:
import json
import random
import pandas as pd

In [39]:
# Load the internal-references map: a JSON object whose keys are paper ids and
# whose values are the lists of paper ids found in that paper's references.

# JSON text is UTF-8 by specification; pinning the encoding keeps the load
# independent of the platform's default locale encoding.
with open("data/internal-references-pdftotext.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Peek at ten entries to check the structure (many papers have an empty list).
list(json_data.items())[50:60]

[('alg-geom/9504005', []),
 ('alg-geom/9504017', []),
 ('alg-geom/9504008', []),
 ('alg-geom/9504006', ['alg-geom/9412003', 'alg-geom/9503003']),
 ('alg-geom/9504010', []),
 ('alg-geom/9504007', []),
 ('alg-geom/9504012', []),
 ('alg-geom/9504001', []),
 ('alg-geom/9504004', []),
 ('alg-geom/9504002', ['alg-geom/9407002'])]

In [40]:
# Record how many papers the full reference map contains before it is subsampled.
orig_len = len(json_data)
print(orig_len)

1354753


In [41]:
# Draw a fixed-size, reproducible random subset of papers to keep the
# downstream processing small.
n_sample = 10000

# Seed the global generator so the same papers are picked on every run.
random.seed(42)
sample_keys = random.sample(list(json_data), n_sample)

# Restrict the reference map to the sampled papers, in sampled order.
data = {paper_id: json_data[paper_id] for paper_id in sample_keys}

len(data)

10000

In [42]:
# Discard sampled papers whose reference list is empty; only papers with at
# least one reference can contribute an edge.
data = {paper_id: refs for paper_id, refs in data.items() if refs}

# Show the first remaining (paper, references) pair as a sanity check.
[*data.items()][0]

('hep-ex/0211012',
 ['hep-lat/0209122',
  'hep-ex/0206052',
  'hep-ex/0010054',
  'hep-ex/0205014',
  'hep-ex/0209070',
  'hep-ex/9710030',
  'hep-ph/0208188',
  'hep-ph/0207069',
  'hep-ex/0108034',
  'hep-ph/0102247',
  'hep-ph/0205069',
  'hep-ph/0106221',
  'hep-ex/0208037',
  'hep-ph/0205286',
  'hep-ex/0004021',
  'hep-ph/0110215'])

In [43]:
# Flatten the {paper: [references]} map into two parallel lists, one entry per
# (paper, reference) pair.
papers_main = []
papers_refs = []
for citing_paper, cited_papers in data.items():
    papers_main.extend([citing_paper] * len(cited_papers))
    papers_refs.extend(cited_papers)

# One row per citation: "top" holds the referenced (cited) paper and "sub"
# holds the paper whose reference list contains it (the citing paper).
df = pd.DataFrame({"top": papers_refs, "sub": papers_main})


df.shape, df.head()

((51620, 2),
                top             sub
 0  hep-lat/0209122  hep-ex/0211012
 1   hep-ex/0206052  hep-ex/0211012
 2   hep-ex/0010054  hep-ex/0211012
 3   hep-ex/0205014  hep-ex/0211012
 4   hep-ex/0209070  hep-ex/0211012)

In [44]:
# Remove duplicate rows so every (top, sub) edge appears exactly once.
#
# Use the default keep="first": keep=False would drop *every* copy of a
# repeated edge, deleting that citation from the network instead of
# collapsing it to a single row.
df = df.drop_duplicates()

df.shape, df.head()

((51620, 2),
                top             sub
 0  hep-lat/0209122  hep-ex/0211012
 1   hep-ex/0206052  hep-ex/0211012
 2   hep-ex/0010054  hep-ex/0211012
 3   hep-ex/0205014  hep-ex/0211012
 4   hep-ex/0209070  hep-ex/0211012)

In [45]:
# Keep only edges whose "top" paper also shows up in the "sub" column, i.e.
# "top" papers that themselves have references within the sample.
intersection = set(df["top"]) & set(df["sub"])

# Boolean mask over rows: True where the "top" id is one of those papers.
filtered_df = df.loc[df["top"].isin(intersection)]

# Continue working with the filtered edges under the usual name.
df = filtered_df

# Sanity check
df.shape, df.head()

((318, 2),
                 top               sub
 153  hep-ph/9806409         1211.6125
 392  hep-ph/0312285  astro-ph/0409521
 627  hep-ph/0301040        1710.00184
 731  hep-th/0602021    hep-th/0605286
 850      1704.04456        1801.09710)

In [46]:
# Persist the filtered edge list as a tab-separated file without the row index.
df.to_csv(
    "data/citations_network.csv",
    sep="\t",
    encoding="utf-8",
    index=False,
)

In [47]:
# Read the ids and titles from "data/oai-arxiv-metadata-hash-abstracts-2019-03-01.json"
# (one JSON object per line) and save them as a two-column tab-separated file.

filename = "data/oai-arxiv-metadata-hash-abstracts-2019-03-01.json"
# Decode explicitly as UTF-8 (the output below is written as UTF-8 too) so the
# result does not depend on the platform's default encoding; blank lines are
# skipped because they are not valid JSON documents.
with open(filename, "r", encoding="utf-8") as f:
    info_ = [json.loads(line) for line in f if line.strip()]

# Build only the two needed columns straight from the parsed records instead
# of first materialising a DataFrame with every metadata field.
info_aXv = pd.DataFrame(
    {
        # Ids are stored with a leading "x" -- presumably so they are read
        # back as text rather than parsed as numbers; confirm with the reader.
        "idAxv": ["x" + str(record["id"]) for record in info_],
        # Titles may span several lines in the source; remove the line breaks.
        "title": [record["title"].replace("\n", "") for record in info_],
    }
)
info_aXv.to_csv(
    "data/infos_idd_names.csv", sep="\t", encoding="utf-8", index=False
)  # save for future use

In [48]:
# Reload the saved id/title table. The first column (the ids) is read as the
# index and then turned back into a regular column.
filename = "data/infos_idd_names.csv"
info_aXv = pd.read_csv(filename, sep="\t", index_col=0).reset_index()

# Drop the first character of every stored id (the "x" added when saving).
info_aXv["idAxv"] = info_aXv["idAxv"].map(lambda prefixed_id: prefixed_id[1:])

In [49]:
# Preview the first rows of the reloaded id/title table.
info_aXv.head()

Unnamed: 0,idAxv,title
0,704.0001,Calculation of prompt diphoton production cros...
1,704.0002,Sparsity-certifying Graph Decompositions
2,704.0003,The evolution of the Earth-Moon system based o...
3,704.0004,A determinant of Stirling cycle numbers counts...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...
