In [23]:
import json
from collections import Counter
import pandas as pd

### Sample records

{"citationid":1576440663,"citingcorpusid":145058343,"citedcorpusid":null,"isinfluential":false,"contexts":null,"intents":null}
{"citationid":2356978354,"citingcorpusid":24094241,"citedcorpusid":1920165,"isinfluential":false,"contexts":null,"intents":null}

In [10]:
# Collect `citingcorpusid` from every record whose `citedcorpusid` is not
# null, one list entry per matching record.
new_data = []

# Pin the encoding: JSON text is UTF-8, and relying on the platform's locale
# default can fail or mis-decode on systems where that default differs.
with open('semantic_scholar_citations_sample_20250715/citations-part1.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines are not valid JSON; skip them instead of crashing.
            continue
        new_item = json.loads(line)
        # Compare against None explicitly: a plain truthiness test would also
        # discard a record whose cited id happens to be 0.
        if new_item.get('citedcorpusid') is not None:
            new_data.append(new_item.get('citingcorpusid'))

In [11]:
# Count how many times each id occurs in new_data.
c = Counter(new_data)

In [12]:
# Display the five ids with the highest counts, as (id, count) pairs.
c.most_common(5)

[(14222903, 107),
 (136795997, 90),
 (11155870, 87),
 (1500895, 86),
 (202684204, 82)]

In [13]:
# Collect `citingcorpusid` from every record in the file, with no filtering
# on `citedcorpusid`.
new_data_all = []

# Pin the encoding: JSON text is UTF-8, and relying on the platform's locale
# default can fail or mis-decode on systems where that default differs.
with open('semantic_scholar_citations_sample_20250715/citations-part1.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines are not valid JSON; skip them instead of crashing.
            continue
        new_item = json.loads(line)
        new_data_all.append(new_item.get('citingcorpusid'))

In [14]:
# Count how many times each id occurs in new_data_all.
c_all = Counter(new_data_all)

In [15]:
# Display the five ids with the highest counts, as (id, count) pairs.
c_all.most_common(5)

[(10848014, 658),
 (9067019, 128),
 (136795997, 117),
 (167936588, 113),
 (14222903, 107)]

In [41]:
def _counts_frame(counts):
    """Build a two-column frame with one row per key of `counts` and its count."""
    rows = list(counts.items())
    return pd.DataFrame.from_records(rows, columns=['corpusid', 'count'])


df = _counts_frame(c)
df_all = _counts_frame(c_all)

In [58]:
# Left-join the filtered counts onto the unfiltered ones, then derive the
# fraction of each id's total that survives the filter.
df_per = (
    df_all
    .merge(df, how='left', on='corpusid', suffixes=('_all', '_source'))
    # Ids present only in df_all get NaN from the left join; treat those as 0
    # and restore an integer dtype.
    .assign(count_source=lambda merged: merged['count_source'].fillna(value=0).astype(int))
    .assign(share=lambda merged: merged['count_source'] / merged['count_all'])
)

In [59]:
# Preview the first five rows of df_per.
df_per.head(5)

Unnamed: 0,corpusid,count_all,count_source,share
0,273985564,2,2,1.0
1,215885109,2,2,1.0
2,21200586,1,1,1.0
3,210493829,2,2,1.0
4,37407888,1,1,1.0


In [62]:
# Rows whose share lies strictly between 0.1 and 0.5 and whose total count
# exceeds 10.
share_in_band = (df_per['share'] > 0.1) & (df_per['share'] < 0.5)
enough_records = df_per['count_all'] > 10
df_per[share_in_band & enough_records]

Unnamed: 0,corpusid,count_all,count_source,share
49573,8554023,11,5,0.454545
68874,5618881,12,4,0.333333
136382,165559226,18,3,0.166667
143958,9936537,22,9,0.409091
144652,147242408,11,3,0.272727
...,...,...,...,...
8099817,211835631,12,3,0.250000
8456526,152842253,11,2,0.181818
8672763,151564129,12,3,0.250000
9035647,219827554,12,5,0.416667


## Test mapping: merging partial counters

In [69]:
# Display the five ids with the highest counts, as (id, count) pairs.
c.most_common(5)

[(14222903, 107),
 (136795997, 90),
 (11155870, 87),
 (1500895, 86),
 (202684204, 82)]

In [125]:
# Four test lists of (id, count) pairs taken from the top of `c`; the first
# two deliberately use the same size.
d1_test, d2_test, d3_test, d4_test = (
    c.most_common(top_n) for top_n in (5, 5, 10, 15)
)

In [126]:
# Gather the four test lists into a single list.
d_list = [d1_test, d2_test, d3_test, d4_test]

In [127]:
from multiprocessing import Pool
from operator import add
from functools import reduce

In [128]:
# Merge the per-chunk (id, count) lists into one Counter by summing the
# counts of matching ids.
# No multiprocessing pool is opened here: the fold runs entirely in the
# current process, so starting worker processes that are never handed any
# work would only add start-up and tear-down cost.
# The empty Counter() initializer makes an empty d_list yield an empty
# Counter instead of raising TypeError.
c_reduced = reduce(add, (Counter(dict(i)) for i in d_list), Counter())

In [129]:
c_reduced

Counter({14222903: 428,
         136795997: 360,
         11155870: 348,
         1500895: 344,
         202684204: 328,
         86108331: 158,
         5018677: 154,
         86234595: 130,
         278634262: 108,
         262346921: 106,
         73343816: 52,
         7825473: 49,
         247670914: 48,
         11815254: 45,
         1035356: 45})