In [1]:
import pandas as pd
import json
import re
import pickle

### Load paper matching data

In [2]:
# Papers matched exactly (on DOI or on title) during the first scan
first = pd.read_csv(filepath_or_buffer='results/exact_matching.csv')
first.head()

Unnamed: 0,vispub_doi,aminer_id,method
0,10.1109/visual.1993.398878,53e9a819b7602d970315ddb4,doi
1,10.1109/visual.1990.146414,53e9b3fdb7602d9703eee319,doi
2,10.1109/visual.1990.146412,53e9b6afb7602d970422d945,doi
3,10.1109/visual.1993.398863,53e9be57b7602d9704b1803a,doi
4,10.1109/infvis.2004.39,53e9978db7602d9701f50690,title


In [3]:
# Verified matches from the second scan, reduced to the two id columns and
# tagged with the matching method
second = (
    pd.read_csv('results/candidate_papers.csv')
    .loc[:, ['vispub_doi', 'aminer_id']]
    .assign(method='string matching')
)
second.head()

Unnamed: 0,vispub_doi,aminer_id,method
0,10.1109/visual.2003.1250426,599c7eff601a182cd28e1385,string matching
1,10.1109/visual.1997.663919,53e99acab7602d970234ddd8,string matching
2,10.1109/visual.1997.663928,53e9bc05b7602d970486805b,string matching
3,10.1109/visual.2000.885740,53e9adffb7602d9703806035,string matching
4,10.1109/visual.1997.663922,53e9b4e9b7602d9704014dba,string matching


In [4]:
# Stack the first-scan matches on top of the second-scan matches.
# Row labels are kept as-is, so labels from `second` repeat labels from `first`.
matches = pd.concat(objs=[first, second], axis=0)
matches

Unnamed: 0,vispub_doi,aminer_id,method
0,10.1109/visual.1993.398878,53e9a819b7602d970315ddb4,doi
1,10.1109/visual.1990.146414,53e9b3fdb7602d9703eee319,doi
2,10.1109/visual.1990.146412,53e9b6afb7602d970422d945,doi
3,10.1109/visual.1993.398863,53e9be57b7602d9704b1803a,doi
4,10.1109/infvis.2004.39,53e9978db7602d9701f50690,title
...,...,...,...
50,10.1109/visual.1992.235174,53e9b408b7602d9703efa9df,string matching
51,10.1109/visual.1994.346291,53e9ab20b7602d97034ab8e7,string matching
52,10.1109/visual.1992.235188,53e9a291b7602d9702b97808,string matching
53,10.1109/visual.1992.235179,53e9b829b7602d97043e3e2f,string matching


In [5]:
# All matched Aminer ids, and a lookup from Aminer id to the vispubdata DOI it was matched to
ids = matches['aminer_id'].tolist()
dois = dict(zip(matches['aminer_id'], matches['vispub_doi']))

In [7]:
# Check the papers that have more than one aminer_id
m = matches.groupby('vispub_doi')[['aminer_id']].count()
m.loc[m['aminer_id'] > 1]

Unnamed: 0_level_0,aminer_id
vispub_doi,Unnamed: 1_level_1
10.0000/00000001,2
10.1109/infvis.1995.528685,2
10.1109/infvis.1995.528687,2
10.1109/infvis.1997.636761,2
10.1109/infvis.1997.636790,2
...,...
10.1109/vast50239.2020.00014,2
10.1109/vast50239.2020.00015,2
10.1109/visual.1998.745338,2
10.1109/visual.2000.885720,2


In [11]:
# All match rows recorded for one of the DOIs that has two Aminer ids
matches.loc[matches['vispub_doi'].eq('10.1109/infvis.1995.528685')]

Unnamed: 0,vispub_doi,aminer_id,method
2095,10.1109/infvis.1995.528685,558a4774e4b0b32fcb35e104,title
3563,10.1109/infvis.1995.528685,635bcab890e50fcafd33d13b,doi


### Get Aminer paper details

In [13]:
%%time
papers = dict()
count, match_ids = 0, 0
# Membership is tested once per line of the dump (millions of lines). A set answers in
# constant time, whereas the `ids` list would be scanned linearly on every test.
wanted_ids = set(ids)
with open("dblp_v14.json", encoding="utf8") as infile:
    for line in infile:
        # Presumably the dump is one JSON array with one record per line: drop the trailing
        # comma / closing bracket so the line parses as a standalone JSON object.
        line = line.strip().strip(',').strip("]'")
        # Skip anything that is not a record. startswith() is also safe on an empty
        # line, where indexing line[0] would raise IndexError.
        if not line.startswith('{'):
            continue
        paper = json.loads(line)

        # Keep the records whose Aminer id appears in the matching tables
        if paper['id'] in wanted_ids:
            papers[paper['id']] = paper
            match_ids += 1

        # Progress report: records parsed so far, and how many of them were kept
        count += 1
        if count % 1000000 == 0:
            print(count, match_ids)
    print(count, match_ids)

1000000 917
2000000 1727
3000000 2633
4000000 3017
5000000 3448
5259858 3621
Wall time: 6min 28s


In [14]:
# Sanity check: number of matched Aminer ids vs. number of records retrieved from the dump
len(ids), len(papers)

(3621, 3621)

In [15]:
# Cache the retrieved Aminer records on disk
with open('results/aminer_match_papers.p', mode='wb') as handle:
    pickle.dump(papers, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Load Aminer paper details data

In [None]:
# Reload the cached Aminer records.
# NOTE: unpickling can execute arbitrary code; only load a pickle file you produced yourself.
with open('results/aminer_match_papers.p', mode='rb') as handle:
    papers = pickle.load(handle)
len(papers)

In [17]:
# Check the details of a paper with multiple Aminer ids (10.1109/infvis.1995.528685):
# this is the record that was matched by title
papers['558a4774e4b0b32fcb35e104']

{'id': '558a4774e4b0b32fcb35e104',
 'title': 'The Information Mural: a technique for displaying and navigating large information spaces',
 'doi': '10.1109/2945.722299',
 'issue': '3',
 'keywords': ['software visualization',
  'antialiasing',
  'search task',
  'two-dimensional reduced representation',
  'geographic information',
  'scientific data',
  'user interfaces',
  'information bandwidth',
  'browsing',
  'antialiased compression',
  'information retrieval',
  'display screen',
  'object-oriented programs',
  'large information space navigation',
  'color graphics',
  'global views',
  'color',
  'information mural',
  'text documents',
  'grayscale shading',
  'visual programming',
  'information space browsing',
  'information display',
  'data visualisation',
  'object-oriented programming',
  'information navigation',
  'data visualization',
  'pixels',
  'antialiasing techniques',
  'information visualization',
  'Information Mural',
  'large information spaces',
  'gray-sc

In [18]:
# Aminer record matched to 10.1109/infvis.1995.528685 by DOI
papers['635bcab890e50fcafd33d13b']

{'id': '635bcab890e50fcafd33d13b',
 'title': 'The information mural: a technique for displaying and navigating large information spaces.',
 'doi': '10.1109/INFVIS.1995.528685',
 'issue': '',
 'keywords': [],
 'lang': '',
 'venue': {'raw': 'IEEE Visualization Conference'},
 'year': 1995,
 'n_citation': 322,
 'page_start': '43',
 'page_end': '50',
 'volume': '',
 'issn': '',
 'isbn': '',
 'url': ['https://doi.org/10.1109/INFVIS.1995.528685'],
 'abstract': '',
 'authors': [{'id': '', 'name': 'Dean F. Jerding', 'org': ''},
  {'id': '53f43035dabfaee43ebe943d', 'name': 'John T. Stasko', 'org': ''}],
 'doc_type': 'Conference'}

### Internal references

In [19]:
# Load the vispubdata table (keep_default_na=False: default NA markers are not converted to NaN)
df = pd.read_csv("../vispubdata-update/vispubdata-update.csv", keep_default_na=False)
# Lower-cased copy of the DOI column
df['doi'] = df["DOI"].astype(str).str.lower()
# Publication year per DOI.
# NOTE(review): keyed by the raw 'DOI' column, not the lower-cased 'doi' one - confirm this is intended
years = dict(zip(df['DOI'], df['Year']))

In [20]:
# Publication year of every retrieved Aminer record, keyed by Aminer id
years = {aminer_id: record['year'] for aminer_id, record in papers.items()}

# One row per (citing paper, cited Aminer id); records without a 'references' field add no rows
cites = pd.DataFrame(
    [
        [aminer_id, record['year'], cited_id]
        for aminer_id, record in papers.items()
        if 'references' in record
        for cited_id in record['references']
    ],
    columns=['paper', 'paper_year', 'reference'],
)

In [21]:
# Display the citation table: citing Aminer id, its year, and the cited Aminer id
cites

Unnamed: 0,paper,paper_year,reference
0,53e9be57b7602d9704b1803a,1993,557ec040d19faf961d16c238
1,53e9be57b7602d9704b1803a,1993,5582e4000cf2bb52dc3e032d
2,53e9be57b7602d9704b1803a,1993,53e99ba3b7602d970244ca1d
3,53e9be57b7602d9704b1803a,1993,53e99bb2b7602d970245d252
4,53e9be57b7602d9704b1803a,1993,53e9a8d3b7602d97032219fa
...,...,...,...
57952,633c10ea90e50fcafdc35542,2015,53e99a98b7602d9702313574
57953,633c10ea90e50fcafdc35670,2015,53e9b661b7602d97041c60bc
57954,633c10ea90e50fcafdc35670,2015,53e99f64b7602d9702838cc0
57955,633c10ea90e50fcafdc35670,2015,53e99a04b7602d9702250b0b


In [22]:
# Keep only internal citations, i.e. references that are themselves matched papers
cites = cites[cites['reference'].isin(ids)].copy()
# Look the year of each reference up with map(). replace() is a value-substitution tool:
# for an id missing from `years` it would silently leave the raw Aminer id in the year
# column, and it is slow with a lookup of this size. map() yields NaN for a missing id.
cites['reference_year'] = cites['reference'].map(years)

In [23]:
# Drop the rows where a paper lists itself as a reference
cites = cites.loc[cites['paper'].ne(cites['reference'])]

In [24]:
# Show the citations whose cited paper is dated later than the citing paper
cites.loc[cites['reference_year'].gt(cites['paper_year'])]

Unnamed: 0,paper,paper_year,reference,reference_year
548,53e99827b7602d970204987c,1997,53e99b5db7602d97024026b0,1998
10980,53e9a73bb7602d9703071886,2007,53e9be3cb7602d9704af7b3c,2008
12321,53e9a9f7b7602d970335d8b7,2006,53e9bb67b7602d97047aa045,2007
33983,558aeafbe4b037c08759f29e,2007,558c855984ae6766fdf3898f,2010
34105,558aeafde4b037c08759f2ac,2007,53e9ae42b7602d9703859dbe,2009
47984,5c2c7a9217c44a4e7cf31980,2018,5c8dbfa34895d9cbc69bd7cd,2019
51344,5e63725b91e011ae97a69e48,2019,5d63ae47ed7e9c7926557db4,2020


In [25]:
# Replace the Aminer ids with the vispubdata DOIs they were matched to.
# Only the two id columns are translated (the year columns are not part of the substitution);
# an id without a DOI in `dois` is kept unchanged.
cites = cites.assign(
    paper=cites['paper'].map(dois).fillna(cites['paper']),
    reference=cites['reference'].map(dois).fillna(cites['reference']),
)

# Re-derive both year columns from the vispubdata table `df`, keyed by lower-cased DOI.
# `years` cannot be used for this: it is keyed by Aminer id, so once the ids have been
# replaced by DOIs a lookup in it matches nothing and the year columns would end up
# holding DOI strings, turning the later year comparison into a string comparison.
# Where vispubdata has no entry for a DOI, the Aminer year already in the column is kept.
vispub_years = dict(zip(df['DOI'].astype(str).str.lower(), df['Year']))
cites['paper_year'] = cites['paper'].map(vispub_years).fillna(cites['paper_year'])
cites['reference_year'] = cites['reference'].map(vispub_years).fillna(cites['reference_year'])

In [26]:
# Keep only the citations whose cited paper is not dated later than the citing paper
cites = cites.loc[cites['reference_year'].le(cites['paper_year'])]

In [27]:
# For each citing paper: its distinct references, in order of first appearance, joined with ';'
c = (
    cites.groupby('paper')['reference']
    .agg(lambda refs: ';'.join(refs.unique()))
    .to_dict()
)

### Merge into vispubdata dataset

In [28]:
# Reload the vispubdata table from disk (default NA markers are not converted to NaN)
df = pd.read_csv(
    filepath_or_buffer="../vispubdata-update/vispubdata-update.csv",
    keep_default_na=False,
)

In [29]:
# Group the matched Aminer ids by vispubdata DOI once, up front, so the loop below does a
# dictionary lookup per row instead of rebuilding the DOI list and re-filtering `matches`
# for every row of df.
ids_by_doi = {}
for doi, aminer_id in zip(matches['vispub_doi'], matches['aminer_id']):
    ids_by_doi.setdefault(doi, []).append(aminer_id)

count = 0  # number of df rows that received a citation count
for index, doi in df['DOI'].items():
    m_ids = ids_by_doi.get(doi)
    if not m_ids:
        continue

    # Citation count: a DOI can have several Aminer records, keep the highest count
    n_cites = max(papers[x]['n_citation'] for x in m_ids)
    df.loc[index, 'AminerCitationCount'] = n_cites

    # Internal references (disabled)
    #if doi in c:
    #    df.loc[index, 'internal_references'] = c[doi]

    count += 1

In [30]:
# Display the vispubdata table after the AminerCitationCount column has been filled in
df

Unnamed: 0,Conference,Year,Title,DOI,Link,FirstPage,LastPage,PaperType,Abstract,AuthorNames-Deduped,AuthorNames,AuthorAffiliation,InternalReferences,AuthorKeywords,AminerCitationCount,CitationCount_CrossRef,PubsCited_CrossRef,Downloads_Xplore,Award,GraphicsReplicabilityStamp
0,Vis,2022,Photosensitive Accessibility for Interactive D...,10.1109/tvcg.2022.3209359,http://dx.doi.org/10.1109/TVCG.2022.3209359,374,384,J,Accessibility guidelines place restrictions on...,Laura South;Michelle A. Borkin,Laura South;Michelle A. Borkin,"Northeastern University, USA;Northeastern Univ...",0.1109/tvcg.2011.185;10.1109/tvcg.2021.3114829...,"accessibility,photosensitive epilepsy,photosen...",,4,63,554,,
1,Vis,2022,HetVis: A Visual Analysis Approach for Identif...,10.1109/tvcg.2022.3209347,http://dx.doi.org/10.1109/TVCG.2022.3209347,310,319,J,Horizontal federated learning (HFL) enables di...,Xumeng Wang;Wei Chen 0001;Jiazhi Xia;Zhen Wen;...,Xumeng Wang;Wei Chen;Jiazhi Xia;Zhen Wen;Rongc...,"TMCC, CS, Nankai University, China;State Key L...",0.1109/tvcg.2015.2467618;10.1109/tvcg.2019.293...,"Federated learning,data heterogeneity,cluster ...",,10,43,984,,
2,Vis,2022,Rigel: Transforming Tabular Data by Declarativ...,10.1109/tvcg.2022.3209385,http://dx.doi.org/10.1109/TVCG.2022.3209385,128,138,J,"We present Rigel, an interactive system for ra...",Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay...,Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay...,"State Key Lab of CAD&CG, Zhejiang University, ...",0.1109/tvcg.2021.3114830;10.1109/vast47406.201...,"Data transformation,self-service data transfor...",,6,68,610,,
3,Vis,2022,BeauVis: A Validated Scale for Measuring the A...,10.1109/tvcg.2022.3209390,http://dx.doi.org/10.1109/TVCG.2022.3209390,363,373,J,We developed and validated a rating scale to a...,Tingying He;Petra Isenberg;Raimund Dachselt;To...,Tingying He;Petra Isenberg;Raimund Dachselt;To...,"Université Paris-Saclay, CNRS, Inria, LISN, Fr...",0.1109/infvis.2005.1532128;10.1109/tvcg.2006.1...,"Aesthetics,aesthetic pleasure,validated scale,...",,7,79,753,,X
4,Vis,2022,NAS-Navigator: Visual Steering for Explainable...,10.1109/tvcg.2022.3209361,http://dx.doi.org/10.1109/TVCG.2022.3209361,299,309,J,The success of DL can be attributed to hours o...,Anjul Kumar Tyagi;Cong Xie;Klaus Mueller 0001,Anjul Tyagi;Cong Xie;Klaus Mueller,"Computer Science Department, Visual Analytics ...",0.1109/vast.2012.6400490;10.1109/tvcg.2019.293...,"Deep Learning,Neural Network Architecture Sear...",,0,63,391,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3748,Vis,2023,Extract and Characterize Hairpin Vortices in T...,10.1109/tvcg.2023.3326603,http://dx.doi.org/10.1109/TVCG.2023.3326603,716,726,J,Hairpin vortices are one of the most important...,Adeel Zafar;Di Yang;Guoning Chen,Adeel Zafar;Di Yang;Guoning Chen,"University of Houston, USA;University of Houst...",10.1109/visual.1994.346327;10.1109/tvcg.2018.2...,"Turbulent flow,vortices,hairpin vortex extraction",,0,55,165,HM,
3749,Vis,2023,Causality-Based Visual Analysis of Questionnai...,10.1109/tvcg.2023.3327376,http://dx.doi.org/10.1109/TVCG.2023.3327376,638,648,J,"As the final stage of questionnaire analysis, ...",Renzhong Li;Weiwei Cui;Tianqi Song;Xiao Xie;Ru...,Renzhong Li;Weiwei Cui;Tianqi Song;Xiao Xie;Ru...,"State Key Lab of CAD&CG, Zhejiang University, ...",10.1109/tvcg.2021.3114875;10.1109/tvcg.2022.32...,"Causal analysis,Questionnaire,Design study",,0,44,304,,
3750,Vis,2023,Quantivine: A Visualization Approach for Large...,10.1109/tvcg.2023.3327148,http://dx.doi.org/10.1109/TVCG.2023.3327148,573,583,J,Quantum computing is a rapidly evolving field ...,Zhen Wen;Yihan Liu;Siwei Tan;Jieyi Chen;Minfen...,Zhen Wen;Yihan Liu;Siwei Tan;Jieyi Chen;Minfen...,"State Key Lab of CAD&CG, Zhejiang University, ...",10.1109/tvcg.2018.2865139;10.1109/infvis.2004....,"Quantum circuit,semantic analysis,visual abstr...",,0,83,196,,
3751,Vis,2023,The Arrangement of Marks Impacts Afforded Mess...,10.1109/tvcg.2023.3326590,http://dx.doi.org/10.1109/TVCG.2023.3326590,1008,1018,J,Data visualizations present a massive number o...,Racquel Fygenson;Steven Franconeri;Enrico Bertini,Racquel Fygenson;Steven Franconeri;Enrico Bertini,"Northeastern University, USA;Northeastern Univ...",10.1109/tvcg.2012.197;10.1109/tvcg.2013.234;10...,"Perception & cognition,Methodologies,Human-sub...",,0,67,194,,


In [32]:
#df.to_csv('results/vispubdata_citation.csv', index=False)

In [33]:
# Write the updated table back over the vispubdata CSV, without the row index
df.to_csv(path_or_buf="../vispubdata-update/vispubdata-update.csv", index=False)