# 0. Log In and Setup
Change credentials and venue details

In [None]:
import openreview
import pandas as pd

# API v2 client shared by every later cell.
# Fill in your OpenReview username and password before running.
client_v2 = openreview.api.OpenReviewClient(
    username='',
    password='',
    baseurl='https://api2.openreview.net',
)

In [None]:
# Local folder that receives every CSV written by this notebook
path_to_local_scores_dir = ''

# Venue in column 1: short display name and OpenReview venue ID
venue_A_name = ''  # E.g. AAAI26
venue_A_id = ''

# Venue in column 2: short display name and OpenReview venue ID
venue_B_name = ''
venue_B_id = ''

# 1. Request job
Change request as needed

In [None]:
# Submit the similarity request between the two venues' submissions.
# The value returned by the request is kept in job_id.
similarity_job_name = f'{venue_A_name}-{venue_B_name}-Paper-Similarity'
job_id = client_v2.request_paper_similarity(
    name=similarity_job_name,
    venue_id=f'{venue_A_id}/Submission',
    alternate_venue_id=f'{venue_B_id}/Submission',
    model='specter2+scincl',
)

In [None]:
# Display the value returned by the similarity request
job_id

## 1a. Check status
Wait until complete

In [None]:
# Query the status of a job; put the job ID between the quotes.
res = client_v2.get_expertise_status(job_id='')
# Display the response; a bare assignment would leave the cell output
# empty and there would be nothing to check for completion.
res

## 1b. Find job ID
Get all jobs based on status and find your job based on the name

In [None]:
# Request the expertise jobs whose status is 'Complete'
jobs = client_v2.get_expertise_jobs(status='Complete')

In [None]:
# Print the ID and name of each returned job whose name contains the
# configured venue A name (the filter is no longer tied to one literal
# venue string).
for j in jobs['results']:
    job_name = j.get('name') or ''  # a job without a name simply never matches
    if venue_A_name in job_name:
        print(j['jobId'], job_name)

# Build similarity score CSV

- CSV contains: paper IDs, scores, titles, abstracts, author lists, overlapping authors (optional) -- for both venues
-----

## 2. Retrieve scores and convert to CSV
Change job_id

In [None]:
# Could take 5-10min
# Fetch the results of the job; put the job ID between the quotes
results = client_v2.get_expertise_results(job_id='')

### Convert results to CSV
Change CSV name if needed

In [None]:
import os

# Build the path with os.path.join so that an empty
# path_to_local_scores_dir means "current directory" instead of
# yielding '/<name>.csv' at the filesystem root.
results_csv = os.path.join(
    path_to_local_scores_dir,
    f'{venue_A_name}-{venue_B_name}-Similarity-Scores-Sparse.csv',
)

# One CSV row per record in results['results']; the DataFrame index is not written.
pd.DataFrame.from_records(results['results']).to_csv(results_csv, index=False)

## 3. Read score file, create paper ID matrix, map scores
Change score file name if needed

In [None]:
import csv

score_file = results_csv
# Nested mapping: first-column paper ID -> {second-column paper ID -> score}
paper_id_score_matrix = {}

# Read the results CSV from #2.
# newline='' is how the csv module expects its files to be opened, and
# utf-8 is the encoding pandas uses by default when writing a CSV.
with open(score_file, 'r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # skip header; an empty file yields an empty matrix
    for row in csv_reader:
        if not row:
            # csv.reader returns [] for a blank line; nothing to record
            continue

        # Columns are taken by position: venue A ID, venue B ID, score
        venue_A_paper_id, venue_B_paper_id = row[0], row[1]
        score = float(row[2])

        # Create the inner mapping on first sight of a venue A paper, then store the score
        paper_id_score_matrix.setdefault(venue_A_paper_id, {})[venue_B_paper_id] = score

## 4. Get venue A papers, map paper ID to submission note

In [None]:
# Get all submissions
# (notes requested with venue A's '/-/Submission' invitation)
venue_A_subs = client_v2.get_all_notes(invitation=f'{venue_A_id}/-/Submission')

In [None]:
# Index venue A submissions by note ID so later cells can look a note up directly
venue_A_paperid_map = {sub.id: sub for sub in venue_A_subs}

## 5. Map venue A papers to author profiles

### 5a. Get all authors, convert to profiles

In [None]:
# Distinct author IDs taken from the 'authorids' field of every venue A submission
venue_A_all_authors = {
    author_id
    for sub in venue_A_subs
    for author_id in venue_A_paperid_map[sub.id].content['authorids']['value']
}

In [None]:
# Number of distinct author IDs collected for venue A
len(venue_A_all_authors)

In [None]:
# Look up profiles for the collected venue A author IDs
venue_A_all_author_profiles = openreview.tools.get_profiles(client_v2, venue_A_all_authors)

### 5b. Map ALL usernames in ALL author profiles to their profile.id
To normalize author IDs to profile IDs

All IDs should belong to 1 profile.

- 1 ID may map to multiple profiles -- Merge.
- 1 ID can appear in 1 profile multiple times -- Ignore.

In [None]:
# username -> profile.id for every username listed in the fetched venue A profiles
username_to_id = {}
flagged_profiles = []  # usernames seen under two different profile IDs
merge_profiles = []    # flagged usernames for which the lookup returns more than one profile

for profile in venue_A_all_author_profiles:
    for name_entry in profile.content['names']:
        name = name_entry.get('username', '')
        if not name:
            continue

        known_id = username_to_id.get(name)
        # A repeat of the username under the same profile ID is ignored;
        # only a username that shows up under a different profile ID is
        # flagged, and each username is flagged (and re-checked) once.
        if known_id is not None and known_id != profile.id and name not in flagged_profiles:
            flagged_profiles.append(name)
        username_to_id[name] = profile.id

if flagged_profiles:
    print('Profile anomalies found. Checking...')
    for flagged_id in flagged_profiles:
        # Look the username up on its own and see how many profiles come back
        profiles = openreview.tools.get_profiles(client_v2, [flagged_id])
        if len(profiles) > 1:
            merge_profiles.append(flagged_id)

if merge_profiles:
    print(f"Merge the following profiles before continuing, then re-run script: {merge_profiles}")
else:
    print('Profiles look good, safe to continue.')

### 5c. Map paper IDs to author profile IDs
If an ID is not in the map, the profile is probably blocked or deleted, so we keep the original author ID from the submission.

In [None]:
# Venue A paper ID -> list of author IDs, each replaced by its profile ID
# when username_to_id knows it and kept as written on the paper otherwise.
venue_A_paper_authorids_map = {}

for sub in venue_A_subs:
    paper_author_ids = venue_A_paperid_map[sub.id].content['authorids']['value']
    normalized_ids = [username_to_id.get(author_id, author_id) for author_id in paper_author_ids]
    venue_A_paper_authorids_map.setdefault(sub.id, []).extend(normalized_ids)

## 6. Get venue B papers, map paper ID to submission note

In [None]:
# Get all submissions
# (notes requested with venue B's '/-/Submission' invitation)
venue_B_subs = client_v2.get_all_notes(invitation=f'{venue_B_id}/-/Submission')

In [None]:
# Number of venue B submissions retrieved
len(venue_B_subs)

In [None]:
# Index venue B submissions by note ID so later cells can look a note up directly
venue_B_paperid_map = {sub.id: sub for sub in venue_B_subs}

## 7. Map venue B papers to author profiles

### 7a. Get all authors, convert to profiles

In [None]:
# Distinct author IDs taken from the 'authorids' field of every venue B submission
venue_B_all_authors = {
    author_id
    for sub in venue_B_subs
    for author_id in venue_B_paperid_map[sub.id].content['authorids']['value']
}

In [None]:
# Number of distinct author IDs collected for venue B
len(venue_B_all_authors)

In [None]:
# Look up profiles for the collected venue B author IDs
venue_B_all_author_profiles = openreview.tools.get_profiles(client_v2, venue_B_all_authors)

### 7b. Map ALL usernames in ALL author profiles to their profile.id
To normalize author IDs to profile IDs

All IDs should belong to 1 profile.

- 1 ID may map to multiple profiles -- Merge.
- 1 ID can appear in 1 profile multiple times -- Ignore.

In [None]:
# username -> profile.id for every username listed in the fetched venue B profiles.
# NOTE: this rebinds username_to_id, so the mapping now describes venue B only.
username_to_id = {}
flagged_profiles = []  # usernames seen under two different profile IDs
merge_profiles = []    # flagged usernames for which the lookup returns more than one profile

for profile in venue_B_all_author_profiles:
    for name_entry in profile.content['names']:
        name = name_entry.get('username', '')
        if not name:
            continue

        known_id = username_to_id.get(name)
        # A repeat of the username under the same profile ID is ignored;
        # only a username that shows up under a different profile ID is
        # flagged, and each username is flagged (and re-checked) once.
        if known_id is not None and known_id != profile.id and name not in flagged_profiles:
            flagged_profiles.append(name)
        username_to_id[name] = profile.id

if flagged_profiles:
    print('Profile anomalies found. Checking...')
    for flagged_id in flagged_profiles:
        # Look the username up on its own and see how many profiles come back
        profiles = openreview.tools.get_profiles(client_v2, [flagged_id])
        if len(profiles) > 1:
            merge_profiles.append(flagged_id)

if merge_profiles:
    print(f"Merge the following profiles before continuing, then re-run script: {merge_profiles}")
else:
    print('Profiles look good, safe to continue.')

### 7c. Map paper IDs to author profile IDs
If an ID is not in the map, the profile is probably blocked or deleted, so we keep the original author ID from the submission.

In [None]:
# Venue B paper ID -> list of author IDs, each replaced by its profile ID
# when username_to_id knows it and kept as written on the paper otherwise.
venue_B_paper_authorids_map = {}

for sub in venue_B_subs:
    paper_author_ids = venue_B_paperid_map[sub.id].content['authorids']['value']
    normalized_ids = [username_to_id.get(author_id, author_id) for author_id in paper_author_ids]
    venue_B_paper_authorids_map.setdefault(sub.id, []).extend(normalized_ids)

## 8. Find overlapping authors

In [None]:
# (venue A paper ID, venue B paper ID) -> set of author IDs found on both papers.
# Only pairs with at least one shared author get an entry.
overlapping_author_map = {}

# Author set per venue B paper, built the first time the paper is seen and
# reused afterwards, because the same venue B paper can appear in many pairs.
venue_B_author_sets = {}

for venue_A_paper_id, venue_B_paper_ids_map in paper_id_score_matrix.items():
    venue_A_paper_author_set = set(venue_A_paper_authorids_map[venue_A_paper_id])

    for venue_B_paper_id in venue_B_paper_ids_map:
        venue_B_paper_author_set = venue_B_author_sets.get(venue_B_paper_id)
        if venue_B_paper_author_set is None:
            venue_B_paper_author_set = set(venue_B_paper_authorids_map[venue_B_paper_id])
            venue_B_author_sets[venue_B_paper_id] = venue_B_paper_author_set

        # '&' returns a new set, so the cached sets are never modified
        overlapping_authors = venue_A_paper_author_set & venue_B_paper_author_set

        if overlapping_authors:
            # Key is tuple of both IDs from each venue
            overlapping_author_map[(venue_A_paper_id, venue_B_paper_id)] = overlapping_authors

In [None]:
# Number of paper pairs that share at least one author
len(overlapping_author_map)

## 9. Create final CSV file

### 9a. For overlapping authors
Change file name if needed

In [None]:
import os

# os.path.join keeps an empty path_to_local_scores_dir pointing at the
# current directory rather than at the filesystem root.
new_file_name_overlap = os.path.join(
    path_to_local_scores_dir,
    f'{venue_A_name}-{venue_B_name}-Paper-Similarity-Sparse-Overlap.csv',
)

# The encoding is fixed to utf-8: titles and abstracts can contain non-ASCII
# text, and the file is later read back with pandas, whose default is utf-8.
with open(new_file_name_overlap, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerow([f'{venue_A_name} id', f'{venue_B_name} id', 'Score', 'Matching Authors', f'{venue_A_name} authors', f'{venue_B_name} authors', f'{venue_A_name} title', f'{venue_B_name} title', f'{venue_A_name} abstract', f'{venue_B_name} abstract'])

    # One row per paper pair that shares at least one author
    for paper_pair, shared_authors in overlapping_author_map.items():
        venue_A_paper_id, venue_B_paper_id = paper_pair
        score = paper_id_score_matrix[venue_A_paper_id][venue_B_paper_id]

        venue_A_content = venue_A_paperid_map[venue_A_paper_id].content
        venue_B_content = venue_B_paperid_map[venue_B_paper_id].content

        venue_A_paper_title = venue_A_content['title']['value']
        venue_B_paper_title = venue_B_content['title']['value']

        # Real newlines are written as the two characters '\n' so each abstract stays on one line
        venue_A_paper_abstract = venue_A_content['abstract']['value'].replace("\n", "\\n")
        venue_B_paper_abstract = venue_B_content['abstract']['value'].replace("\n", "\\n")

        venue_A_paper_authors = '|'.join(venue_A_paper_authorids_map[venue_A_paper_id])
        venue_B_paper_authors = '|'.join(venue_B_paper_authorids_map[venue_B_paper_id])

        # sorted() gives a stable column value; plain set iteration order is not fixed
        paper_author_overlap_str = '|'.join(sorted(shared_authors))

        row = [venue_A_paper_id, venue_B_paper_id, score, paper_author_overlap_str, venue_A_paper_authors, venue_B_paper_authors, venue_A_paper_title, venue_B_paper_title, venue_A_paper_abstract, venue_B_paper_abstract]

        csvwriter.writerow(row)

### 9b. For non-overlapping authors (all scored pairs; overlapping authors are listed when present)
Change file name if needed

Creates a VERY large file, ~70GB if 1 venue is large

Can take ~15min

In [None]:
import os

# os.path.join keeps an empty path_to_local_scores_dir pointing at the
# current directory rather than at the filesystem root.
new_file_name_no_overlap = os.path.join(
    path_to_local_scores_dir,
    f'{venue_A_name}-{venue_B_name}-Paper-Similarity-Sparse-No-Overlap.csv',
)

# The encoding is fixed to utf-8 because titles and abstracts can contain
# non-ASCII text that a platform-default encoding may fail to write.
with open(new_file_name_no_overlap, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerow([f'{venue_A_name} id', f'{venue_B_name} id', 'Score', 'Matching Authors (if any)', f'{venue_A_name} authors', f'{venue_B_name} authors', f'{venue_A_name} title', f'{venue_B_name} title', f'{venue_A_name} abstract', f'{venue_B_name} abstract'])

    # One row per scored pair, whether or not the two papers share authors
    for venue_A_paper_id, venue_B_paper_ids_map in paper_id_score_matrix.items():
        # Venue A fields are the same for every pair of this paper, so read them once
        venue_A_content = venue_A_paperid_map[venue_A_paper_id].content
        venue_A_paper_title = venue_A_content['title']['value']
        venue_A_paper_abstract = venue_A_content['abstract']['value'].replace("\n", "\\n")
        venue_A_paper_authors = '|'.join(venue_A_paper_authorids_map[venue_A_paper_id])

        # The inner mapping already holds the score for each venue B paper
        for venue_B_paper_id, score in venue_B_paper_ids_map.items():
            venue_B_content = venue_B_paperid_map[venue_B_paper_id].content
            venue_B_paper_title = venue_B_content['title']['value']
            venue_B_paper_abstract = venue_B_content['abstract']['value'].replace("\n", "\\n")
            venue_B_paper_authors = '|'.join(venue_B_paper_authorids_map[venue_B_paper_id])

            # Include overlapping authors if they exist, otherwise display "None".
            # sorted() gives a stable column value; plain set iteration order is not fixed.
            paper_author_overlap = overlapping_author_map.get((venue_A_paper_id, venue_B_paper_id), [])
            paper_author_overlap_str = '|'.join(sorted(paper_author_overlap)) if paper_author_overlap else 'None'

            row = [venue_A_paper_id, venue_B_paper_id, score, paper_author_overlap_str, venue_A_paper_authors, venue_B_paper_authors, venue_A_paper_title, venue_B_paper_title, venue_A_paper_abstract, venue_B_paper_abstract]

            csvwriter.writerow(row)

## 10. Sort by score
Sorts the overlap CSV from 9a in place, highest score first

In [None]:
# Rewrite the overlap CSV in place with rows ordered from highest to lowest 'Score'
file_path = new_file_name_overlap

df = pd.read_csv(file_path)
df_sorted = df.sort_values('Score', ascending=False)

# index=False: the DataFrame index is not written as a column
df_sorted.to_csv(file_path, index=False)