# 0. Log In and Setup

In [None]:
import openreview
import pandas as pd

# Client for the OpenReview API at api2.openreview.net; every later cell
# talks to OpenReview through this object.
# Fill in your own credentials and avoid committing them with the notebook.
client_v2 = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username='',
    password=''
)

In [None]:
# Short label used in the job name, the output file names and the CSV column names.
venue_name = '' # E.g. AAAI26
# Venue ID prefix used to build the similarity job's venue IDs and the submission invitation.
venue_id = ''

# Directory for the generated CSV files. Paths are built as f'{dir}/{file}',
# so give it without a trailing slash.
path_to_local_scores_dir = ''

# 1. Request job

In [None]:
# Request a paper-to-paper similarity job. venue_id and alternate_venue_id are
# the same value, so the venue's submissions are compared with each other.
# The value returned by the request is stored in job_id.
job_id = client_v2.request_paper_similarity(
    name=f'{venue_name}-Paper-Similarity',
    venue_id=f'{venue_id}/Submission',
    alternate_venue_id=f'{venue_id}/Submission',
    model='specter2+scincl'
)

In [None]:
# Display the job ID so it can be copied into the status/result cells below.
job_id

## 1a. Check status
Paste your job ID into `job_id` below, then re-run the cell until the job has completed.

In [None]:
# Paste the job ID (from step 1 or 1b) into job_id, then re-run this cell
# until the job has finished.
res = client_v2.get_expertise_status(job_id='')
# Show the response: a bare assignment displays nothing in a notebook cell.
res

## 1b. Find job ID if you lost it
Get all jobs based on status and find your job based on the name

In [None]:
# Ask the expertise API for jobs filtered by status 'Complete'.
jobs = client_v2.get_expertise_jobs(status='Complete')

In [None]:
# Print the ID and name of every returned job that belongs to this venue.
# The job was requested with the name f'{venue_name}-Paper-Similarity', so
# filter on venue_name rather than a hard-coded venue string.
for j in jobs['results']:
    if venue_name in j['name']:
        print(j['jobId'], j['name'])

# Build similarity score CSV

- CSV contains: paper IDs, scores, titles, abstracts, author lists, overlapping authors (optional) -- for all paper pairs
-----

## 2. Retrieve scores and convert to CSV
Set `job_id` in the cell below to the ID of your completed job before running it.

In [None]:
# Could take 5-10min
# Paste the ID of the completed job (see steps 1 and 1b) into job_id.
results = client_v2.get_expertise_results(job_id='')

#### Convert results to CSV
Change CSV name if needed

In [None]:
# Destination of the raw (sparse) score dump; later cells read it back.
results_csv = f'{path_to_local_scores_dir}/{venue_name}-Similarity-Scores-Sparse.csv'

# One CSV row per entry in results['results']; the DataFrame index is not written.
score_records = pd.DataFrame.from_records(results['results'])
score_records.to_csv(results_csv, index=False)

## 3. Read score file, create paper ID & score matrix
Change score file name if needed

In [None]:
import csv
score_file = results_csv

# Nested mapping: first paper ID -> {second paper ID -> similarity score}.
paper_id_score_matrix = {}

with open(score_file, 'r') as score_fh:
    score_rows = csv.reader(score_fh)
    next(score_rows)  # the first line is the header, not data

    for record in score_rows:
        # Columns are read by position: first ID, second ID, score.
        first_id, second_id = record[0], record[1]
        # The right-hand side is evaluated first, so a malformed score
        # does not leave behind an empty inner dict.
        paper_id_score_matrix.setdefault(first_id, {})[second_id] = float(record[2])

## 4. Get papers, map paper ID to submission note

In [None]:
# Get all submissions
# Notes are looked up through the venue's '<venue_id>/-/Submission' invitation.
subs = client_v2.get_all_notes(invitation=f'{venue_id}/-/Submission')

In [None]:
# Index the submission notes by note ID for direct lookup in later steps.
paper_id_map = {submission.id: submission for submission in subs}

## 5. Map papers to author profiles

### 5a. Get all authors, convert to profiles

In [None]:
# Distinct author IDs listed across all submissions.
all_authors = set()

for submission in subs:
    all_authors.update(paper_id_map[submission.id].content['authorids']['value'])

In [None]:
# Number of distinct author IDs collected from the submissions.
len(all_authors)

In [None]:
# Look up the profiles for the collected author IDs.
all_author_profiles = openreview.tools.get_profiles(client_v2, all_authors)

### 5b. Map ALL usernames in ALL author profiles to their profile.id
To normalize author IDs to profile IDs

All IDs should belong to 1 profile.

- 1 ID may map to multiple profiles -- Merge.
- 1 ID can appear in 1 profile multiple times -- Ignore.

In [None]:
# username -> profile.id, for every username found in every author profile.
username_to_id = {}
# Usernames seen more than once while building the map (each listed once).
flagged_profiles = []
# Flagged usernames that really resolve to more than one profile.
merge_profiles = []

# Tracks what has been flagged so a username repeated several times is only
# re-checked (one API call each) and reported once.
already_flagged = set()

for profile in all_author_profiles:
    for name_entry in profile.content['names']:
        name = name_entry.get('username', '')
        if not name:
            # Name entries without a username cannot be used as a key.
            continue
        if name in username_to_id and name not in already_flagged:
            already_flagged.add(name)
            flagged_profiles.append(name)
        username_to_id[name] = profile.id

if flagged_profiles:
    print('Profile anomalies found. Checking...')
    for flagged_id in flagged_profiles:
        # A repeat inside one profile is harmless; only a username that
        # comes back with several profiles needs a merge.
        profiles = openreview.tools.get_profiles(client_v2, [flagged_id])
        if len(profiles) > 1:
            merge_profiles.append(flagged_id)

if merge_profiles:
    print(f"Merge the following profiles before continuing, then re-run script: {merge_profiles}")
else:
    print('Profiles look good, safe to continue.')

### 5c. Map paper IDs to author profile IDs
If an author ID is not in the map, its profile is probably blocked or deleted, so we keep the original author ID from the submission.

In [None]:
# Paper ID -> list of normalized author IDs, in submission order.
paper_authorids_map = {}

for submission in subs:
    listed_authors = paper_id_map[submission.id].content['authorids']['value']

    # Use the profile ID when the author ID is a known username; otherwise
    # keep the ID exactly as it appears on the paper.
    paper_authorids_map.setdefault(submission.id, []).extend(
        username_to_id.get(author_id, author_id) for author_id in listed_authors
    )

## 6. Find overlapping authors

In [None]:
# (paper_id_1, paper_id_2) -> set of author IDs the two papers share.
# Only pairs with at least one shared author are stored.
overlapping_author_map = {}

# Build each paper's author set once, instead of rebuilding the second
# paper's set for every scored pair.
author_sets = {
    paper_id: set(author_ids)
    for paper_id, author_ids in paper_authorids_map.items()
}

for paper_id_1, paper_id_2_map in paper_id_score_matrix.items():
    paper_id_1_author_set = author_sets[paper_id_1]

    for paper_id_2 in paper_id_2_map:
        overlapping_authors = paper_id_1_author_set & author_sets[paper_id_2]

        if overlapping_authors:
            # Key is the ordered pair of paper IDs as it appears in the score matrix
            overlapping_author_map[(paper_id_1, paper_id_2)] = overlapping_authors

In [None]:
# Number of paper pairs that share at least one author
len(overlapping_author_map)

## 7. Create final CSV file

### 7a. Set up column names

In [None]:
# Column-name prefixes for the first and second paper of each pair; both come
# from the same venue, so they differ only by the -1 / -2 suffix.
col1_name = f'{venue_name}-1'
col2_name = f'{venue_name}-2'

### 7b. For overlapping authors
Change file name if needed

In [None]:
new_file_name_overlap = f'{path_to_local_scores_dir}/{venue_name}-Paper-Similarity-Sparse-Overlap.csv'

# Write one row per paper pair that shares at least one author.
# The encoding is explicit so non-ASCII titles/abstracts are written the same
# way on every platform and match what pd.read_csv expects by default when
# this file is read back.
with open(new_file_name_overlap, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerow([f'{col1_name} id', f'{col2_name} id', 'Score', 'Matching Authors', f'{col1_name} authors', f'{col2_name} authors', f'{col1_name} title', f'{col2_name} title', f'{col1_name} abstract', f'{col2_name} abstract'])

    for paper_pair, shared_authors in overlapping_author_map.items():
        paper_id_1, paper_id_2 = paper_pair
        score = paper_id_score_matrix[paper_id_1][paper_id_2]

        paper_title_1 = paper_id_map[paper_id_1].content['title']['value']
        paper_title_2 = paper_id_map[paper_id_2].content['title']['value']

        # Newlines are written as a literal "\n" so each abstract stays on one line.
        paper_abstract_1 = paper_id_map[paper_id_1].content['abstract']['value'].replace("\n", "\\n")
        paper_abstract_2 = paper_id_map[paper_id_2].content['abstract']['value'].replace("\n", "\\n")

        paper_authors_1 = '|'.join(paper_authorids_map[paper_id_1])
        paper_authors_2 = '|'.join(paper_authorids_map[paper_id_2])

        # Sorted because set iteration order is not stable between runs.
        paper_author_overlap_str = '|'.join(sorted(shared_authors))

        row = [paper_id_1, paper_id_2, score, paper_author_overlap_str, paper_authors_1, paper_authors_2, paper_title_1, paper_title_2, paper_abstract_1, paper_abstract_2]

        csvwriter.writerow(row)

### 7c. For non-overlapping authors
Change file name if needed

Creates a VERY large file (~70GB for a large venue), because it contains every scored paper pair, including pairs that do have overlapping authors.

Can take ~15min

In [None]:
new_file_name_no_overlap = f'{path_to_local_scores_dir}/{venue_name}-Paper-Similarity-Sparse-No-Overlap.csv'

# Write one row for every scored paper pair, with or without shared authors.
# The encoding is explicit so non-ASCII titles/abstracts are written the same
# way on every platform instead of depending on the locale default.
with open(new_file_name_no_overlap, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerow([f'{col1_name} id', f'{col2_name} id', 'Score', 'Matching Authors (if any)', f'{col1_name} authors', f'{col2_name} authors', f'{col1_name} title', f'{col2_name} title', f'{col1_name} abstract', f'{col2_name} abstract'])

    for paper_id_1, paper_id_2_map in paper_id_score_matrix.items():
        # Fields of the first paper are the same for the whole inner loop.
        paper_title_1 = paper_id_map[paper_id_1].content['title']['value']
        # Newlines are written as a literal "\n" so each abstract stays on one line.
        paper_abstract_1 = paper_id_map[paper_id_1].content['abstract']['value'].replace("\n", "\\n")
        paper_authors_1 = '|'.join(paper_authorids_map[paper_id_1])

        for paper_id_2, score in paper_id_2_map.items():
            paper_title_2 = paper_id_map[paper_id_2].content['title']['value']
            paper_abstract_2 = paper_id_map[paper_id_2].content['abstract']['value'].replace("\n", "\\n")
            paper_authors_2 = '|'.join(paper_authorids_map[paper_id_2])

            # Include overlapping authors if they exist, otherwise write "None".
            # Sorted because set iteration order is not stable between runs.
            paper_author_overlap = overlapping_author_map.get((paper_id_1, paper_id_2), [])
            paper_author_overlap_str = '|'.join(sorted(paper_author_overlap)) if paper_author_overlap else 'None'

            row = [paper_id_1, paper_id_2, score, paper_author_overlap_str, paper_authors_1, paper_authors_2, paper_title_1, paper_title_2, paper_abstract_1, paper_abstract_2]

            csvwriter.writerow(row)

## 8. Remove dupes and sort
The original score file contains:
- Papers matched with themselves: (ID1, ID1, 1.0)
- Mirrored paper pairs: (ID1, ID2, 0.45) and (ID2, ID1, 0.45)
    - Same two papers in swapped order, same score

In [None]:
# Clean the overlap CSV in place: drop self-matches and mirrored duplicates,
# then order the remaining pairs from most to least similar.
file_path = new_file_name_overlap
id_col_1 = f"{col1_name} id"
id_col_2 = f"{col2_name} id"

# Read CSV
df = pd.read_csv(file_path)

# 1. Remove self-matches (ID1 == ID2)
df = df[df[id_col_1] != df[id_col_2]].copy()

# 2. Create a normalized pair (order-independent identifier).
#    Built with zip rather than df.apply(axis=1): on an empty frame apply
#    returns a DataFrame, which cannot be used as a single column of keys.
pair_keys = pd.Series(
    [tuple(sorted(ids)) for ids in zip(df[id_col_1], df[id_col_2])],
    index=df.index,
    dtype=object,
)

# 3. Remove mirrored duplicates (A,B) vs (B,A), keeping the first
df = df[~pair_keys.duplicated(keep="first")]

# 4. Sort by score
df_sorted = df.sort_values(by="Score", ascending=False)

# 5. Save (overwrites the file that was read)
df_sorted.to_csv(file_path, index=False)
