In [1]:
import pandas as pd

# Load the four input tables (papers and abstracts, for authors and for
# co-authors) in a fixed order and bind each one to its own name.
(
    authors_papers_df,
    authors_abstracts_df,
    coauthors_papers_df,
    coauthors_abstracts_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "files/ic2s2_papers.csv",
        "files/ic2s2_abstracts.csv",
        "files/ic2s2_coauthors_papers.csv",
        "files/ic2s2_coauthors_abstracts.csv",
    )
)

In [3]:
# Report the (rows, columns) shape of every loaded table, one line per table.
for label, frame in (
    ("Filtered authors papers:", authors_papers_df),
    ("authors abstracts:", authors_abstracts_df),
    ("Filtered co-authors papers:", coauthors_papers_df),
    ("Co-authors abstracts:", coauthors_abstracts_df),
):
    print(label, frame.shape)

Filtered authors papers: (831, 4)
authors abstracts: (831, 3)
Filtered co-authors papers: (35160, 4)
Co-authors abstracts: (35160, 3)


In [4]:
# (A) Combine authors: stack the IC2S2 authors and co-authors tables, then keep
#     only the first row encountered for each "ID" (authors_df rows come first).
authors_df = pd.read_csv("files/researchers_data_2024.csv")
coauthors_df = pd.read_csv("files/ic2s2_coauthors.csv")

combined_authors_df = pd.concat(
    [authors_df, coauthors_df], ignore_index=True
).drop_duplicates(subset=["ID"])
combined_authors_df.to_csv("files/authors_combined.csv", index=False)
print("Combined authors dataset shape:", combined_authors_df.shape)

Combined authors dataset shape: (12481, 6)


In [5]:
import ast

# (B) Combine papers: concatenate IC2S2 papers and co-authors papers, drop duplicate works,
#     and remove papers with only one author.
# Note: if the "author_ids" column is stored as a string (e.g., the repr of a list), convert it back to a list.
def parse_author_ids(author_ids):
    """Normalise one "author_ids" cell into a list of author IDs.

    Parameters
    ----------
    author_ids : object
        The raw cell value. Typically either a string holding the repr of a
        list (as produced when a list column is written to CSV) or an
        already-parsed list.

    Returns
    -------
    list or object
        * For a string: the parsed list; a tuple or set literal is converted
          to a list. A string that cannot be parsed, or that parses to
          something that is not a list/tuple/set (e.g. ``"123"``), yields ``[]``.
        * For a missing value (``None`` or a float NaN): ``[]``.
        * Any other non-string value is returned unchanged.

    The function never raises for malformed strings; such cells are treated
    as having no authors so that callers can safely take ``len()`` of the
    result.
    """
    if isinstance(author_ids, str):
        try:
            parsed = ast.literal_eval(author_ids)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors ast.literal_eval is documented to raise on
            # malformed input; treat the cell as "no authors".
            return []
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, (tuple, set)):
            return list(parsed)
        # A scalar literal (number, bare string, None, ...) is not a
        # collection of IDs and has no meaningful length for callers.
        return []

    # Empty CSV cells are read by pandas as float NaN; NaN is the only float
    # that is not equal to itself, so this check needs no extra import.
    if author_ids is None or (isinstance(author_ids, float) and author_ids != author_ids):
        return []

    return author_ids

# Put each papers table's "author_ids" column into list form: when the column
# has object dtype, every cell is passed through parse_author_ids.
for df in (authors_papers_df, coauthors_papers_df):
    ids_column = df["author_ids"]
    if ids_column.dtype == object:
        df["author_ids"] = ids_column.apply(parse_author_ids)

In [6]:
# Stack both papers tables and keep the first row seen for each work "id".
papers_stacked = pd.concat([authors_papers_df, coauthors_papers_df], ignore_index=True)
papers_unique = papers_stacked.drop_duplicates(subset=["id"])
# Keep only works whose author_ids holds more than one entry.
has_multiple_authors = papers_unique["author_ids"].apply(len) > 1
combined_papers_df = papers_unique[has_multiple_authors]
combined_papers_df.to_csv("files/papers_combined.csv", index=False)
print("Combined papers dataset shape:", combined_papers_df.shape)

Combined papers dataset shape: (5443, 4)


In [9]:
# (C) Combine abstracts: stack both abstracts tables, show the stacked shape,
#     then keep the first row seen for each "id" and save the result.
abstracts_stacked = pd.concat(
    [authors_abstracts_df, coauthors_abstracts_df], ignore_index=True
)
print(abstracts_stacked.shape)
combined_abstracts_df = abstracts_stacked.drop_duplicates(subset=["id"])
combined_abstracts_df.to_csv("files/abstracts_combined.csv", index=False)
print("Combined abstracts dataset shape:", combined_abstracts_df.shape)

(35991, 3)
Combined abstracts dataset shape: (5443, 3)
