In [None]:
import re
import time
import os
# Location of the raw Cora dump. Built with os.path.join so the same cell
# resolves on Windows (where it yields the original backslash path) and on
# POSIX systems (where a literal backslash is not a directory separator).
filepath = os.path.join("datasets", "cora", "cora.txt")

# Read the whole file into memory as a single string.
with open(filepath) as f:
    papers = f.read()

# Alias kept so the parsing code below can refer to the text as `raw_text`.
raw_text = papers

# Split the text into records, one per line, discarding blank lines.
rows = [line for line in raw_text.split("\n") if line.strip()]

# One list of author-name strings per parsed record, in file order.
author_lists = []

for row in rows:
    parts = row.split("\t")
    if len(parts) < 3:
        # Fewer than three tab-separated columns: nothing to read at index 2.
        continue

    # The third column is treated as the raw author string.
    raw_authors = parts[2]

    # Normalise separators: "&" and a standalone "and" both become commas.
    cleaned = raw_authors.replace("&", ",")
    cleaned = re.sub(r"\band\b", ",", cleaned, flags=re.I)
    # Every period is removed (not only trailing ones), so "a. blum" -> "a blum".
    cleaned = cleaned.replace(".", "")
    # Collapse any run of whitespace to one space; a single
    # replace("  ", " ") pass would leave runs of three or more spaces behind.
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Split into individual authors on commas, dropping empty fragments.
    authors = [a.strip() for a in cleaned.split(",") if a.strip()]

    author_lists.append(authors)

# Print result, numbering papers from 1.
for i, authors in enumerate(author_lists, 1):
    print(f"Paper {i}: {authors}")



In [None]:
# Location of the modified Cora dump. os.path.join produces the original
# backslash-separated path on Windows and a valid path on POSIX systems,
# where a literal backslash would be taken as part of the file name.
filepath = os.path.join("datasets", "cora", "cora_modified.txt")

# Read the whole file into memory as a single string.
with open(filepath) as f:
    papers = f.read()

# Keep the text under `raw_text`; `papers` is rebound to a dict further down.
raw_text = papers

def is_abbreviated_author(author):
    """
    Tell whether an author string starts with initials rather than a full
    first name, e.g. "a. blum", "m. furst", "r. e. schapire".

    The check is done on the stripped string. It accepts one letter, an
    optional period, an optional second letter, an optional period, then
    whitespace, a letter, and at least one more character. Because the
    periods are optional, a two-letter first word such as "al gore" also
    counts as abbreviated.

    Returns True on a match, False otherwise.
    """
    candidate = author.strip()
    initials_then_surname = r"^[a-zA-Z]\.?[a-zA-Z]?\.?\s+[a-zA-Z].+"
    return re.match(initials_then_surname, candidate) is not None

def extract_authors(raw_authors):
    """
    Split a raw author string into a list of individual author names.

    "&" and a standalone, case-insensitive "and" are treated as commas.
    Every period is followed by a space and whitespace runs are collapsed,
    so "r.e. schapire" becomes "r. e. schapire". Each comma-separated chunk
    is then trimmed of surrounding spaces and periods.

    Chunks that are empty after trimming (for example a stray ".") are
    dropped, so the returned list never contains empty names.

    Parameters:
        raw_authors: the author field of one record, as a string.

    Returns:
        A list of cleaned author-name strings, in their original order.
    """
    # Normalize separators
    txt = raw_authors.replace("&", ",")
    txt = re.sub(r"\band\b", ",", txt, flags=re.I)

    # Put a space after every period, then collapse whitespace runs
    txt = txt.replace(".", ". ")
    txt = re.sub(r"\s+", " ", txt)

    # Split authors. Stripping spaces and periods together handles chunks
    # like "freund ." that would otherwise keep a trailing space, and the
    # emptiness check runs after cleaning so a chunk of only periods is dropped.
    parts = []
    for chunk in txt.split(","):
        name = chunk.strip(" .")
        if name:
            parts.append(name)
    return parts

# Rebinds `papers` (previously the raw file text, still available as
# `raw_text`) to a dict: citation key -> {'abbrev': [...] or None, 'full': [...] or None}
papers = {}

for line in raw_text.splitlines():
    if not line.strip():
        continue
    parts = line.split("\t")
    if len(parts) < 3:
        # Fewer than three tab-separated columns: no author field to read.
        continue

    key = parts[1].strip().strip('"')   # citation key, surrounding quotes removed
    authors_raw = parts[2]

    authors = extract_authors(authors_raw)
    if not authors:
        # all() over an empty list is True, so a record without any author
        # would be stored as the preferred "abbrev" version and could
        # replace a real author list. Skip such records instead.
        continue

    # A record counts as abbreviated only when every author looks abbreviated.
    abbreviated = all(is_abbreviated_author(a) for a in authors)

    # Store one candidate of each kind per citation key.
    if key not in papers:
        papers[key] = {"abbrev": None, "full": None}

    if abbreviated:
        # Later abbreviated records overwrite earlier ones for the same key.
        papers[key]["abbrev"] = authors
    elif papers[key]["full"] is None:
        # Only the first non-abbreviated record per key is kept.
        papers[key]["full"] = authors

# Pick one author list per citation key: the abbreviated version when one
# was stored, otherwise the full version; keys with neither are left out.
final_author_lists = []

for key, versions in papers.items():
    preferred = versions["abbrev"]
    if preferred is None:
        preferred = versions["full"]
    if preferred is not None:
        final_author_lists.append(preferred)

# Print every selected list (numbered from 1) and gather the distinct names.
all_authors = set()
for i, authors in enumerate(final_author_lists, 1):
    print(f"Paper {i}: {authors}")
    all_authors |= set(authors)

In [None]:
# Show the distinct author names in sorted order, then how many there are,
# each on its own line.
print(sorted(all_authors), len(all_authors), sep="\n")

# Hand-maintained table mapping a name variant (misspelling, missing
# initial, spelled-out first name, truncated fragment) to the spelling it
# is replaced with. Entries are listed in their original insertion order.
duplicate_conversion = {
    'schapire': 'r. e. schapire',
    'r. schapire': 'r. e. schapire',
    'robert e. schapire': 'r. e. schapire',
    'shapire': 'r. e. schapire',
    'r. e': 'r. e. schapire',
    'r. e.': 'r. e. schapire',
    'r': 'r. e. schapire',
    'haussler': 'd. haussler',
    'd. helmbold': 'd. p. helmbold',
    'helmbold': 'd. p. helmbold',
    'd. p': 'd. p. helmbold',
    'd': 'd. haussler',
    'a. ng': 'a. y. ng',
    'eric bauer': 'e. bauer',
    'freund': 'y. freund',
    'yoav freund': 'y. freund',
    'h. kautz': 'h. a. kautz',
    'druker': 'drucker',
    'h. s seung': 'h. s. seung',
    'h. sebastian seung': 'h. s. seung',
    'h. seung': 'h. s. seung',
    'l. valiant': 'l. g. valiant',
    'l. sellie': 'l. m. sellie',
    'littlestone': 'n. littlestone',
    'm. kearns': 'm. j. kearns',
    'michael kearns': 'm. j. kearns',
    'michael j. kearns': 'm. j. kearns',
    'm. warmuth': 'm. k. warmuth',
    'warmuth': 'm. k. warmuth',
    'm. k': 'm. k. warmuth',
    'h. druker': 'h. drucker',
    'peter bartlett': 'p. bartlett',
    'r. e. shapire': 'r. e. schapire',
    'r. rivest': 'r. l. rivest',
    'ronald l. rivest': 'r. l. rivest',
    's. goldman': 's. a. goldman',
    'sally a. goldman': 's. a. goldman',
    'ron kohavi wolpert': 'r. k. wolpert',
    'satinder singh': 's. singh',
    'singer': 'y. singer',
    'w. lee': 'w. s. lee',
    'wee sun lee': 'w. s. lee',
    'yoram singer': 'y. singer',
    'm. kearns. l. g. valiant': 'm. j. kearns',
}


# Name fragments with no entry in the table above; code that consults this
# set drops them instead of keeping them as authors.
unknown = set(['kearns', 'm', 'n', 'y'])



# Rewrite every distinct author name: names found in the conversion table
# are replaced by their mapped spelling (and reported), names listed as
# unknown are dropped, and everything else is kept unchanged.
authors_corrected = []
for author in all_authors:
    if author in duplicate_conversion:
        print("True!!!", author)
        corrected = duplicate_conversion[author]
        authors_corrected.append(corrected)
        continue
    if author not in unknown:
        authors_corrected.append(author)

# Several variants can map to the same spelling, so deduplicate before
# sorting and joining one name per line.
authors_corrected_text = "\n".join(sorted({*authors_corrected}))

print(authors_corrected_text)

# Write the corrected author list to a timestamped file under datasets/temp.
timestamp = time.strftime("%Y%m%d-%H%M%S")
# os.path.join keeps the original backslash path on Windows. With a literal
# backslash path on POSIX, os.path.dirname() returns "" and os.makedirs("")
# raises, so the path is assembled from its components instead.
output = os.path.join("datasets", "temp", f"authors_{timestamp}.txt")
os.makedirs(os.path.dirname(output), exist_ok=True)
with open(output, "w", encoding="utf-8") as f:
    f.write(authors_corrected_text)

In [None]:
# Plain-list copy of the corrected author names.
authors_corrected_list = [*authors_corrected]

# For every paper, pair each author with every author listed after them,
# keeping the order in which the names appear in the paper.
connections = []
for authors in final_author_lists:
    for j, earlier in enumerate(authors):
        connections.extend((earlier, later) for later in authors[j + 1:])

print(connections)
# Canonicalise both ends of every raw pair and drop pairs that cannot be used.
connections_corrected = []
for a, b in connections:
    if a == b:
        # Same raw name on both ends: not a connection.
        continue
    if a in duplicate_conversion:
        a = duplicate_conversion[a]
    if b in duplicate_conversion:
        b = duplicate_conversion[b]
    # skip if either is unknown
    if a in unknown or b in unknown:
        continue
    connections_corrected.append((a, b))

# Remove duplicates. Pairs are compared as ordered tuples, so (x, y) and
# (y, x) are kept as two separate entries.
unique_pairs = set(connections_corrected)

# Two different raw names can map to the same spelling, which produces a
# self loop only after conversion. Build the final list from the
# deduplicated snapshot instead of calling .remove() on the list being
# iterated, which skips the element after each removal.
connections_corrected = []
for a, b in unique_pairs:
    if a == b:
        print("self loop detected:", a, b)
        continue
    connections_corrected.append((a, b))

# Serialise each pair as "(a,b)", sort the lines, and write them to a
# timestamped file under datasets/temp.
connection_lines = [f"({a},{b})" for a, b in connections_corrected]
connections_text = "\n".join(sorted(connection_lines))
timestamp = time.strftime("%Y%m%d-%H%M%S")
# os.path.join keeps the original backslash path on Windows. With a literal
# backslash path on POSIX, os.path.dirname() returns "" and os.makedirs("")
# raises, so the path is assembled from its components instead.
output = os.path.join("datasets", "temp", f"author_connections_{timestamp}.txt")
os.makedirs(os.path.dirname(output), exist_ok=True)
with open(output, "w", encoding="utf-8") as f:
    f.write(connections_text)