In [None]:
import os
import pandas as pd

from dotenv import dotenv_values
from tqdm.notebook import tqdm

from scival_search import RelatedPapers

# Session cookie is read from the local .env file; a missing COOKIE key
# raises KeyError here, before any request is attempted.
env_vars = dotenv_values()
COOKIE = env_vars['COOKIE']

# Topic ids are read as strings so that leading zeros / long ids are not
# mangled by numeric parsing.
df = pd.read_csv("lookups.csv", dtype={"topic": str})
# dropna() discards rows with an empty topic cell (they stay NaN even with
# dtype=str); unique() de-duplicates while keeping the order of first
# appearance, so the processing order is the same on every run.
topics = df['topic'].dropna().unique().tolist()

In [None]:
# Output file
output_file = "related_papers.csv"

# Check which topics have already been processed.
# A zero-byte output file is treated like a missing one: pd.read_csv would
# raise EmptyDataError on it, so it is recreated with headers instead.
processed_topics = set()
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    existing_df = pd.read_csv(output_file, dtype={"Topic": str})
    # dropna() keeps rows with an empty Topic cell from being counted as a
    # processed topic.
    processed_topics = set(existing_df['Topic'].dropna().unique())
    print(f"Found {len(processed_topics)} already processed topics")
else:
    # Create empty CSV with headers
    pd.DataFrame(columns=["EID", "Year", "Topic"]).to_csv(output_file, index=False)
    print("Created new output file")

# Filter out already processed topics (order of `topics` is preserved)
topics_to_process = [t for t in topics if t not in processed_topics]
print(f"Total topics: {len(topics)}")
print(f"Already processed: {len(processed_topics)}")
print(f"To process: {len(topics_to_process)}")

In [None]:
# Retrieve related papers for each topic and append incrementally
failed_topics_file = "failed_topics.txt"
# Failures of THIS run only. The log file is opened in append mode and is
# never cleared, so counting its lines would also report failures from
# earlier runs (including topics that have since succeeded).
failed_this_run = []

for topic in tqdm(topics_to_process, desc="Topics"):
    try:
        related_papers = RelatedPapers(topic, cookie=COOKIE, show_progress=False)
        results = related_papers.results
        results["Topic"] = topic

        # Append to CSV immediately (one topic at a time) so that an
        # interrupted run can be resumed from the topics already on disk.
        results[["EID", "Year", "Topic"]].to_csv(
            output_file,
            mode='a',  # Append mode
            header=False,  # Don't write headers again
            index=False
        )

    except Exception as e:
        # Deliberate best-effort: record the failed topic ID and move on to
        # the next topic instead of aborting the whole run.
        failed_this_run.append(topic)
        with open(failed_topics_file, 'a') as f:
            f.write(f"{topic}\n")
        print(f"\nFailed to process topic {topic}: {e}")

print(f"\nCompleted! Results saved to {output_file}")
failed_count = len(failed_this_run)
if failed_count:
    print(f"Failed topics: {failed_count} (saved to {failed_topics_file})")


In [None]:
# TODO: Check for duplicates and missing topics
# Importing pandas here lets this cell run without the first cell having
# been executed.
import pandas as pd

# Expected topics, read as strings to match the "Topic" column below.
df_todo = pd.read_csv("lookups.csv", dtype={"topic": str})
# dropna() keeps empty topic cells (NaN) out of the expected list so they are
# not reported as missing; unique() de-duplicates in order of first
# appearance, giving a stable report order.
topics = df_todo['topic'].dropna().unique().tolist()

# Output file
output_file = "related_papers.csv"
df_processed = pd.read_csv(output_file, dtype={"Topic": str})

In [None]:
# Duplicate check: flag every row whose (EID, Topic) pair occurs more than
# once; keep=False marks all members of a duplicated group, not just the
# repeats.
dup_mask = df_processed.duplicated(subset=['EID', 'Topic'], keep=False)
duplicates = df_processed[dup_mask]
if duplicates.empty:
    print("No duplicate entries found.")
else:
    print(f"Found {len(duplicates)} duplicate entries:")
    print(duplicates)

# Missing-topic check: expected topics that never appear in the output file.
processed_topics = set(df_processed['Topic'].unique())
missing_topics = [
    expected for expected in topics
    if expected not in processed_topics
]
if missing_topics:
    print(f"Missing topics ({len(missing_topics)}): {missing_topics}")