In [16]:
import csv
import random
from faker import Faker
from datetime import datetime, timedelta

In [17]:
fake = Faker()

# Fixed seed so that Restart & Run All regenerates exactly the same CSV files.
# Both generators must be seeded: the standard `random` module and Faker keep
# separate random state.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Configuration: how many of each type of node to create
num_authors = 50
num_papers = 150
num_keywords = 40
num_conferences = 15
num_venues = 15
num_journals = 10
num_volumes = 20


In [18]:
# Canned one-sentence abstracts; one is picked at random for each paper.
topics = [
    "This paper explores the impact of machine learning algorithms on data analysis efficiency.",
    "We present a novel approach for optimizing graph database queries.",
    "This study analyzes the effects of large-scale distributed systems in cloud computing.",
    "In this work, we investigate the security challenges in IoT networks.",
    "This paper proposes a new model for natural language processing tasks.",
    "The research examines the evolution of data privacy regulations worldwide.",
    "An empirical study on the performance of blockchain technologies.",
    "We provide a comparative analysis of various AI optimization techniques.",
    "This study evaluates the scalability of real-time recommendation systems.",
    "A new framework for cybersecurity threat detection is introduced.",
]

# Building blocks for paper titles of the form "<adjective> <noun> for <field>".
adjectives = [
    "Efficient",
    "Scalable",
    "Robust",
    "Secure",
    "Advanced",
    "Distributed",
    "Optimized",
    "Flexible",
]
nouns = [
    "Framework",
    "Model",
    "Approach",
    "Architecture",
    "Method",
    "Algorithm",
    "Technique",
    "System",
]
fields = [
    "Machine Learning",
    "Blockchain",
    "Cybersecurity",
    "Natural Language Processing",
    "Quantum Computing",
    "Data Privacy",
    "Graph Databases",
    "Cloud Computing",
    "Healthcare AI",
    "IoT Networks",
]

# Helper function to generate a unique title
def generate_unique_title(existing_titles):
    """Return a random "<adjective> <noun> for <field>" title that is not yet used.

    The title is assembled from the module-level ``adjectives``, ``nouns`` and
    ``fields`` lists. The returned title is added to ``existing_titles`` so a
    later call with the same set cannot return it again.

    Args:
        existing_titles: set of titles already handed out; mutated in place.

    Returns:
        A title string that was not in ``existing_titles`` before the call.

    Raises:
        ValueError: if every adjective/noun/field combination is already in
            ``existing_titles``. Retrying random draws in that situation
            would loop forever, so the exhaustion is reported instead.
    """
    # Enumerate the unused combinations up front so exhaustion is detectable.
    remaining = [
        candidate
        for candidate in (
            f"{adjective} {noun} for {field}"
            for adjective in adjectives
            for noun in nouns
            for field in fields
        )
        if candidate not in existing_titles
    ]
    if not remaining:
        raise ValueError(
            "All adjective/noun/field title combinations are already in use"
        )
    title = random.choice(remaining)
    existing_titles.add(title)
    return title

# num_papers is deliberately not reassigned here: the configuration cell is its
# single source of truth, and a second assignment would silently undo any
# change made there.

# Titles handed out so far by generate_unique_title (the helper adds to it).
existing_titles = set()

In [20]:
# Create authors: one row per author id with a Faker-generated name.
with open('authors.csv', 'w', newline='') as authors_file:
    author_rows = csv.writer(authors_file)
    author_rows.writerow(['id', 'name'])  # header
    author_rows.writerows(
        [author_id, fake.name()] for author_id in range(1, num_authors + 1)
    )

# Create keywords: one row per keyword id with a Faker-generated word.
with open('keywords.csv', 'w', newline='') as keywords_file:
    keyword_rows = csv.writer(keywords_file)
    keyword_rows.writerow(['id', 'name'])  # header
    keyword_id = 1
    while keyword_id <= num_keywords:
        keyword_rows.writerow([keyword_id, fake.word()])
        keyword_id += 1

# Create papers
# Start from an empty title registry on every run. generate_unique_title adds
# each title it returns to this set, so carrying the set over between re-runs
# of this cell would shrink the pool of available titles each time.
# clear() is used instead of rebinding so the set object itself stays the same.
existing_titles.clear()
with open('papers.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'title', 'abstract', 'doi'])  # header
    for i in range(1, num_papers + 1):
        title = generate_unique_title(existing_titles)
        abstract = random.choice(topics)
        doi = fake.uuid4()  # a Faker uuid4 value fills the doi column
        writer.writerow([i, title, abstract, doi])

# Create proceeding editions: the edition number equals the id, and the end
# date is the start date plus three days.
with open('proceedings.csv', 'w', newline='') as proceedings_file:
    proceeding_rows = csv.writer(proceedings_file)
    proceeding_rows.writerow(['id', 'number', 'start_date', 'end_date'])  # header
    for edition in range(1, num_conferences + 1):
        opening_day = fake.date_between(start_date='-5y', end_date='today')
        closing_day = opening_day + timedelta(days=3)
        proceeding_rows.writerow([edition, edition, opening_day, closing_day])

# Create venues: each venue is named after a Faker-generated city.
with open('venues.csv', 'w', newline='') as venues_file:
    venue_rows = csv.writer(venues_file)
    venue_rows.writerow(['id', 'venue_name'])  # header
    venue_rows.writerows(
        [venue_id, fake.city()] for venue_id in range(1, num_venues + 1)
    )

# Create journals: each journal is named with a Faker-generated company name.
with open('journals.csv', 'w', newline='') as journals_file:
    journal_rows = csv.writer(journals_file)
    journal_rows.writerow(['id', 'journal_name'])  # header
    for journal_id in range(1, num_journals + 1):
        journal_name = fake.company()
        journal_rows.writerow([journal_id, journal_name])

# Create journal volumes: random volume (1-100), year (2018-2024) and issue (1-4).
with open('journal_volumes.csv', 'w', newline='') as volumes_file:
    volume_rows = csv.writer(volumes_file)
    volume_rows.writerow(['id', 'volume', 'year', 'issue'])  # header
    for volume_id in range(1, num_volumes + 1):
        # Drawn in column order so the sequence of random calls is unchanged.
        volume_number = random.randint(1, 100)
        publication_year = random.randint(2018, 2024)
        issue_number = random.randint(1, 4)
        volume_rows.writerow([volume_id, volume_number, publication_year, issue_number])

# Create citation relationships between papers
with open('citations.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['source_paper', 'target_paper'])  # header
    # Number of distinct citation edges to write. Capped at the number of
    # ordered pairs of different papers so the loop below always terminates.
    num_citations = min(300, num_papers * (num_papers - 1))
    seen_citations = set()
    while len(seen_citations) < num_citations:
        src = random.randint(1, num_papers)
        tgt = random.randint(1, num_papers)
        # Redraw self-citations and repeated edges instead of dropping the
        # row, so the file contains exactly num_citations unique edges.
        if src == tgt or (src, tgt) in seen_citations:
            continue
        seen_citations.add((src, tgt))
        writer.writerow([src, tgt])

# Create WROTE relationships (author wrote paper)
with open('wrote.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['author_id', 'paper_id', 'corresponding'])
    # Sample distinct (author, paper) pairs. Independent draws would repeat
    # pairs, and a repeated pair could carry conflicting 'corresponding' flags.
    total_pairs = num_authors * num_papers
    num_wrote = min(200, total_pairs)  # cannot ask for more pairs than exist
    for pair_index in random.sample(range(total_pairs), k=num_wrote):
        # Decode the flat index into 0-based (author, paper) offsets.
        author_offset, paper_offset = divmod(pair_index, num_papers)
        writer.writerow([
            author_offset + 1,
            paper_offset + 1,
            random.choice([True, False])
        ])

# Create HAS_KEYWORD relationships (paper has keyword)
with open('has_keyword.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['paper_id', 'keyword_id'])
    # Sample distinct (paper, keyword) pairs so no edge is written twice.
    total_pairs = num_papers * num_keywords
    num_has_keyword = min(300, total_pairs)  # cannot exceed the pairs that exist
    for pair_index in random.sample(range(total_pairs), k=num_has_keyword):
        # Decode the flat index into 0-based (paper, keyword) offsets.
        paper_offset, keyword_offset = divmod(pair_index, num_keywords)
        writer.writerow([
            paper_offset + 1,
            keyword_offset + 1
        ])

# Create PUBLISHED_IN relationships (paper published in proceeding or journal_volume)
with open('published_in.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['paper_id', 'proceeding_id', 'journal_volume_id', 'date_accepted', 'pages'])
    # Draw paper ids without replacement so a paper gets at most one
    # publication row; independent draws would publish some papers repeatedly.
    # The row count is capped at num_papers.
    published_paper_ids = random.sample(range(1, num_papers + 1), k=min(150, num_papers))
    for paper_id in published_paper_ids:
        # Exactly one of proceeding_id / journal_volume_id is filled per row;
        # the other column is left empty.
        if random.random() < 0.5:
            proceeding_id = random.randint(1, num_conferences)
            journal_volume_id = ''
        else:
            proceeding_id = ''
            journal_volume_id = random.randint(1, num_volumes)
        date_accepted = fake.date_between(start_date='-3y', end_date='today')
        # First page in 1-10 and last page in 11-20, so start < end always.
        pages = f"{random.randint(1, 10)}-{random.randint(11, 20)}"
        writer.writerow([paper_id, proceeding_id, journal_volume_id, date_accepted, pages])

# Create HELD_IN relationships: every proceeding edition gets one random venue.
with open('held_in.csv', 'w', newline='') as held_in_file:
    held_in_rows = csv.writer(held_in_file)
    held_in_rows.writerow(['proceeding_id', 'venue_id'])
    held_in_rows.writerows(
        [proceeding_id, random.randint(1, num_venues)]
        for proceeding_id in range(1, num_conferences + 1)
    )

# Create _RELATED relationships: one related event (random type, Faker-generated
# title-cased name) per proceeding edition.
with open('related.csv', 'w', newline='') as related_file:
    related_rows = csv.writer(related_file)
    related_rows.writerow(['proceeding_id', 'type', 'name'])
    for proceeding_id in range(1, num_conferences + 1):
        # Type is drawn before the name, keeping the random call order.
        event_type = random.choice(['Workshop', 'Symposium', 'Seminar'])
        event_name = fake.bs().title()
        related_rows.writerow([proceeding_id, event_type, event_name])
