# 0. Libraries

In [17]:
import csv
import random
from faker import Faker
from datetime import datetime, timedelta
import os

In [18]:
directory = "data_lab1"

# Make sure the output folder is in place before any CSV file is written.
if os.path.exists(directory):
    print(f"Directory '{directory}' already exists")
else:
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully")

Directory 'data_lab1' created successfully


# 1. Adjustable parameters

In [19]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same CSV files.
# Both generators are seeded: the stdlib `random` module (ids, choices) and
# Faker (names, companies, dates, uuids). Change SEED to get a different dataset.
SEED = 42
random.seed(SEED)

fake = Faker()
Faker.seed(SEED)

# Configuration: how many of each type of node to create
num_authors = 50
num_papers = 150
num_keywords = 40
num_proceedings = 15
num_proceeding_editions = 20
num_venues = 15
num_journals = 10
num_volumes = 20


Below we define the sample sentences (used as paper abstracts) and the word lists (used to build paper titles) from which the fake data is generated.

In [20]:
# Pool of sample sentences. The paper-generation code picks one of these at
# random (random.choice) and stores it in the `abstract` column of papers.csv,
# so several papers can end up sharing the same abstract.
topics = [
    "This paper explores the impact of machine learning algorithms on data analysis efficiency.",
    "We present a novel approach for optimizing graph database queries.",
    "This study analyzes the effects of large-scale distributed systems in cloud computing.",
    "In this work, we investigate the security challenges in IoT networks.",
    "This paper proposes a new model for natural language processing tasks.",
    "The research examines the evolution of data privacy regulations worldwide.",
    "An empirical study on the performance of blockchain technologies.",
    "We provide a comparative analysis of various AI optimization techniques.",
    "This study evaluates the scalability of real-time recommendation systems.",
    "A new framework for cybersecurity threat detection is introduced."
]

# Components to generate unique titles.
# A title has the shape "<adjective> <noun> for <field>", so the number of
# distinct titles is bounded by the product of the three list lengths.
adjectives = ["Efficient", "Scalable", "Robust", "Secure", "Advanced", "Distributed", "Optimized", "Flexible"]
nouns = ["Framework", "Model", "Approach", "Architecture", "Method", "Algorithm", "Technique", "System"]
fields = [
    "Machine Learning",
    "Blockchain",
    "Cybersecurity",
    "Natural Language Processing",
    "Quantum Computing",
    "Data Privacy",
    "Graph Databases",
    "Cloud Computing",
    "Healthcare AI",
    "IoT Networks",
]


# Helper function to generate a unique title
def generate_unique_title(existing_titles):
    """Return a random title that is not yet in ``existing_titles``.

    The title is built as ``"<adjective> <noun> for <field>"`` from the
    module-level ``adjectives``, ``nouns`` and ``fields`` lists.

    Args:
        existing_titles: mutable set of titles already handed out. The newly
            generated title is added to it before returning.

    Returns:
        A title string that was not in ``existing_titles`` before the call.

    Raises:
        ValueError: if every possible combination is already in
            ``existing_titles``. Without this check the retry loop below
            would spin forever.
    """
    all_titles = {
        f"{adjective} {noun} for {field}"
        for adjective in adjectives
        for noun in nouns
        for field in fields
    }
    if all_titles <= existing_titles:
        raise ValueError(
            f"All {len(all_titles)} possible titles are already in use; "
            "extend adjectives/nouns/fields or request fewer titles"
        )

    # Rejection sampling: redraw until an unused combination comes up.
    while True:
        title = f"{random.choice(adjectives)} {random.choice(nouns)} for {random.choice(fields)}"
        if title not in existing_titles:
            existing_titles.add(title)
            return title

# Titles handed out so far; passed to generate_unique_title so that every
# generated paper title is distinct.
# (num_papers is intentionally NOT re-assigned here: it is set once in the
# "Adjustable parameters" cell, and a second assignment would silently
# override whatever value was configured there.)
existing_titles = set()

# 2. Creating the data (the `.csv`)

## 2.1. Nodes

In [21]:
# Create authors: one row per author with a sequential id (1..num_authors)
# and a fake full name. The file is written under `directory`, the folder
# prepared at the top of the notebook, rather than a repeated hard-coded name.
with open(os.path.join(directory, 'authors.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'name'])  # header
    writer.writerows(
        [author_id, fake.name()] for author_id in range(1, num_authors + 1)
    )

# Create keywords: one row per keyword with a sequential id (1..num_keywords)
# and a fake word as its name. Written under `directory` so the output folder
# is configured in a single place.
# NOTE(review): fake.word() is drawn independently for each row, so two ids may
# receive the same word - confirm whether keyword names need to be unique.
with open(os.path.join(directory, 'keywords.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'name'])  # header
    writer.writerows(
        [keyword_id, fake.word()] for keyword_id in range(1, num_keywords + 1)
    )

# Create papers: sequential id, a unique generated title, an abstract picked
# from `topics`, and a random UUID stored in the `doi` column.
# Start from an empty title registry so that re-running this cell always
# produces the same number of titles from the full pool, instead of
# accumulating titles from earlier runs until the pool is exhausted.
existing_titles.clear()
with open(os.path.join(directory, 'papers.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'title', 'abstract', 'doi'])  # header
    for paper_id in range(1, num_papers + 1):
        writer.writerow([
            paper_id,
            generate_unique_title(existing_titles),
            random.choice(topics),
            fake.uuid4(),
        ])

# Create proceedings: sequential id (1..num_proceedings), a fake company name
# used as the proceeding name, and a type chosen at random between
# 'Conference' and 'Workshop'. Written under `directory` so the output folder
# is configured in a single place.
proceeding_types = ['Conference', 'Workshop']
with open(os.path.join(directory, 'proceedings.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'proceeding_name', 'proceeding_type'])  # header
    for proceeding_id in range(1, num_proceedings + 1):
        writer.writerow([proceeding_id, fake.company(), random.choice(proceeding_types)])

# Create proceeding editions (subnodes of proceedings) WITH PARENT REFERENCE

# Read the proceedings file back first (id -> name), so that a failure while
# reading does not leave a truncated proceeding_editions.csv behind.
with open(os.path.join(directory, 'proceedings.csv'), 'r', newline='') as proc_file:
    proc_reader = csv.reader(proc_file)
    next(proc_reader)  # skip header
    proceedings_data = {int(row[0]): row[1] for row in proc_reader}

# edition id -> parent proceeding id; reused later when writing has_edition.csv
proceeding_edition_to_parent = {}
with open(os.path.join(directory, 'proceeding_editions.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'number', 'start_date', 'end_date', 'proceeding_name'])  # header

    for edition_id in range(1, num_proceeding_editions + 1):
        # Assign this edition to a randomly chosen proceeding
        parent_id = random.randint(1, num_proceedings)
        proceeding_edition_to_parent[edition_id] = parent_id

        start = fake.date_between(start_date='-5y', end_date='today')
        end = start + timedelta(days=3)  # end date is 3 days after the start date
        # The `number` column carries the same value as the edition id.
        writer.writerow([edition_id, edition_id, start, end, proceedings_data[parent_id]])

# Create venues: sequential id (1..num_venues) and a fake city name as the
# venue name. Written under `directory` so the output folder is configured in
# a single place.
with open(os.path.join(directory, 'venues.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'venue_name'])  # header
    writer.writerows(
        [venue_id, fake.city()] for venue_id in range(1, num_venues + 1)
    )

# Create journals: sequential id (1..num_journals) and a fake company name as
# the journal name. Written under `directory` so the output folder is
# configured in a single place.
with open(os.path.join(directory, 'journals.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'journal_name'])  # header
    writer.writerows(
        [journal_id, fake.company()] for journal_id in range(1, num_journals + 1)
    )

# Create journal volumes (subnodes of journals) WITH PARENT REFERENCE

# Read the journals file back first (id -> name), so that a failure while
# reading does not leave a truncated journal_volumes.csv behind.
with open(os.path.join(directory, 'journals.csv'), 'r', newline='') as journal_file:
    journal_reader = csv.reader(journal_file)
    next(journal_reader)  # skip header
    journals_data = {int(row[0]): row[1] for row in journal_reader}

# volume id -> parent journal id; reused later when writing has_volume.csv
volume_to_parent = {}
with open(os.path.join(directory, 'journal_volumes.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'volume', 'year', 'issue', 'journal_name'])  # header

    for volume_id in range(1, num_volumes + 1):
        # Assign this volume to a randomly chosen journal
        parent_id = random.randint(1, num_journals)
        volume_to_parent[volume_id] = parent_id

        writer.writerow([
            volume_id,
            random.randint(1, 100),      # volume
            random.randint(2018, 2024),  # year
            random.randint(1, 4),        # issue
            journals_data[parent_id],
        ])

## 2.2. Edges

In [22]:
# Create citation relationships between papers (CITES)
# Writes distinct (source, target) pairs with no self-citation. Drawing
# continues until the requested number of rows is reached, so skipped draws no
# longer shrink the file and the same citation is never written twice.
num_citations = 300
# Never ask for more ordered pairs than exist (also covers num_papers < 2).
citation_target = min(num_citations, num_papers * (num_papers - 1))

with open(os.path.join(directory, 'cites.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['source_paper', 'target_paper'])  # header
    written_citations = set()
    while len(written_citations) < citation_target:
        src = random.randint(1, num_papers)
        tgt = random.randint(1, num_papers)
        if src == tgt or (src, tgt) in written_citations:
            continue  # self-citation or already written: draw again
        written_citations.add((src, tgt))
        writer.writerow([src, tgt])

# Create WROTE relationships (author wrote paper)
# Each (author, paper) pair is written at most once, so the same authorship
# cannot appear twice with contradictory `corresponding` flags.
num_authorships = 200
# Never ask for more pairs than exist (also covers empty author/paper sets).
authorship_target = min(num_authorships, num_authors * num_papers)

with open(os.path.join(directory, 'wrote.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['author_id', 'paper_id', 'corresponding'])
    written_authorships = set()
    while len(written_authorships) < authorship_target:
        author_id = random.randint(1, num_authors)
        paper_id = random.randint(1, num_papers)
        if (author_id, paper_id) in written_authorships:
            continue  # already written: draw again
        written_authorships.add((author_id, paper_id))
        writer.writerow([author_id, paper_id, random.choice([True, False])])

# Create HAS_KEYWORD relationships (paper has keyword)
# Each (paper, keyword) pair is written at most once so the file contains no
# duplicate relationships.
num_paper_keywords = 300
# Never ask for more pairs than exist (also covers empty paper/keyword sets).
paper_keyword_target = min(num_paper_keywords, num_papers * num_keywords)

with open(os.path.join(directory, 'has_keyword.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['paper_id', 'keyword_id'])
    written_paper_keywords = set()
    while len(written_paper_keywords) < paper_keyword_target:
        paper_id = random.randint(1, num_papers)
        keyword_id = random.randint(1, num_keywords)
        if (paper_id, keyword_id) in written_paper_keywords:
            continue  # already written: draw again
        written_paper_keywords.add((paper_id, keyword_id))
        writer.writerow([paper_id, keyword_id])

# Create PUBLISHED_IN relationships (paper published in proceeding or journal_volume)
# Every paper gets exactly one row: iterating over the paper ids (instead of
# drawing a random paper a fixed number of times) guarantees that no paper is
# published twice and none is left unpublished, and the row count follows
# num_papers automatically.
with open(os.path.join(directory, 'published_in.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['paper_id', 'proceeding_edition_id', 'journal_volume_id', 'date_accepted', 'pages'])
    for paper_id in range(1, num_papers + 1):
        # Exactly one of the two target columns is filled; the other stays empty.
        if random.random() < 0.5:
            proceeding_id = random.randint(1, num_proceeding_editions)
            journal_volume_id = ''
        else:
            proceeding_id = ''
            journal_volume_id = random.randint(1, num_volumes)
        date_accepted = fake.date_between(start_date='-3y', end_date='today')
        # First page in 1..10 and last page in 11..20, so first < last always holds.
        pages = f"{random.randint(1, 10)}-{random.randint(11, 20)}"
        writer.writerow([paper_id, proceeding_id, journal_volume_id, date_accepted, pages])

# Create HAS_VOLUME relationships (journal has volume) - consistent with the volume_to_parent mapping
# volume_to_parent maps volume id -> journal id; each entry becomes one
# (journal_id, volume_id) row. Written under `directory` so the output folder
# is configured in a single place.
with open(os.path.join(directory, 'has_volume.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['journal_id', 'volume_id'])
    writer.writerows(
        [journal_id, volume_id] for volume_id, journal_id in volume_to_parent.items()
    )

# Create HAS_EDITION relationships (proceeding has edition) - consistent with the proceeding_edition_to_parent mapping
# proceeding_edition_to_parent maps edition id -> proceeding id; each entry
# becomes one (proceeding_id, edition_id) row. Written under `directory` so
# the output folder is configured in a single place.
with open(os.path.join(directory, 'has_edition.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['proceeding_id', 'edition_id'])
    writer.writerows(
        [proceeding_id, edition_id]
        for edition_id, proceeding_id in proceeding_edition_to_parent.items()
    )

# Create HELD_IN relationships (proceeding edition held in venue)
# One row per proceeding edition, each assigned a randomly chosen venue.
# NOTE: the first column is headed 'proceeding_id' but the values written are
# proceeding *edition* ids (1..num_proceeding_editions). The header is kept
# as-is because loaders elsewhere may refer to it by that name.
# Written under `directory` so the output folder is configured in a single place.
with open(os.path.join(directory, 'held_in.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['proceeding_id', 'venue_id'])
    writer.writerows(
        [edition_id, random.randint(1, num_venues)]
        for edition_id in range(1, num_proceeding_editions + 1)
    )