# 0. Libraries

In [65]:
import csv
import random
from faker import Faker
from datetime import datetime, timedelta
import os

In [66]:
# Folder that receives the generated CSV files
directory = "data_lab1"

# Report whether the folder was already there; create it only when it is missing
if os.path.exists(directory):
    print(f"Directory '{directory}' already exists")
else:
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully")

Directory 'data_lab1' created successfully


# 1. Adjustable parameters

In [67]:
fake = Faker()

# Seed both sources of randomness so that "Restart Kernel -> Run All"
# regenerates the same data every time. Change the seed to get another dataset.
random_seed = 42
random.seed(random_seed)
Faker.seed(random_seed)

# Configuration: how many of each type of node to create
num_authors = 50
num_papers = 150
num_keywords = 40
num_proceedings = 15
max_num_proceeding_editions = 1000  # Cap on the total number of editions (across all proceedings)
num_venues = 15
num_journals = 10
max_num_volumes = 1000  # Cap on the total number of volumes (across all journals)

# Configuration: upper bounds used when creating edges
max_num_citations = 100  # Most papers a single paper may cite
max_num_coauthors = 10   # Most additional authors added to a paper besides its first author
max_num_keywords = 20    # Most keywords given to a paper in the first assignment pass

Below we include some sample topics and words for creating fake data.

In [68]:
# Sample abstract sentences; the abstract of each generated paper is picked
# at random from this list.
topics = [
    "This paper explores the impact of machine learning algorithms on data analysis efficiency.",
    "We present a novel approach for optimizing graph database queries.",
    "This study analyzes the effects of large-scale distributed systems in cloud computing.",
    "In this work, we investigate the security challenges in IoT networks.",
    "This paper proposes a new model for natural language processing tasks.",
    "The research examines the evolution of data privacy regulations worldwide.",
    "An empirical study on the performance of blockchain technologies.",
    "We provide a comparative analysis of various AI optimization techniques.",
    "This study evaluates the scalability of real-time recommendation systems.",
    "A new framework for cybersecurity threat detection is introduced."
]

# Components to generate unique titles
adjectives = ["Efficient", "Scalable", "Robust", "Secure", "Advanced", "Distributed", "Optimized", "Flexible"]
nouns = ["Framework", "Model", "Approach", "Architecture", "Method", "Algorithm", "Technique", "System"]
fields = [
    "Machine Learning",
    "Blockchain",
    "Cybersecurity",
    "Natural Language Processing",
    "Quantum Computing",
    "Data Privacy",
    "Graph Databases",
    "Cloud Computing",
    "Healthcare AI",
    "IoT Networks",
]


# Helper function to generate a unique title
def generate_unique_title(existing_titles):
    """Return a title that is not yet in ``existing_titles`` and record it there.

    Titles have the form ``"<adjective> <noun> for <field>"``, built from the
    ``adjectives``, ``nouns`` and ``fields`` lists, so only
    ``len(adjectives) * len(nouns) * len(fields)`` different titles exist.

    Args:
        existing_titles: Mutable set of titles already handed out. The new
            title is added to it in place.

    Returns:
        A title string chosen uniformly at random among the unused ones.

    Raises:
        ValueError: If every possible title is already in ``existing_titles``.
            Retrying random picks in that situation would never terminate.
    """
    unused_titles = [
        title
        for title in (
            f"{adjective} {noun} for {field}"
            for adjective in adjectives
            for noun in nouns
            for field in fields
        )
        if title not in existing_titles
    ]
    if not unused_titles:
        raise ValueError(
            "All possible titles are already in use; "
            "add more adjectives, nouns or fields, or request fewer titles"
        )

    title = random.choice(unused_titles)
    existing_titles.add(title)
    return title

# `num_papers` is deliberately NOT redefined here: it is set once in the
# adjustable-parameters cell, and a second assignment would silently override it.

# Titles handed out so far; generate_unique_title() adds to this set
existing_titles = set()

# 2. Creating the data (the `.csv` files)

## 2.1. Nodes

In [69]:
# Author nodes: a sequential id plus a fake full name
with open(os.path.join('data_lab1', 'authors.csv'), 'w', newline='') as authors_file:
    authors_out = csv.writer(authors_file)
    authors_out.writerow(['id', 'name'])  # header
    authors_out.writerows(
        [author_id, fake.name()] for author_id in range(1, num_authors + 1)
    )

# Create keywords
# fake.word() samples with replacement, so it can return the same word twice and
# produce two keyword rows with identical names. fake.unique only hands out words
# it has not returned before (it raises UniquenessException if it runs out).
with open(os.path.join('data_lab1', 'keywords.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'name'])  # header
    fake.unique.clear()  # forget words handed out by an earlier run of this cell
    for i in range(1, num_keywords + 1):
        writer.writerow([i, fake.unique.word()])

# Paper nodes: id, unique title, an abstract drawn from `topics`, and a UUID used as the DOI
with open(os.path.join('data_lab1', 'papers.csv'), 'w', newline='') as papers_file:
    papers_out = csv.writer(papers_file)
    papers_out.writerow(['id', 'title', 'abstract', 'doi'])  # header
    for paper_id in range(1, num_papers + 1):
        # Row items are evaluated left to right: title, then abstract, then DOI
        papers_out.writerow([
            paper_id,
            generate_unique_title(existing_titles),
            random.choice(topics),
            fake.uuid4(),
        ])

# Proceeding nodes: id, a fake company name as the proceeding name, and its type
with open(os.path.join('data_lab1', 'proceedings.csv'), 'w', newline='') as proceedings_file:
    proceedings_out = csv.writer(proceedings_file)
    proceedings_out.writerow(['id', 'proceeding_name', 'proceeding_type'])  # header
    for proc_id in range(1, num_proceedings + 1):
        proc_name = fake.company()
        proc_type = random.choice(['Conference', 'Workshop'])
        proceedings_out.writerow([proc_id, proc_name, proc_type])

# Create proceeding editions (subnodes of proceedings) WITH PARENT REFERENCE
with open(os.path.join('data_lab1', 'proceeding_editions.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'number', 'start_date', 'end_date', 'proceeding_name'])  # header

    # Read back the proceedings file so each edition can carry its parent's name
    proceedings_data = {}
    with open(os.path.join('data_lab1', 'proceedings.csv'), 'r', newline='') as proc_file:
        proc_reader = csv.reader(proc_file)
        next(proc_reader)  # skip header
        for row in proc_reader:
            proceedings_data[int(row[0])] = row[1]  # Map ID to name

    # edition id -> parent proceeding id; its size also yields the next edition id
    proceeding_edition_to_parent = {}
    for proceeding_id in range(1, num_proceedings + 1):
        # Each proceeding gets 1-20 editions (fewer if the global cap is reached)
        num_editions_per_proceeding = random.randint(1, 20)

        # Edition numbers must grow with time: the last edition takes place in 2023
        # and edition 1 takes place (num_editions - 1) years earlier. Using
        # 2024 - edition_num would make edition 1 the most recent one.
        first_edition_year = 2024 - num_editions_per_proceeding

        for edition_num in range(1, num_editions_per_proceeding + 1):
            edition_id = len(proceeding_edition_to_parent) + 1
            if edition_id > max_num_proceeding_editions:
                break  # Stop if we've reached the maximum number of editions

            proceeding_edition_to_parent[edition_id] = proceeding_id

            # One edition per calendar year, never earlier than 1970
            year = max(1970, first_edition_year + edition_num - 1)
            start = fake.date_between(
                start_date=datetime(year, 1, 1),
                end_date=datetime(year, 12, 31)
            )
            end = start + timedelta(days=3)  # end date is 3 days after the start date

            writer.writerow([edition_id, edition_num, start, end, proceedings_data[proceeding_id]])

# Venue nodes: a sequential id plus a fake city name
with open(os.path.join('data_lab1', 'venues.csv'), 'w', newline='') as venues_file:
    venues_out = csv.writer(venues_file)
    venues_out.writerow(['id', 'venue_name'])  # header
    venues_out.writerows(
        [venue_id, fake.city()] for venue_id in range(1, num_venues + 1)
    )

# Journal nodes: a sequential id plus a fake company name used as the journal name
with open(os.path.join('data_lab1', 'journals.csv'), 'w', newline='') as journals_file:
    journals_out = csv.writer(journals_file)
    journals_out.writerow(['id', 'journal_name'])  # header
    journals_out.writerows(
        [journal_id, fake.company()] for journal_id in range(1, num_journals + 1)
    )

# Journal volume nodes; every row also carries the name of its parent journal
with open(os.path.join('data_lab1', 'journal_volumes.csv'), 'w', newline='') as volumes_file:
    volumes_out = csv.writer(volumes_file)
    volumes_out.writerow(['id', 'volume', 'year', 'issue', 'journal_name'])  # header

    # Read back the journals file to build an id -> name lookup
    with open(os.path.join('data_lab1', 'journals.csv'), 'r', newline='') as journal_file:
        journal_rows = csv.reader(journal_file)
        next(journal_rows)  # skip header
        journals_data = {int(record[0]): record[1] for record in journal_rows}

    # volume id -> parent journal id; its size also yields the next volume id
    volume_to_parent = {}
    for journal_id in range(1, num_journals + 1):
        # 1-200 volumes per journal, subject to the global cap below
        num_volumes_per_journal = random.randint(1, 200)

        for _ in range(num_volumes_per_journal):
            if len(volume_to_parent) >= max_num_volumes:
                break  # global cap reached for this journal's remaining volumes
            volume_id = len(volume_to_parent) + 1
            volume_to_parent[volume_id] = journal_id

            # Random draws happen in column order: volume, year, issue
            volume_number = random.randint(1, 100)
            volume_year = random.randint(2018, 2024)
            issue_number = random.randint(1, 4)
            volumes_out.writerow([
                volume_id,
                volume_number,
                volume_year,
                issue_number,
                journals_data[journal_id],
            ])

## 2.2. Edges

In [70]:
# CITES edges: each row states that source_paper cites target_paper
with open(os.path.join('data_lab1', 'cites.csv'), 'w', newline='') as cites_file:
    cites_out = csv.writer(cites_file)
    cites_out.writerow(['source_paper', 'target_paper'])  # header

    # Every (source, target) pair written so far
    citation_pairs = set()

    for source_id in range(1, num_papers + 1):
        # Papers with an id below 20 may cite nothing; all others cite at least 3
        min_citations = 3 if source_id >= 20 else 0
        num_citations = random.randint(min_citations, max_num_citations)

        # All paper ids except the citing paper itself, in ascending order
        potential_targets = (
            list(range(1, source_id)) + list(range(source_id + 1, num_papers + 1))
        )

        if not potential_targets or num_citations <= 0:
            continue

        sample_size = min(num_citations, len(potential_targets))
        for target_id in random.sample(potential_targets, sample_size):
            pair = (source_id, target_id)
            if pair in citation_pairs:
                continue  # already written
            citation_pairs.add(pair)
            cites_out.writerow([source_id, target_id])

# WROTE edges: author wrote paper, with a flag marking the corresponding author
with open(os.path.join('data_lab1', 'wrote.csv'), 'w', newline='') as wrote_out_file:
    wrote_out = csv.writer(wrote_out_file)
    wrote_out.writerow(['author_id', 'paper_id', 'corresponding'])

    # paper id -> list of author ids already attached to that paper
    paper_authors = {}

    # Pass 1: every paper gets exactly one corresponding author
    for paper_id in range(1, num_papers + 1):
        lead_author = random.randint(1, num_authors)
        paper_authors[paper_id] = [lead_author]
        wrote_out.writerow([lead_author, paper_id, True])

    # Pass 2: add between 0 and max_num_coauthors further, non-corresponding authors
    for paper_id in range(1, num_papers + 1):
        num_coauthors = random.randint(0, max_num_coauthors)

        current_authors = paper_authors[paper_id]
        available_authors = [
            candidate
            for candidate in range(1, num_authors + 1)
            if candidate not in current_authors
        ]

        if not available_authors or num_coauthors <= 0:
            continue

        sample_size = min(num_coauthors, len(available_authors))
        for coauthor_id in random.sample(available_authors, sample_size):
            current_authors.append(coauthor_id)
            wrote_out.writerow([coauthor_id, paper_id, False])

# HAS_KEYWORD edges: paper has keyword
with open(os.path.join('data_lab1', 'has_keyword.csv'), 'w', newline='') as has_keyword_file:
    has_keyword_out = csv.writer(has_keyword_file)
    has_keyword_out.writerow(['paper_id', 'keyword_id'])

    # Every (paper, keyword) pair written so far, shared by both passes below
    paper_keyword_pairs = set()

    # Pass 1: each paper receives between 2 and max_num_keywords distinct keywords
    # (capped at the number of keywords that exist)
    for paper_id in range(1, num_papers + 1):
        num_paper_keywords = random.randint(2, max_num_keywords)
        sample_size = min(num_paper_keywords, num_keywords)

        for keyword_id in random.sample(range(1, num_keywords + 1), sample_size):
            pair = (paper_id, keyword_id)
            if pair in paper_keyword_pairs:
                continue
            paper_keyword_pairs.add(pair)
            has_keyword_out.writerow([paper_id, keyword_id])

    # Pass 2: try 100 extra random pairs, writing only those not present yet
    additional_relationships = 100
    for _ in range(additional_relationships):
        paper_id = random.randint(1, num_papers)
        keyword_id = random.randint(1, num_keywords)
        pair = (paper_id, keyword_id)
        if pair in paper_keyword_pairs:
            continue
        paper_keyword_pairs.add(pair)
        has_keyword_out.writerow([paper_id, keyword_id])

# Create PUBLISHED_IN relationships (paper published in proceeding or journal_volume)
with open(os.path.join('data_lab1', 'published_in.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['paper_id', 'proceeding_edition_id', 'journal_volume_id', 'date_accepted', 'pages'])

    # Only reference editions/volumes that were actually created. The max_num_*
    # settings are caps, not counts, so drawing ids from 1..max would point at
    # rows that do not exist.
    edition_ids = list(proceeding_edition_to_parent)
    volume_ids = list(volume_to_parent)

    # Publish every paper exactly once. Picking random paper ids would publish
    # some papers several times and leave others unpublished.
    for paper_id in range(1, num_papers + 1):
        # Exactly one of the two target columns is filled; the other stays empty
        if random.random() < 0.5:
            proceeding_edition_id = random.choice(edition_ids)
            journal_volume_id = ''
        else:
            proceeding_edition_id = ''
            journal_volume_id = random.choice(volume_ids)
        date_accepted = fake.date_between(start_date='-3y', end_date='today')
        # First page is 1-10 and last page is 11-20, so the range is never inverted
        pages = f"{random.randint(1, 10)}-{random.randint(11, 20)}"
        writer.writerow([paper_id, proceeding_edition_id, journal_volume_id, date_accepted, pages])

# HAS_VOLUME edges (journal has volume), derived directly from volume_to_parent
with open(os.path.join('data_lab1', 'has_volume.csv'), 'w', newline='') as has_volume_file:
    has_volume_out = csv.writer(has_volume_file)
    has_volume_out.writerow(['journal_id', 'volume_id'])
    # The mapping is keyed by volume id, so swap the pair to match the header order
    has_volume_out.writerows(
        [parent_journal, vol_id] for vol_id, parent_journal in volume_to_parent.items()
    )

# HAS_EDITION edges (proceeding has edition), derived directly from proceeding_edition_to_parent
with open(os.path.join('data_lab1', 'has_edition.csv'), 'w', newline='') as has_edition_file:
    has_edition_out = csv.writer(has_edition_file)
    has_edition_out.writerow(['proceeding_id', 'edition_id'])
    # The mapping is keyed by edition id, so swap the pair to match the header order
    has_edition_out.writerows(
        [parent_proceeding, ed_id]
        for ed_id, parent_proceeding in proceeding_edition_to_parent.items()
    )

# Create HELD_IN relationships (proceeding edition held in venue)
with open(os.path.join('data_lab1', 'held_in.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    # NOTE: the first column holds proceeding *edition* ids; the header name is
    # kept as-is so existing consumers of this file keep working.
    writer.writerow(['proceeding_id', 'venue_id'])
    # Iterate over the editions that were actually created. The cap
    # max_num_proceeding_editions is usually larger than the real count, so
    # looping up to it would emit edges for editions that do not exist.
    for edition_id in proceeding_edition_to_parent:
        writer.writerow([
            edition_id,
            random.randint(1, num_venues)
        ])

# Create REVIEWED relationships (author reviews paper)
with open(os.path.join('data_lab1', 'reviewed.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        'author_id',
        'paper_id',
        'review_date'
        # 'recommendation'
        ])

    # Load the WROTE relationships so that no author is asked to review their own paper
    author_paper_pairs = set()
    with open(os.path.join('data_lab1', 'wrote.csv'), 'r', newline='') as wrote_file:
        wrote_reader = csv.reader(wrote_file)
        next(wrote_reader)  # skip header
        for row in wrote_reader:
            author_id = int(row[0])
            paper_id = int(row[1])
            author_paper_pairs.add((author_id, paper_id))

    # Assign reviewers to papers
    for paper_id in range(1, num_papers + 1):
        # Odd reviewer counts (1, 3 or 5) so that a majority always exists
        num_reviewers = random.choice([1, 3, 5])

        # Potential reviewers are all authors who did not write this paper
        potential_reviewers = [author_id for author_id in range(1, num_authors + 1)
                              if (author_id, paper_id) not in author_paper_pairs]

        # Skip the paper entirely if there are not enough potential reviewers
        if len(potential_reviewers) < num_reviewers:
            continue

        # Select random reviewers
        reviewers = random.sample(potential_reviewers, num_reviewers)

        # Write review relationships
        for reviewer_id in reviewers:
            # Faker's relative-date strings use 'M' for months and 'm' for minutes,
            # so '-1M' is needed to mean "one month ago"; '-1m' would allow
            # review dates right up to today.
            review_date = fake.date_between(start_date='-2y', end_date='-1M')
            # recommendation = random.choice(['Accept', 'Minor Revision', 'Major Revision', 'Reject'])
            writer.writerow([
                reviewer_id,
                paper_id,
                review_date
                # recommendation
                ])