In [None]:
import csv
from datetime import datetime
from faker import Faker
import numpy as np


In [None]:
# Set SEED to an integer to make every run of the notebook generate the same
# data. Leaving it as None keeps the previous behaviour: fresh random data on
# each run.
SEED = None

fake = Faker()
if SEED is not None:
    # Faker keeps its own random source, so it has to be seeded separately from NumPy.
    Faker.seed(SEED)
# default_rng(None) draws fresh entropy, exactly like calling default_rng() with no argument.
rng = np.random.default_rng(SEED)

In [None]:
# Simulation settings.
# When a pool size or count below is changed, consider adjusting the matching
# distribution shape as well so the generated data still looks reasonable.

# Authors: shape passed to rng.gamma for the author weights, and the number of authors generated.
AUTHOR_DIST_SHAPE = 3
MAX_NUM_AUTHORS = 300

# Publishers: shape passed to rng.gamma for the publisher weights, and the number of publishers generated.
PUBLISHER_DIST_SHAPE = 1
MAX_NUM_PUBLISHERS = 15

# Titles: number of distinct titles to simulate.
# NOTE(review): TITLES_DIST_SHAPE is not referenced by the cells visible in this
# notebook - confirm where it is meant to be used.
TITLES_DIST_SHAPE = 1
NUM_TITLES = 1_000

# Patrons: number of patron records to simulate.
NUM_PATRONS = 1_000

In [None]:
# Draw one gamma-distributed weight per publisher, then rescale the weights so
# they sum to 1 and can be used as selection probabilities.
publisher_raw_distribution = rng.gamma(
    PUBLISHER_DIST_SHAPE,
    size=MAX_NUM_PUBLISHERS,
)
publisher_distribution = publisher_raw_distribution / sum(publisher_raw_distribution)

# Fake company names stand in for publishers; entry i is paired with weight i.
publisher_list = [fake.company() for _ in range(MAX_NUM_PUBLISHERS)]


def get_random_publisher():
    """Return one name from ``publisher_list``, drawn with ``publisher_distribution`` as the weights."""
    picked_publisher = rng.choice(publisher_list, p=publisher_distribution)
    return picked_publisher


# Sanity check: each publisher's share of NUM_TITLES implied by its weight, in ascending order.
sorted(publisher_distribution * NUM_TITLES)

In [None]:
 def get_author_website(first_name, last_name):
     """Build a fake personal website URL for an author.

     The scheme is ``https`` with probability 0.8 and ``http`` otherwise. The
     host label is the last name when a uniform draw exceeds 0.5, otherwise the
     first and last name concatenated. The top-level domain comes from
     ``fake.tld()``. The whole URL is lower-cased.

     Args:
         first_name: Author's first name.
         last_name: Author's last name.

     Returns:
         The URL as a lower-case string.
     """
     scheme = rng.choice(['https', 'http'], p=[0.8, 0.2])
     raw_host = last_name if rng.random() > 0.5 else first_name + last_name
     # Host labels may only contain letters, digits and hyphens (and may not
     # start or end with a hyphen), so drop anything else a generated name can
     # carry, such as apostrophes, spaces or periods.
     host = ''.join(ch for ch in raw_host if ch.isalnum() or ch == '-').strip('-')
     return f'{scheme}://{host}.{fake.tld()}'.lower()


# Draw one gamma-distributed weight per author, then rescale the weights so
# they sum to 1 and can be used as selection probabilities.
author_raw_distribution = rng.gamma(AUTHOR_DIST_SHAPE, size=MAX_NUM_AUTHORS)
author_distribution = author_raw_distribution / sum(author_raw_distribution)


def _make_author():
    """Create one author record; the website is None when the uniform draw is 0.3 or lower."""
    first_name = fake.first_name()
    last_name = fake.last_name()
    if rng.random() > 0.3:
        website = get_author_website(first_name, last_name)
    else:
        website = None
    return {
        'first_name': first_name,
        'last_name': last_name,
        'website': website,
    }


# Entry i of author_list is paired with weight i of author_distribution.
author_list = [_make_author() for _ in range(MAX_NUM_AUTHORS)]


def get_random_author():
    """Return one record from ``author_list``, drawn with ``author_distribution`` as the weights."""
    picked_author = rng.choice(author_list, p=author_distribution)
    return picked_author


# Sanity check: share of NUM_TITLES implied by the smallest and the largest author weight.
sorted(
    NUM_TITLES * extreme_weight
    for extreme_weight in (min(author_distribution), max(author_distribution))
)

In [None]:
def get_book_copies(
    title,
    isbn,
    author_first_name,
    author_last_name,
    author_website,
    publisher,
    publication_year,
    num_copies,
):
    """Generate the row dicts for ``num_copies`` copies of a single title.

    Before each copy is created there is a chance (uniform draw above 0.8, and
    only while ``publication_year`` is below 2022) that the publication info is
    updated: a new ISBN, a new random publisher when a second draw exceeds 0.6,
    and a publication year drawn from ``[publication_year, 2022)``. An update
    applies to that copy and carries over to every later copy.

    Each copy gets its own acquisition date, between 1 January of its
    publication year and today, and its own acquisition price, a dollar amount
    below $15 formatted with two decimals.

    Args:
        title: Book title, repeated on every row.
        isbn: Starting ISBN.
        author_first_name: Author's first name.
        author_last_name: Author's last name.
        author_website: Author's website, or None.
        publisher: Starting publisher name.
        publication_year: Starting publication year.
        num_copies: Number of rows to generate.

    Returns:
        A list of dicts, one per copy, in creation order. The list is empty
        when ``num_copies`` is zero or negative.
    """
    if num_copies <= 0:
        return []

    copies = []
    remaining = num_copies
    # Build the rows in a loop rather than with one recursive call per copy:
    # the recursion failed with RecursionError for large num_copies and
    # re-copied the growing list at every level. The order of random draws is
    # unchanged.
    while remaining > 0:
        if rng.random() > 0.8 and publication_year < 2022:  # we update publication info
            isbn = fake.isbn10()
            publisher = get_random_publisher() if rng.random() > 0.6 else publisher
            publication_year = rng.integers(publication_year, high=2022)

        copies.append(
            {
                'title': title,
                'isbn': isbn,
                'author_first_name': author_first_name,
                'author_last_name': author_last_name,
                'author_website': author_website,
                'publisher': publisher,
                'publication_year': publication_year,
                'acquisition_date': fake.date_between(datetime(publication_year, 1, 1), 'today').strftime('%Y-%m-%d'),
                'acquisition_price': f'${rng.random() * 15:.2f}',
            }
        )
        remaining -= 1

    return copies

In [None]:
def _simulate_title_rows():
    """Create every copy row for one newly simulated title."""
    # The author is drawn first so all three author fields come from the same record.
    author = get_random_author()
    return get_book_copies(
        title=fake.text(max_nb_chars=30).strip('.').title(),
        isbn=fake.isbn10(),
        author_first_name=author['first_name'],
        author_last_name=author['last_name'],
        author_website=author['website'],
        publisher=get_random_publisher(),
        publication_year=rng.integers(1900, high=2022),
        # At least one copy per title; the gamma draw adds the extra copies.
        num_copies=int(rng.gamma(1, 2)) + 1,
    )


# Flat list with one row per copy, covering NUM_TITLES simulated titles.
books = [
    copy_row
    for _ in range(NUM_TITLES)
    for copy_row in _simulate_title_rows()
]

In [None]:
# Total number of rows generated: one list entry per copy, not per title.
len(books)

In [None]:
# Column order of the output file; the names match the keys of each row dict in `books`.
book_fields = [
    'title',
    'isbn',
    'author_first_name',
    'author_last_name',
    'author_website',
    'publisher',
    'publication_year',
    'acquisition_date',
    'acquisition_price',
]
# newline='' leaves line endings to the csv module. The explicit encoding keeps
# the file the same on every platform instead of depending on the locale default.
with open('books_sim.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, book_fields, dialect='excel-tab')
    writer.writeheader()
    writer.writerows(books)

In [None]:
def get_personal_email(first_name, last_name):
    """Build a fake personal e-mail address from a patron's name.

    The local part is the first name or its initial, an optional '.', the last
    name or its initial, and optionally a number from 5 to 98. The last name is
    only eligible for shortening when the chosen first-name form is longer
    than three characters. The domain comes from ``fake.domain_name()`` and
    the whole address is lower-cased.
    """
    given = rng.choice([first_name, first_name[0]])

    if len(given) > 3:
        family = rng.choice([last_name, last_name[0]])
    else:
        family = last_name

    joiner = rng.choice(['.', ''])

    suffix = ''
    if rng.random() > 0.5:
        suffix = rng.integers(5, high=99)

    local_part = f'{given}{joiner}{family}{suffix}'
    return f'{local_part}@{fake.domain_name()}'.lower()


def _make_patron():
    """Create one patron record with a name and an e-mail address derived from that name."""
    first_name = fake.first_name()
    last_name = fake.last_name()
    return {
        'first_name': first_name,
        'last_name': last_name,
        'email': get_personal_email(first_name, last_name),
    }


# NUM_PATRONS independently generated patron records.
patron_list = [_make_patron() for _ in range(NUM_PATRONS)]

In [None]:
# Column order of the output file; the names match the keys of each dict in `patron_list`.
patron_fields = ['first_name', 'last_name', 'email']
# The file carries a .tsv extension, so write it tab-separated ('excel-tab'),
# the same dialect used for books_sim.tsv, rather than comma-separated.
# newline='' leaves line endings to the csv module. The explicit encoding keeps
# the file the same on every platform.
with open('patrons_sim.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, patron_fields, dialect='excel-tab')
    writer.writeheader()
    writer.writerows(patron_list)