In [31]:
%matplotlib inline
%load_ext sql
import numpy as np
import pandas as pd
from pydbgen import pydbgen
from faker import Faker
from pandas.tseries.offsets import *
from sqlalchemy import create_engine

# Seed NumPy's global RNG so the np.random draws in the cells below (ages,
# cities, activity scores, weighted sampling) repeat when the notebook is
# re-run top to bottom. The value matches the seed passed to pydbgen below.
# NOTE(review): this does not seed Faker itself, so Faker-generated values
# (ISBNs, checkout dates) are presumably still different on every run.
np.random.seed(123876)

# Shared generators for fake values used throughout the notebook.
fake = Faker()
myDB = pydbgen.pydb()

# Circulation Dataset

In [6]:
# SQLAlchemy engine for the SQLite file that receives the generated tables
# (the path is relative to the notebook's working directory).
db_url = 'sqlite:///../data/circulation.sqlite'
engine = create_engine(db_url)

## Generate Patron data

In [4]:
nrows = 1000
n_cities = 100

# Base patron records come from pydbgen; a fixed seed is passed to the generator.
patrons = myDB.gen_dataframe(num=nrows,
                             fields=['id', 'name', 'age', 'email', 'zipcode'],
                             real_email=False, seed=123876)
# Overwrite the generated ids with sequential integers 1..nrows.
patrons['id'] = range(1, nrows+1)

# Generate age from a normal distribution (mean 35, std 15), truncated to whole
# years. The builtin `int` is used because the `np.int` alias was removed in
# NumPy 1.24.
patrons['age'] = (np.random.randn(nrows) * 15 + 35).astype(int)
# Negative draws are not valid ages; floor them at 0.
patrons.loc[patrons['age'] < 0, 'age'] = 0

# Generate cities, random but from a normal dist centred on the middle of the
# city list, so some cities are much more common than others.
city_names = myDB.gen_data_series(num=n_cities, data_type='city')
# Clip the index into [0, n_cities - 1]: an extreme draw would otherwise fall
# outside the generated city list and fail the lookup.
city_idx = np.clip((np.random.randn(nrows) * 8 + 50).astype(int), 0, n_cities - 1)
patrons['city'] = city_names[city_idx].values

# For generating circulation info, give each patron a fake rating (0-9) for how
# active they are; it is used later as a sampling weight.
patrons['activity'] = np.random.randint(10, size=nrows)
patrons.head(2)

Unnamed: 0,id,name,age,email,zipcode,city,activity
0,1,Jeremy Mata,30,timothymoore@gmail.com,52022,Thomasmouth,9
1,2,Daniel Garcia,55,mcdanielwilliam@sanchez-herrera.biz,9529,Deborahbury,3


In [7]:
# Write the patrons table. `if_exists='replace'` lets this cell be re-run:
# with the default ('fail'), pandas raises ValueError once the table exists.
patrons.to_sql('patrons', engine, index=False, if_exists='replace')

## Generate Books

In [5]:
# Source data: book review dump downloaded from Kaggle (bz2-compressed CSV).
data = pd.read_csv("../data/br-trimmed.csv.bz2", compression='bz2')
# Drop the review data: keep a single row per (title, author) and retain the
# largest ratingsCount, which serves as the sampling weight below.
data = data.groupby(['title', 'author'], as_index=False)[['ratingsCount']].max()

nbooks = 2000
# Draw nbooks titles weighted by ratingsCount, then attach a fake ISBN-10 and a
# sequential id (1..nbooks) to each sampled row.
books = data.sample(n=nbooks, weights=data['ratingsCount']).assign(
    isbn=[fake.isbn10(separator="-") for _ in range(nbooks)],
    id=range(1, nbooks + 1),
)
books.sample(2)

Unnamed: 0,title,author,ratingsCount,isbn,id
122005,La Metamorfosis/ The Metamorphosis,Franz Kafka,423657,1-05-232002-3,1574
95205,Harry Potter and the Philosopher's Stone (Harr...,J.K. Rowling,4920627,1-203-98661-0,1089


In [8]:
# Write only the catalogue columns (ratingsCount is a sampling aid, not book
# data). `if_exists='replace'` lets this cell be re-run: with the default
# ('fail'), pandas raises ValueError once the table exists.
books[['id', 'title', 'author', 'isbn']].to_sql('books', engine, index=False, if_exists='replace')

## Generate Fake Circulation Data

In [28]:
ncheckouts = 5000

# Pick the book for each checkout, weighted by ratingsCount.
book_sampling = np.random.choice(books.index, size=ncheckouts, p=books.ratingsCount/books.ratingsCount.sum())
book_ids = books.loc[book_sampling, 'id'].values

# Pick the patron for each checkout, weighted by the fake activity rating.
patron_sampling = np.random.choice(patrons.index, size=ncheckouts, p=patrons.activity/patrons.activity.sum())
patron_ids = patrons.loc[patron_sampling, 'id'].values

circ = pd.DataFrame({'book_id': book_ids, 'patron_id': patron_ids})

# Generate dates from past year
dates = pd.Series([fake.date_time_between(start_date="-1y", end_date="now", tzinfo=None) for _ in range(ncheckouts)])
circ['checkout_time'] = pd.to_datetime(dates)

# Loan length in weeks: normal around 3.5 weeks (std 1 week). Floored at zero so
# a rare negative draw cannot produce a return_time before checkout_time.
loan_weeks = np.maximum(np.random.randn(ncheckouts) + 3.5, 0)
# Use a timedelta rather than the `Week` offset: offsets only accept integer
# counts, while loan lengths here are fractional weeks.
circ['return_time'] = circ.checkout_time + pd.to_timedelta(loan_weeks, unit='W')

# Mark recent checkouts as not returned: anything checked out within four weeks
# of the latest checkout in the data gets a missing return_time.
still_out = circ.checkout_time > (circ.checkout_time.max() - pd.Timedelta(weeks=4))
circ.loc[still_out, 'return_time'] = pd.NaT

circ.head(2)



Unnamed: 0,book_id,patron_id,checkout_time,return_time
0,1898,862,2017-05-11 16:53:33,2017-06-09 16:40:11.804214
1,450,215,2018-01-05 13:07:20,2018-01-23 10:00:01.071255


In [29]:
# Write the circulation table. `if_exists='replace'` lets this cell be re-run:
# with the default ('fail'), pandas raises ValueError once the table exists.
circ.to_sql('circulation', engine, index=False, if_exists='replace')

# Testing

In [32]:
# Connect the `%sql` magic (loaded via `%load_ext sql`) to the same SQLite file
# the tables above were written to.
%sql sqlite:///../data/circulation.sqlite

'Connected: None@../data/circulation.sqlite'

In [33]:
%%sql
-- Sanity check: show the first ten rows of the circulation table that was just written.
SELECT * FROM circulation LIMIT 10;

Done.


book_id,patron_id,checkout_time,return_time
1898,862,2017-05-11 16:53:33.000000,2017-06-09 16:40:11.804214
450,215,2018-01-05 13:07:20.000000,2018-01-23 10:00:01.071255
1840,148,2018-01-29 22:25:54.000000,2018-03-03 05:15:48.780117
235,58,2017-06-21 07:27:02.000000,2017-07-15 04:34:52.054251
522,801,2017-08-29 16:19:26.000000,2017-09-29 18:25:42.337822
213,411,2017-08-13 15:10:10.000000,2017-09-08 13:37:02.209165
893,816,2017-09-26 11:38:24.000000,2017-10-22 06:42:34.169105
441,380,2017-09-09 16:34:28.000000,2017-09-27 00:27:15.345828
979,724,2017-10-16 18:47:49.000000,2017-11-14 03:43:09.035133
229,241,2017-08-18 18:56:03.000000,2017-09-08 13:02:49.875579
