# Entity Join

In this notebook, we connect Books and Reviewed Books to create the ISBN_Convert entity. We then connect Books and Libraries to create the Library_Books entity.  

In [110]:
%%bigquery
select *
from books_stg.Reviewed_Book
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_key,isbn,book_title,book_author,year_of_publication,publisher,language,category,data_source,load_time
0,068410154899,0684101548,The GREAT GATSBY (A Scribner Classic),F. Scott Fitzgerald,1920.0,Scribner,9,9,kaggle,2024-02-06 01:58:41.719424+00:00
1,078688099699,0786880996,Wherever You Go There You Are,John Kabat-Zinn,1920.0,Hyperion Books,9,9,kaggle,2024-02-06 01:58:41.719424+00:00
2,030758013X99,030758013X,Assorted Small Tray Puzzles,Golden,1920.0,Golden Books,9,9,kaggle,2024-02-06 01:58:41.719424+00:00
3,030702051799,0307020517,The New Baby,Golden Books Little,1920.0,Golden Books,9,9,kaggle,2024-02-06 01:58:41.719424+00:00
4,082493069X99,082493069X,Grill and Barbecue Cooking,Ideals Publications Inc,1920.0,Ideals Publications,9,9,kaggle,2024-02-06 01:58:41.719424+00:00


In [111]:
%%bigquery
select *
from books_stg.Books
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_id,title,isbn13,language_id,num_pages,publication_date,publisher_id,data_source,load_time
0,6594,Point Of Impact (Bob Lee Swagger #1),9780739344248,1,0,2007-01-09,1692,booksdb,2024-02-06 02:03:04.243233+00:00
1,4505,The Mask of the Enchantress,9780449210840,1,0,1981-10-12,1045,booksdb,2024-02-06 02:03:04.243233+00:00
2,5342,Time To Hunt (Bob Lee Swagger #3),9780553455809,1,0,1998-05-18,1642,booksdb,2024-02-06 02:03:04.243233+00:00
3,6879,The Shining,9780743536998,1,0,2005-08-02,1831,booksdb,2024-02-06 02:03:04.243233+00:00
4,6016,Waterworks,9780679433729,1,0,1994-05-31,1642,booksdb,2024-02-06 02:03:04.243233+00:00


# Generic functions

In [112]:
import json, datetime
from google.cloud import bigquery
bq_client = bigquery.Client()

def serialize_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    raise TypeError("Type not serializable")

# removes the entries from the dictionary whose values are None
# this filter is needed for loading JSON into BQ
def remove_none_values(record):
    filtered_record = {}
    for field in record.keys():
        if record[field] != None:
            filtered_record[field] = record[field]
    return filtered_record


def create_table(table_id, schema):

    table = bigquery.Table(table_id, schema=schema)
    table = bq_client.create_table(table, exists_ok=True)
    print("Created table {}".format(table.table_id))


def load_records(table_id, schema, records):

    bq_client = bigquery.Client()

    # load records into staging table
    job_config = bigquery.LoadJobConfig(schema=schema, source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON, write_disposition='WRITE_TRUNCATE')
    table_ref = bigquery.table.TableReference.from_string(target_table_id)

    try:
        load_job = bq_client.load_table_from_json(records, table_ref, job_config=job_config)
        load_job.result()

        destination_table = bq_client.get_table(target_table_id)
        print("Loaded {} rows.".format(destination_table.num_rows))

        if load_job.errors:
            print('job errors:', load_job.errors)


    except Exception as e:
        print("Error inserting into BQ: {}".format(e))

# Reviewed Books

To match the reviewed books to the books, we will match the isbn13 and book id from the `books` table with isbn10 and book key from the table of `reviewed books`.

We will convert the isbn13 to isbn10 and assign book keys to the correct book id.

In [113]:
%%bigquery
CREATE OR REPLACE FUNCTION books_stg.ISBN13to10(isbn13 STRING) RETURNS STRING
LANGUAGE js AS """
  var isbn10;
  var i;
  var sum = 0;
  var chk;
  var chkchar = 'X';

  if (isbn13.length == 10) {
    return isbn13;
  } else if (isbn13.length != 13 || isbn13.substr(0, 3) != '978') {
    return isbn13;
  }

  i = isbn13.substr(3, 10);

  if (i.length < 10) {
    i = '0'.repeat(10 - i.length) + i;
  }

  for (var j = 0; j < 9; j++) {
    sum += (j + 1) * parseInt(i[j]);
  }

  chk = sum % 11;

  if (chk == 10) {
    chk = 90980;
  }

  if (chk != 90980) {
    chkchar = chk.toString();
  }

  return isbn13.substr(3, 9) + chkchar;
""";


Query is running:   0%|          |

In [114]:
%%bigquery
CREATE OR REPLACE TABLE books_stg.ISBN_Convert AS
SELECT
  book_id,
  isbn13,
  books_stg.ISBN13to10(isbn13) AS isbn10
FROM
  books_stg.Books;

Query is running:   0%|          |

In [115]:
%%bigquery
select * from books_stg.ISBN_Convert;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_id,isbn13,isbn10
0,6592,9780739341773,0739341774
1,5793,9780670037568,0670037567
2,7568,9780789493934,0789493934
3,7632,9780802138255,080213825X
4,1191,9780140442724,0140442723
...,...,...,...
11122,3171,9780375724749,0375724745
11123,9853,9781590171240,1590171241
11124,10823,9783491757103,349175710X
11125,10814,9783458334224,345833422X


In [116]:
%%bigquery
CREATE OR REPLACE TABLE books_stg.ISBN_Convert AS
SELECT
  ic.book_id,
  rb.book_key,
  ic.isbn10,
  ic.isbn13,
  'bird+kaggle' as data_source,
  load_time
FROM
  books_stg.Reviewed_Book AS rb
JOIN
  books_stg.ISBN_Convert AS ic
ON
  rb.isbn = ic.isbn10;

Query is running:   0%|          |

In [117]:
%%bigquery
select * from books_stg.ISBN_Convert;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_id,book_key,isbn10,isbn13,data_source,load_time
0,5813,067080682X99,067080682X,9780670806829,bird+kaggle,2024-02-06 01:58:41.719424+00:00
1,6422,069102417099,0691024170,9780691024172,bird+kaggle,2024-02-06 01:58:41.719424+00:00
2,2676,0345355237en['Fiction'],0345355237,9780345355232,bird+kaggle,2024-02-06 01:58:41.719424+00:00
3,3375,0385240899en['Fiction'],0385240899,9780385240895,bird+kaggle,2024-02-06 01:58:41.719424+00:00
4,6067,067972648999,0679726489,9780679726487,bird+kaggle,2024-02-06 01:58:41.719424+00:00
...,...,...,...,...,...,...
3636,2935,0373250746en['Fiction'],0373250746,9780373250745,bird+kaggle,2024-02-06 01:58:41.719424+00:00
3637,3421,0385337930en['Fiction'],0385337930,9780385337939,bird+kaggle,2024-02-06 01:58:41.719424+00:00
3638,8092,0822002426en['Literary Criticism'],0822002426,9780822002420,bird+kaggle,2024-02-06 01:58:41.719424+00:00
3639,3718,039473530799,0394735307,9780394735306,bird+kaggle,2024-02-06 01:58:41.719424+00:00


# Primary Key

In [118]:
%%bigquery
alter table books_stg.ISBN_Convert
  add primary key (book_id, book_key) not enforced

Query is running:   0%|          |

In [119]:
%%bigquery
select book_id, book_key, count(*) as duplicate_pk
from books_stg.ISBN_Convert
group by book_id, book_key
having count(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,book_id,book_key,duplicate_pk


# Foreign Key

In [120]:
%%bigquery
alter table books_stg.ISBN_Convert add foreign key (book_key)
  references books_stg.Reviewed_Book (book_key) not enforced

Query is running:   0%|          |

In [121]:
%%bigquery
alter table books_stg.ISBN_Convert add foreign key (book_id)
  references books_stg.Books (book_id) not enforced

Query is running:   0%|          |

### Foreign Key Violations:

In [122]:
%%bigquery
select count(*) as orphan_records
from books_stg.ISBN_Convert
where book_key not in (select book_key from books_stg.Reviewed_Book )

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


In [123]:
%%bigquery
select count(*) as orphan_records
from books_stg.ISBN_Convert
where book_id not in (select book_id from books_stg.Books )

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


# Books in Libraries

We will assign each book with a library location. Since there are some audiobooks within the `Books` table, we make sure to filter out rows with 0 number of pages.

In [124]:
%%bigquery
select * from books_stg.Books;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_id,title,isbn13,language_id,num_pages,publication_date,publisher_id,data_source,load_time
0,6594,Point Of Impact (Bob Lee Swagger #1),9780739344248,1,0,2007-01-09,1692,booksdb,2024-02-06 02:03:04.243233+00:00
1,4505,The Mask of the Enchantress,9780449210840,1,0,1981-10-12,1045,booksdb,2024-02-06 02:03:04.243233+00:00
2,5342,Time To Hunt (Bob Lee Swagger #3),9780553455809,1,0,1998-05-18,1642,booksdb,2024-02-06 02:03:04.243233+00:00
3,6879,The Shining,9780743536998,1,0,2005-08-02,1831,booksdb,2024-02-06 02:03:04.243233+00:00
4,6016,Waterworks,9780679433729,1,0,1994-05-31,1642,booksdb,2024-02-06 02:03:04.243233+00:00
...,...,...,...,...,...,...,...,...,...
11122,8797,The Return of the Dancing Master,9781400076956,23,406,2005-02-08,2121,booksdb,2024-02-06 02:03:04.243233+00:00
11123,8624,In Wonderland,9780970312556,24,185,2003-09-01,1009,booksdb,2024-02-06 02:03:04.243233+00:00
11124,10854,Harry Potter ve Sırlar Odası (Harry Potter #2),9783570211021,25,403,2001-10-01,2234,booksdb,2024-02-06 02:03:04.243233+00:00
11125,9702,Harry Potter and the Philosopher's Stone (Harr...,9781582346816,26,250,2010-07-01,294,booksdb,2024-02-06 02:03:04.243233+00:00


In [125]:
%%bigquery
select * from books_stg.Libraries;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,library_id,stabr,libname,address,city,zip,zip4,cnty,full_address,data_source,load_time
0,36,AK,KEGOAYAH KOZGA LIBRARY,100 WEST 7TH AVENUE,NOME,99762,M,NOME,"100 WEST 7TH AVENUE NOME, AK 99762-M",librarydb,2024-01-29 23:59:06.732594+00:00
1,44,AK,SITKA PUBLIC LIBRARY,320 HARBOR DRIVE,SITKA,99835,7553,SITKA,"320 HARBOR DRIVE SITKA, AK 99835-7553",librarydb,2024-01-29 23:59:06.732594+00:00
2,67,AK,ANIAK PUBLIC LIBRARY,270 RIVERFRONT DRIVE,ANIAK,99557,M,BETHEL,"270 RIVERFRONT DRIVE ANIAK, AK 99557-M",librarydb,2024-01-29 23:59:06.732594+00:00
3,4,AK,KUSKOKWIM CONSORTIUM LIBRARY,420 CHIEF EDDIE HOFFMAN HIGHWAY,BETHEL,99559,M,BETHEL,"420 CHIEF EDDIE HOFFMAN HIGHWAY BETHEL, AK 995...",librarydb,2024-01-29 23:59:06.732594+00:00
4,3,AK,ANDERSON COMMUNITY LIBRARY,101 FIRST STREET,ANDERSON,99744,M,DENALI,"101 FIRST STREET ANDERSON, AK 99744-M",librarydb,2024-01-29 23:59:06.732594+00:00
...,...,...,...,...,...,...,...,...,...,...,...
9210,9199,WY,SHERIDAN COUNTY LIBRARY SYSTEM,335 WEST ALGER ST,SHERIDAN,82801,3824,SHERIDAN,"335 WEST ALGER ST SHERIDAN, WY 82801-3824",librarydb,2024-01-29 23:59:06.732594+00:00
9211,9211,WY,SUBLETTE COUNTY LIBRARY SYSTEM,155 S TYLER ST,PINEDALE,82941,5378,SUBLETTE,"155 S TYLER ST PINEDALE, WY 82941-5378",librarydb,2024-01-29 23:59:06.732594+00:00
9212,9214,WY,WASHAKIE COUNTY LIBRARY,"801 BIG HORN AVE., SUITE 100",WORLAND,82401,2703,WASHAKIE,"801 BIG HORN AVE., SUITE 100 WORLAND, WY 82401...",librarydb,2024-01-29 23:59:06.732594+00:00
9213,9200,WY,SWEETWATER COUNTY LIBRARY SYSTEM,300 N. 1ST EAST,GREEN RIVER,82935,4221,SWEETWATER,"300 N. 1ST EAST GREEN RIVER, WY 82935-4221",librarydb,2024-01-29 23:59:06.732594+00:00


In [126]:
from google.cloud import bigquery

# Assuming you have the necessary functions (create_table, remove_none_values, load_records) defined

bq_client = bigquery.Client()

lib_records = []
library_ids = []

project_id = "aaron-nayoung"
stg_dataset_name = "books_stg"
stg_table_name = "Library_Books"
target_table_id = "{}.{}.{}".format(project_id, stg_dataset_name, stg_table_name)

schema = [
    bigquery.SchemaField("book_id", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("library_id", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("data_source", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("load_time", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP"),
]

create_table(target_table_id, schema)

del schema[-1]

libraries_sql = 'SELECT library_id FROM {}.Libraries'.format(stg_dataset_name)
query_job = bq_client.query(libraries_sql)

# add the library ids to a list
for row in query_job:
    library_ids.append(row["library_id"])

books_sql = '''SELECT book_id FROM {}.Books WHERE num_pages > 0'''.format(stg_dataset_name)
query_job = bq_client.query(books_sql)

index = 0

# for each book
for row in query_job:
    book_id = row["book_id"]

    if index >= len(library_ids):
        break

    library_id = library_ids[index]

    record = {"book_id": book_id, "library_id": library_id, "data_source": "bird_librarydb"}
    lib_records.append(record)

    index += 1
load_records(target_table_id, schema, lib_records)

Created table Library_Books
Loaded 9215 rows.


In [127]:
%%bigquery
select *
from books_stg.Library_Books
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,book_id,library_id,data_source,load_time
0,6520,36,bird_librarydb,2024-02-13 02:09:30.304325+00:00
1,8974,44,bird_librarydb,2024-02-13 02:09:30.304325+00:00
2,2937,67,bird_librarydb,2024-02-13 02:09:30.304325+00:00
3,3408,4,bird_librarydb,2024-02-13 02:09:30.304325+00:00
4,9807,3,bird_librarydb,2024-02-13 02:09:30.304325+00:00


In [128]:
%%bigquery
select count(*) as num_records
from books_stg.Library_Books

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,9215


# Primary Keys

In [129]:
%%bigquery
alter table books_stg.Library_Books
  add primary key (book_id, library_id) not enforced

Query is running:   0%|          |

In [130]:
%%bigquery
select book_id, library_id, count(*) as duplicate_pk
from books_stg.Library_Books
group by book_id, library_id
having count(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,book_id,library_id,duplicate_pk


# Foreign Keys

In [131]:
%%bigquery
alter table books_stg.Library_Books add foreign key (book_id)
  references books_stg.Books (book_id) not enforced

Query is running:   0%|          |

In [132]:
%%bigquery
alter table books_stg.Library_Books add foreign key (library_id)
  references books_stg.Libraries (library_id) not enforced

Query is running:   0%|          |

### Foreign Key Violations:

In [133]:
%%bigquery
select count(*) as orphan_records
from books_stg.Library_Books
where book_id not in (select book_id from books_stg.Books )

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


In [134]:
%%bigquery
select count(*) as orphan_records
from books_stg.Library_Books
where library_id not in (select library_id from books_stg.Libraries )

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0
