In [215]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine

In [216]:
# Store filepath in a variable
goodreads_csv = 'Output/paul.csv'
amazon_csv = 'Output/2019_Amazon-Best-Sellers_Author-BookTitles.csv'

In [217]:
# Read our Data file with the pandas library
# Not every CSV requires an encoding, but be aware this can come up
goodreads_orig = pd.read_csv(goodreads_csv, encoding='ISO-8859-1')
amazon_orig = pd.read_csv(amazon_csv, encoding='ISO-8859-1')
amazon_orig.head()

Unnamed: 0.1,Unnamed: 0,First,Last,Book_Title
0,0,Bill,Martin Jr.,"Brown Bear, Brown Bear, What Do You See?"
1,1,Dav,Pilkey,Dog Man: Brawl of the Wild: From the Creator o...
2,2,Dav,Pilkey,Dog Man: Fetch-22: From the Creator of Captain...
3,3,Dav,Pilkey,Dog Man: For Whom the Ball Rolls: From the Cre...
4,4,Giles,Andreae,Giraffes Can't Dance


In [218]:
# Show just the header
goodreads_orig.head()
amazon = amazon_orig[['Book_Title', 'First', 'Last']]
amazon['Rating'] = 0
amazon['source'] = 'Amazon'

amazon = amazon.rename(columns={'Book_Title': 'Title'})
amazon.head()

Unnamed: 0,Title,First,Last,Rating,source
0,"Brown Bear, Brown Bear, What Do You See?",Bill,Martin Jr.,0,Amazon
1,Dog Man: Brawl of the Wild: From the Creator o...,Dav,Pilkey,0,Amazon
2,Dog Man: Fetch-22: From the Creator of Captain...,Dav,Pilkey,0,Amazon
3,Dog Man: For Whom the Ball Rolls: From the Cre...,Dav,Pilkey,0,Amazon
4,Giraffes Can't Dance,Giles,Andreae,0,Amazon


In [219]:
goodreads = goodreads_orig.copy()
goodreads.at[3,'Author'] = 'Mary-Beth Keane'
goodreads[['First','Last']] = goodreads.Author.str.split(expand=True)
goodreads_drop = goodreads[['Title', 'First', 'Last', 'Rating']]
goodreads_drop['source'] = 'Goodreads'
total = goodreads_drop.copy()
total = total.append(amazon)

total.head()

Unnamed: 0,Title,First,Last,Rating,source
0,The Testaments,Margaret,Atwood,4.2,Goodreads
1,Normal People,Sally,Rooney,3.86,Goodreads
2,Where the Forest Meets the Stars,Glendy,Vanderah,4.12,Goodreads
3,"Ask Again, Yes",Mary-Beth,Keane,3.97,Goodreads
4,Queenie,Candice,Carty-Williams,3.88,Goodreads


In [220]:
# Show mulitple specific columns--note the extra brackets
author = pd.DataFrame(total.groupby(['Last','First']).count())
author = author.reset_index()
author_final = author[['First', 'Last']]
author_final = author_final.reset_index()
author_final = author_final.rename(columns={'index': 'author_id', 'First': 'first_name', 'Last': 'last_name'})
author_final.head()


Unnamed: 0,author_id,first_name,last_name
0,0,AndrÃ©,Aciman
1,1,Giles,Andreae
2,2,Margaret,Atwood
3,3,Taffy,Brodesser-Akner
4,4,Eric,Carle


In [221]:
# Test replace
books = total.copy()
for name in author_final.index:
    books = books.replace(to_replace=author_final.iloc[name]['last_name'], value=author_final.loc[name]['author_id'])


books_final = books[['Title', 'Last']]
books_final = books_final.reset_index(drop=True)
books_final = books_final.reset_index()
books_final = books_final.rename(columns={'index': 'book_id', 'Title': 'title','Last': 'author_fk'})
books_final.head()

Unnamed: 0,book_id,title,author_fk
0,0,The Testaments,2
1,1,Normal People,25
2,2,Where the Forest Meets the Stars,31
3,3,"Ask Again, Yes",13
4,4,Queenie,5


In [222]:
source = pd.DataFrame()
source['source_name'] = ['Goodreads', 'Amazon','New York Times']
source['type'] = ['User Rating', 'User Rating', 'Bestseller']
source_final = source.reset_index()
source_final = source_final.rename(columns={'index': 'source_id'})

source_final.head()

Unnamed: 0,source_id,source_name,type
0,0,Goodreads,User Rating
1,1,Amazon,User Rating
2,2,New York Times,Bestseller


In [223]:
review = total.copy()
for name in source_final.index:
    review = review.replace(to_replace=source_final.iloc[name]['source_name'], value=source_final.loc[name]['source_id'])

for name in books_final.index:
     review = review.replace(to_replace=books_final.iloc[name]['title'], value=books_final.loc[name]['book_id'])
review_final = review[['Title', 'Rating', 'source']]
review_final = review_final.reset_index(drop=True)
review_final = review_final.reset_index()
review_final = review_final.rename(columns={'index': 'review_id','Title': 'book_fk','source': 'source_fk', 'Rating': 'rating'})
review_final.head()

Unnamed: 0,review_id,book_fk,rating,source_fk
0,0,0,4.2,0
1,1,1,3.86,0
2,2,2,4.12,0
3,3,3,3.97,0
4,4,4,3.88,0
5,5,5,4.04,0
6,6,6,4.25,0
7,7,7,3.82,0
8,8,8,4.11,0
9,9,9,3.96,0


In [224]:
rds_connection_string = "postgres:postgres@localhost:5432/ETL_Project"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [225]:
engine.table_names()

[]

In [226]:
author_final.to_sql(name='author', con=engine, if_exists='append', index=False)
books_final.to_sql(name='book', con=engine, if_exists='append', index=False)
source_final.to_sql(name='source', con=engine, if_exists='append', index=False)
review_final.to_sql(name='review', con=engine, if_exists='append', index=False)


