In [19]:
# Dependencies 
import os
from bs4 import BeautifulSoup as bs
import requests
import pymongo
from splinter import Browser
import pandas as pd
from flask import Flask, redirect, render_template, jsonify
import time
from selenium import webdriver
import lxml
import urllib

In [20]:
# Open a PyMongo client against the MongoDB server on localhost, port 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [21]:
# Select the "quotes" database and, inside it, the "quotes" collection
db = client["quotes"]
collection = db["quotes"]

In [22]:
# ENTIRE Quote List 
total_quotes=[]
# Counter: running 1-based id stored on every quote document as '__id'
__id = 0 

# Author details keyed by the author-page href. Several quotes share an author,
# so each author page is downloaded and parsed only once.
author_details_by_href = {}

for r in range(1,11):
    # Listing pages are numbered 1 through 10
    url=f'http://quotes.toscrape.com/page/{r}'
    # A timeout keeps the cell from hanging forever on a stalled connection
    response=requests.get(url, timeout=30)
    # Stop on an HTTP error instead of parsing an error page
    response.raise_for_status()
    soup=bs(response.text, "lxml")

    ascrape=soup.find_all('div',{'class':'quote'})
    for a in ascrape:
        # ID Number 
        __id += 1 
        
        # Scrape Quote 
        quote=a.find('span', class_='text').text
        
        # Scrape Author Name 
        author_name=a.find('small', class_='author').text
        # Scrape Tags (whitespace-stripped text of every link in the tags div)
        tags_list=[tag.text.strip() for tag in a.find('div', class_='tags').find_all('a')]
            
        # URL for author details: href of the first link inside the quote div
        href = a.a["href"]
        if href not in author_details_by_href:
            author_url = f'http://quotes.toscrape.com{href}'
            author_response = requests.get(author_url, timeout=30)
            author_response.raise_for_status()
            author_soup = bs(author_response.text, 'lxml')
            
            # Scrape Author Details (Birthdate and Description)
            author_details_by_href[href] = {
                "birthdate": author_soup.find('span', class_ = 'author-born-date').text,
                "description": author_soup.find('div', class_ = 'author-description').text,
            }
        
        # Author Details Dictionary (a fresh dict per quote, so documents do not
        # share one mutable object)
        author = {
            "name": author_name,
            **author_details_by_href[href],
        }
        
        # ENTIRE Quote Dictionary to be inserted into MongoDB
        data = {
            '__id': __id,
            'quote': quote,
            'author': author,
            'tags': tags_list,
        }
        total_quotes.append(data)
        
        # Insert dictionary into MongoDB as a document.
        # NOTE: every run of this cell inserts the documents again; nothing here
        # checks whether they already exist in the collection.
        collection.insert_one(data)


In [23]:
import pandas as pd
from sqlalchemy import create_engine

In [24]:
# load the quotes csv into a dataframe and preview the first rows
csv_file = "resources/quotes.csv"
quotes_df = pd.read_csv(filepath_or_buffer=csv_file)
quotes_df.head()

Unnamed: 0,_id,__id,quote,author.name,author.birthdate,author.description,tags
0,5fc9893f8ed0f58be90cf910,1,“The world as we have created it is a process ...,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in...","[""change"",""deep-thoughts"",""thinking"",""world""]"
1,5fc9893f8ed0f58be90cf911,2,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"July 31, 1965",\n See also: Robert GalbraithAlthough s...,"[""abilities"",""choices""]"
2,5fc989408ed0f58be90cf912,3,“There are only two ways to live your life. On...,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in...","[""inspirational"",""life"",""live"",""miracle"",""mira..."
3,5fc989408ed0f58be90cf913,4,"“The person, be it gentleman or lady, who has ...",Jane Austen,"December 16, 1775",\n Jane Austen was an English novelist ...,"[""aliteracy"",""books"",""classic"",""humor""]"
4,5fc989408ed0f58be90cf914,5,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"June 01, 1926",\n Marilyn Monroe (born Norma Jeane Mor...,"[""be-yourself"",""inspirational""]"


In [25]:
# review columns: display the column labels as read from the csv,
# before they are renamed in the next cell
quotes_df.columns

Index(['_id', '__id', 'quote', 'author.name', 'author.birthdate',
       'author.description', 'tags'],
      dtype='object')

In [26]:
# clean df
# Rename by explicit old-name -> new-name mapping rather than by position, so a
# change in the csv's column order cannot silently mislabel columns.
# Labels not present in the frame are left alone, so re-running this cell is safe.
# '_id' becomes 'quote_id' and '__id' (the scraper's running counter) becomes 'id'.
quotes_df = quotes_df.rename(columns={
    '_id': 'quote_id',
    '__id': 'id',
    'quote': 'text',
    'author.name': 'name',
    'author.birthdate': 'born',
    'author.description': 'description',
})
# 'tags' keeps its name; fail early if any column the later cells select is absent
expected_columns = ['quote_id', 'id', 'text', 'name', 'born', 'description', 'tags']
missing_columns = [col for col in expected_columns if col not in quotes_df.columns]
assert not missing_columns, f"quotes_df is missing columns: {missing_columns}"

In [27]:
# quotes frame: an independent copy holding only the id, author name and quote text
quote_columns = ['id', 'name', 'text']
new_quotes_df = quotes_df.loc[:, quote_columns].copy()
new_quotes_df.head()

Unnamed: 0,id,name,text
0,1,Albert Einstein,“The world as we have created it is a process ...
1,2,J.K. Rowling,"“It is our choices, Harry, that show what we t..."
2,3,Albert Einstein,“There are only two ways to live your life. On...
3,4,Jane Austen,"“The person, be it gentleman or lady, who has ..."
4,5,Marilyn Monroe,"“Imperfection is beauty, madness is genius and..."


In [28]:
# author info frame: an independent copy holding id, name, birth date and description
# (still one row per quote, so an author with several quotes appears several times)
author_columns = ['id', 'name', 'born', 'description']
new_author_df = quotes_df.loc[:, author_columns].copy()
new_author_df.head()

Unnamed: 0,id,name,born,description
0,1,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in..."
1,2,J.K. Rowling,"July 31, 1965",\n See also: Robert GalbraithAlthough s...
2,3,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in..."
3,4,Jane Austen,"December 16, 1775",\n Jane Austen was an English novelist ...
4,5,Marilyn Monroe,"June 01, 1926",\n Marilyn Monroe (born Norma Jeane Mor...


In [29]:
# tags frame: an independent copy holding id, quote_id and the tags column
tag_columns = ['id', 'quote_id', 'tags']
new_tag_df = quotes_df.loc[:, tag_columns].copy()
new_tag_df.head()

Unnamed: 0,id,quote_id,tags
0,1,5fc9893f8ed0f58be90cf910,"[""change"",""deep-thoughts"",""thinking"",""world""]"
1,2,5fc9893f8ed0f58be90cf911,"[""abilities"",""choices""]"
2,3,5fc989408ed0f58be90cf912,"[""inspirational"",""life"",""live"",""miracle"",""mira..."
3,4,5fc989408ed0f58be90cf913,"[""aliteracy"",""books"",""classic"",""humor""]"
4,5,5fc989408ed0f58be90cf914,"[""be-yourself"",""inspirational""]"


In [39]:
# connect to postgresql
# The connection URL (user, password, host, port, database) is read from the
# DATABASE_URL environment variable so that no credentials are stored in the
# notebook. NOTE: the password that used to be hardcoded here has been exposed
# and should be rotated.
rds_connection_string = os.environ.get("DATABASE_URL")
if not rds_connection_string:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to the PostgreSQL connection URL"
    )
# SQLAlchemy expects the "postgresql://" scheme; rewrite the legacy "postgres://" form
if rds_connection_string.startswith("postgres://"):
    rds_connection_string = "postgresql://" + rds_connection_string[len("postgres://"):]
engine = create_engine(rds_connection_string)

In [40]:
# write the quotes frame to the "quotes" table in postgresql, without the index;
# 'append' adds to an existing table, so re-running this cell inserts the rows again
new_quotes_df.to_sql(
    name='quotes',
    con=engine,
    index=False,
    if_exists='append',
)

In [41]:
# write the author frame to the "author" table in postgresql, without the index;
# 'append' adds to an existing table, so re-running this cell inserts the rows again
new_author_df.to_sql(
    name='author',
    con=engine,
    index=False,
    if_exists='append',
)

In [42]:
# write the tags frame to the "tags" table in postgresql, without the index;
# 'append' adds to an existing table, so re-running this cell inserts the rows again
new_tag_df.to_sql(
    name='tags',
    con=engine,
    index=False,
    if_exists='append',
)

In [43]:
# read the "author" table back from postgresql and preview its first rows
pd.read_sql_query(sql='select * from author', con=engine).head()

Unnamed: 0,id,name,born,description
0,1,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in..."
1,2,J.K. Rowling,"July 31, 1965",\n See also: Robert GalbraithAlthough s...
2,3,Albert Einstein,"March 14, 1879","\n In 1879, Albert Einstein was born in..."
3,4,Jane Austen,"December 16, 1775",\n Jane Austen was an English novelist ...
4,5,Marilyn Monroe,"June 01, 1926",\n Marilyn Monroe (born Norma Jeane Mor...


In [44]:
# read the "quotes" table back from postgresql and preview its first rows
pd.read_sql_query(sql='select * from quotes', con=engine).head()

Unnamed: 0,id,name,text
0,1,Albert Einstein,“The world as we have created it is a process ...
1,2,J.K. Rowling,"“It is our choices, Harry, that show what we t..."
2,3,Albert Einstein,“There are only two ways to live your life. On...
3,4,Jane Austen,"“The person, be it gentleman or lady, who has ..."
4,5,Marilyn Monroe,"“Imperfection is beauty, madness is genius and..."


In [45]:
# read the "tags" table back from postgresql and preview its first rows
pd.read_sql_query(sql='select * from tags', con=engine).head()

Unnamed: 0,id,quote_id,tags
0,1,5fc9893f8ed0f58be90cf910,"[""change"",""deep-thoughts"",""thinking"",""world""]"
1,2,5fc9893f8ed0f58be90cf911,"[""abilities"",""choices""]"
2,3,5fc989408ed0f58be90cf912,"[""inspirational"",""life"",""live"",""miracle"",""mira..."
3,4,5fc989408ed0f58be90cf913,"[""aliteracy"",""books"",""classic"",""humor""]"
4,5,5fc989408ed0f58be90cf914,"[""be-yourself"",""inspirational""]"
