In [23]:
%matplotlib inline

<a id='scrape'></a>

In [24]:
# my imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


## Scraping data for bestseller books from Wikipedia

Set the base URL of the page to scrape. This project uses Wikipedia's 'List of best-selling books' page.

In [25]:
# Wikipedia page that is requested and parsed in the cells below.
base_url = 'https://en.wikipedia.org/wiki/List_of_best-selling_books'

Fetch the page's HTML with a GET request and parse it into a BeautifulSoup object.

In [26]:
# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces an HTTP error (4xx/5xx)
# here instead of letting an error page be parsed as if it were the article.
page = requests.get(base_url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup
# pick whichever HTML parser happens to be installed, so the parse could
# differ between environments. 'html.parser' ships with Python.
soup = BeautifulSoup(page.text, 'html.parser')

In [27]:
# Gather every <table> element that carries the 'wikitable' CSS class.
tables = soup.find_all('table', attrs={'class': 'wikitable'})

In [28]:
# Sanity check: how many 'wikitable' tables were found on the page.
len(tables)

14

In [29]:
# Keep only the first three tables found on the page.
# NOTE(review): the count 3 is hard-coded and presumably matches the page
# layout at scrape time — confirm whenever the page is re-scraped.
bestsellers_books = tables[:3]

In [30]:
# Remove the name `tables`; the selected subset lives in bestsellers_books.
del tables

In [31]:
# The header cells (<th>) of the first selected table supply the column names.
first_table = bestsellers_books[0]
headers = first_table.find_all('th')

In [32]:
# Collect the whitespace-stripped text of each header cell, in document order.
header_labels = []
for header_cell in headers:
    header_labels.append(header_cell.text.strip())

In [33]:
# Display the extracted column labels.
header_labels

['Book',
 'Author(s)',
 'Original language',
 'First published',
 'Approximate sales',
 'Genre']

In [34]:
# Start from an empty frame whose columns are the scraped header labels.
books_df = pd.DataFrame(columns=header_labels)

In [35]:
# Display the frame to confirm the column layout before any rows are added.
books_df

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre


In [36]:
# Flatten the <tr> rows of all selected tables into one list, skipping the
# first row of each table.
table_rows = [
    row
    for table in bestsellers_books
    for row in table.find_all('tr')[1:]
]

In [37]:
# Build the frame in one shot instead of appending with .loc[len(books_df)]
# inside the loop: row-by-row growth re-allocates on every insert, and
# re-running the cell would append every row a second time. Constructing
# the frame from a list of records makes the cell idempotent.
expected_width = len(header_labels)
row_records = []
for row in table_rows:
    # Stripped text of each data cell (<td>) in this table row.
    row_data = [cell.text.strip() for cell in row.find_all('td')]
    # Fail loudly on a row that does not line up with the header labels,
    # rather than letting pandas pad it with missing values.
    if len(row_data) != expected_width:
        raise ValueError(
            f'Expected {expected_width} cells per row, got {len(row_data)}: {row_data!r}'
        )
    row_records.append(row_data)

books_df = pd.DataFrame(row_records, columns=header_labels)

In [38]:
# Display the populated frame.
books_df

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre
0,A Tale of Two Cities,Charles Dickens,English,1859,>200 million[20],Historical fiction
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200 million[21][22][23],"Fantasy, children's fiction"
2,The Alchemist (O Alquimista),Paulo Coelho,Portuguese,1988,150 million[24][25],Fantasy
3,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120 million[26][27],"Fantasy, children's fiction"
4,And Then There Were None,Agatha Christie,English,1939,100 million[28],Mystery
...,...,...,...,...,...,...
97,Fear of Flying,Erica Jong,English,1973,20 million[120],Romantic novel
98,Dune,Frank Herbert,English,1965,20 million[121],Science fiction novel
99,Charlie and the Chocolate Factory,Roald Dahl,English,1964,20 million[122],Children's fantasy novel
100,The Naked Ape,Desmond Morris,English,1968,20 million[123],"Social Science, Anthropology, Psychology"


In [39]:
# Size of the subset of rows whose sales text mentions 'million'.
has_million = books_df['Approximate sales'].str.contains('million')
books_df.loc[has_million].shape

(102, 6)

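Every value contains a number followed by 'million'; some are prefixed with a qualifier (e.g. '>200 million') and followed by citation markers (e.g. '[20]'), so only the numeric part is kept below.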

In [40]:
# Pull the first run of digits out of each sales string and store it as an
# integer. Only the leading digit run is captured, so any text after it
# (including a fractional part, if one were present) is discarded.
sales_digits = books_df['Approximate sales'].str.extract(r'([\d]+)', expand=False)
books_df['Approximate sales'] = sales_digits.astype('int')

In [41]:
# Preview the first rows after converting the sales column to integers.
books_df.head()

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre
0,A Tale of Two Cities,Charles Dickens,English,1859,200,Historical fiction
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200,"Fantasy, children's fiction"
2,The Alchemist (O Alquimista),Paulo Coelho,Portuguese,1988,150,Fantasy
3,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120,"Fantasy, children's fiction"
4,And Then There Were None,Agatha Christie,English,1939,100,Mystery


In [42]:
# Replace the scraped headings with snake_case names, assigned by position.
renamed_columns = [
    'book_title',
    'author',
    'original_language',
    'first_published_year',
    'approximate_sales_in_millions',
    'genre',
]
books_df.columns = renamed_columns

In [43]:
# Reorder the columns: publication year first, sales figure last.
column_order = [
    'first_published_year',
    'book_title',
    'author',
    'original_language',
    'genre',
    'approximate_sales_in_millions',
]
books_df = books_df.loc[:, column_order]

In [44]:
# Write the cleaned table to CSV without the index column. The target
# directory is created first so the cell also works on a fresh checkout
# where 'data/' does not exist yet; index=False states the intent explicitly
# (index=None only worked because None is falsy).
output_path = Path('data/cleaned_bestsellers_approximated_sales.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
books_df.to_csv(output_path, index=False)

In [45]:
# Summary statistics, transposed so each described column becomes a row.
books_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
approximate_sales_in_millions,102.0,42.960784,33.215743,20.0,21.0,30.5,50.0,200.0
