In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# my imports
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Scraping data for bestseller books from Wikipedia

Set the base URL of the page to scrape. For this project I'm using Wikipedia's 'List of best-selling books' page.

In [3]:
# Wikipedia page that is requested and parsed in the cells below.
base_url = 'https://en.wikipedia.org/wiki/List_of_best-selling_books'

Pull the HTML from the site with a GET request and create a BeautifulSoup object from it.

In [4]:
# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error (4xx/5xx) into an
# exception instead of silently parsing an error page.
page = requests.get(base_url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets Beautiful Soup
# pick whichever HTML parser happens to be installed, so results could differ
# between machines. 'html.parser' ships with Python.
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Collect every <table> element carrying the 'wikitable' CSS class.
tables = soup.find_all('table', class_ = 'wikitable')

In [6]:
# Number of 'wikitable' tables found on the page.
len(tables)

14

In [7]:
# Keep only the first three tables; the full list is deleted in the next cell.
# NOTE(review): presumably these are the individual-book tables. The slice is
# positional, so confirm it still matches the live page layout.
bestsellers_books = tables[:3]

In [8]:
# Drop the full table list now that the needed slice is held in bestsellers_books.
del tables

In [9]:
# Header cells (<th>) of the first kept table.
headers = bestsellers_books[0].find_all('th')

In [10]:
# Whitespace-stripped text of each header cell; used below as the DataFrame columns.
header_labels = [label.text.strip() for label in headers]

In [11]:
# Inspect the extracted column labels.
header_labels

['Book',
 'Author(s)',
 'Original language',
 'First published',
 'Approximate sales',
 'Genre']

In [12]:
# Empty DataFrame whose columns are the scraped header labels.
books_df = pd.DataFrame(columns=header_labels)

In [13]:
# Show the still-empty frame to confirm the column layout.
books_df

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre


In [14]:
# Gather the <tr> rows of every kept table, skipping each table's first row.
table_rows = []
for book in bestsellers_books:
    body_rows = book.find_all('tr')[1:]
    table_rows += body_rows

In [15]:
# Build every record first and create the frame in a single call.
# Appending with `books_df.loc[len(books_df)] = ...` grows the frame one row at
# a time and keeps adding duplicate rows whenever this cell is re-run; building
# the frame from scratch makes the cell idempotent.
book_records = []
for row in table_rows:
    row_data = [cell.text.strip() for cell in row.find_all('td')]

    # Rows without any <td> cells carry no data; skip them.
    if not row_data:
        continue

    # Fail loudly on a malformed row rather than letting pandas pad or
    # misalign the values.
    if len(row_data) != len(header_labels):
        raise ValueError(
            f'Expected {len(header_labels)} cells per row, got {len(row_data)}: {row_data}'
        )

    book_records.append(row_data)

books_df = pd.DataFrame(book_records, columns=header_labels)

In [16]:
# Inspect the populated frame.
books_df

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre
0,A Tale of Two Cities,Charles Dickens,English,1859,>200 million[20],Historical fiction
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200 million[21][22][23],"Fantasy, children's fiction"
2,The Alchemist (O Alquimista),Paulo Coelho,Portuguese,1988,150 million[24][25],Fantasy
3,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120 million[26][27],"Fantasy, children's fiction"
4,And Then There Were None,Agatha Christie,English,1939,100 million[28],Mystery
...,...,...,...,...,...,...
96,Fear of Flying,Erica Jong,English,1973,20 million[119],Romantic novel
97,Dune,Frank Herbert,English,1965,20 million[120],Science fiction novel
98,Charlie and the Chocolate Factory,Roald Dahl,English,1964,20 million[121],Children's fantasy novel
99,The Naked Ape,Desmond Morris,English,1968,20 million[122],"Social Science, Anthropology, Psychology"


In [17]:
# Shape of the subset of rows whose sales text contains the word 'million'.
mentions_million = books_df['Approximate sales'].str.contains('million')
books_df.loc[mentions_million].shape

(101, 6)

All values contain a number (optionally prefixed with '>') followed by 'million', with citation markers such as '[20]' after it.

In [18]:
# Keep only the "<number> million" part of each sales string, dropping the
# trailing citation markers such as "[20]". A leading ">" is preserved, and an
# optional decimal part is matched so that a value like "18.5 million" is not
# cut down to "5 million". Values that do not match become NaN.
books_df['Approximate sales'] = books_df['Approximate sales'].str.extract(
    r'(>?\d+(?:\.\d+)?\s*million)', expand=False
)

In [19]:
# Preview the first rows after cleaning the sales column.
books_df.head()

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales,Genre
0,A Tale of Two Cities,Charles Dickens,English,1859,>200 million,Historical fiction
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200 million,"Fantasy, children's fiction"
2,The Alchemist (O Alquimista),Paulo Coelho,Portuguese,1988,150 million,Fantasy
3,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120 million,"Fantasy, children's fiction"
4,And Then There Were None,Agatha Christie,English,1939,100 million,Mystery


In [24]:
# Replace the scraped header labels with snake_case column names.
# The assignment is positional, so the names follow the current column order.
renamed_columns = [
    'book_title',
    'author',
    'original_language',
    'first_published_year',
    'approximate_sales',
    'genre',
]
books_df.columns = renamed_columns

In [28]:
# Reorder the columns so the publication year comes first and the sales last.
column_order = [
    'first_published_year',
    'book_title',
    'author',
    'original_language',
    'genre',
    'approximate_sales',
]
books_df = books_df.loc[:, column_order]

In [29]:
# Write the cleaned table to CSV without the index column.
# The target folder is created first so a fresh checkout without `data/` does
# not fail here, and `index=False` is the documented way to omit the index
# (`index=None` only worked because None is falsy).
output_path = Path('data/cleaned_bestsellers_approximated_sales.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
books_df.to_csv(output_path, index=False)