# Data Acquisition

In [1]:
# data manipulation
import numpy as np
import pandas as pd

# visualization
import matplotlib as plt
import seaborn as sns

# web scraping
import requests
from bs4 import BeautifulSoup
from time import sleep

## 1. Review Data

### 1.1 URLs

Basenotes.net has 5,315 pages of fragrance reviews. To scrape review data from these pages, I first need a list of URLs, one for each page of reviews.

In [2]:
# common prefix of every review-page URL; the page number is appended to it
base_url = 'http://www.basenotes.net/fragrancereviews/page/'

# review pages are numbered 1 through 5,315 (the end of a range is exclusive)
page_range = range(1, 5315 + 1)

In [3]:
def create_url_list(base_url, id_list):
    '''Return one URL per entry of `id_list`.

    Each URL is `base_url` followed by the string form of the entry, in the
    same order as `id_list`.
    '''
    return [base_url + str(page_id) for page_id in id_list]

In [5]:
# build the full list of review-page URLs: base_url followed by each page number in page_range
url_list = create_url_list(base_url, page_range)

In [6]:
# sanity check: there should be exactly one URL per review page (5,315 in total)
len(url_list)

5315

### 1.2 Soup

In [7]:
def urls_to_soups(url_list, delay=7, timeout=30, max_attempts=3):
    '''Download each URL and return the pages that could be parsed.

    Parameters
    ----------
    url_list : iterable of str
        URLs to request, in order.
    delay : int or float, optional
        Seconds to pause after every request so the site is not hammered.
        After a failed attempt the pause is multiplied by the attempt number.
    timeout : int or float, optional
        Seconds to wait for the server before giving up on one request.
    max_attempts : int, optional
        How many times a URL is requested before it is skipped. Only
        connection errors, HTTP 429 and HTTP 5xx responses are retried.

    Returns
    -------
    list
        One BeautifulSoup object per URL that answered with HTTP 200, in the
        order of `url_list`. URLs that never succeeded are left out; each
        failed attempt is reported with `print`.
    '''
    soup_list = []

    for url in url_list:
        for attempt in range(1, max_attempts + 1):
            try:
                # without a timeout a single stalled connection blocks the whole scrape
                response = requests.get(url, timeout=timeout)
                status = response.status_code
            except requests.RequestException as error:
                # keep going: one network hiccup must not discard the pages already collected
                response = None
                status = error

            if status == 200:
                soup_list.append(BeautifulSoup(response.content, 'html.parser'))
                sleep(delay)
                break

            print('{} (attempt {}/{}): {}'.format(url, attempt, max_attempts, status))

            # back off after a failure as well; an overloaded server needs the pause most
            sleep(delay * attempt)

            # other HTTP statuses (e.g. 404) will not improve on a retry
            retryable = response is None or status == 429 or status >= 500
            if not retryable:
                break

    return soup_list

In [8]:
# request and parse every page in url_list; pages that do not answer with HTTP 200 are left out of soup_list
soup_list = urls_to_soups(url_list)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


503
503
503
503


In [10]:
# number of pages that were parsed; in the recorded run 4 of the 5,315 requests returned 503
len(soup_list)

5311

In [28]:
# column order of the resulting dataframe (also used when there are no reviews at all)
_REVIEW_FIELDS = [
    'rating_value',
    'review_id',
    'user_id',
    'user_name',
    'user_location',
    'scent_id',
    'scent_name',
    'scent_brand',
    'review_text',
]


def _parse_review(review):
    '''Extract the fields of one `reviewmain` element into a dictionary.'''
    # reviewauthor section: profile link plus "<name> Show all reviews <location>" text
    user_section = review.find('div', {'class': 'reviewauthor'})
    user_parts = user_section.text.split(' Show all reviews ')

    # reviewblurb section: a link titled "<scent> by <brand>" followed by the review text
    blurb_section = review.find('div', {'class': 'reviewblurb'})
    scent_link = blurb_section.find('a')
    scent_title = scent_link.text
    title_parts = scent_title.split(' by ')

    # Cut after the whole link title rather than after the first occurrence of
    # the brand: a scent such as "Chanel No 5 by Chanel" contains its brand in
    # the name, which would leave part of the title in the review text.
    blurb_text = blurb_section.text
    if scent_title:
        review_text = blurb_text.split(scent_title, 1)[-1]
    else:
        # an empty separator would raise ValueError; keep the text untouched
        review_text = blurb_text

    return {
        # last character of the element's last CSS class
        'rating_value': review['class'][-1][-1],
        # part of the element id after the last hyphen
        'review_id': review['id'].split('-')[-1],
        'user_id': user_section.find('a')['href'].split('/')[-1],
        'user_name': user_parts[0],
        'user_location': user_parts[-1],
        # text between the last "ID" and the following "." in the link target
        'scent_id': scent_link['href'].split('ID')[-1].split('.')[0],
        'scent_name': title_parts[0],
        'scent_brand': title_parts[-1],
        'review_text': review_text,
    }


def soups_to_dataframe(soup_list):
    '''Parse data from soup for each review page and return a dataframe.

    Parameters
    ----------
    soup_list : iterable
        Parsed review pages. Every `div` with class `reviewmain` on a page
        becomes one row.

    Returns
    -------
    pandas.DataFrame
        One row per review, in page order, with the columns listed in
        `_REVIEW_FIELDS`. All values are strings. An empty input yields an
        empty dataframe that still has those columns.
    '''
    # Collect every record first and build the dataframe once: growing a
    # dataframe page by page is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    records = [
        _parse_review(review)
        for soup in soup_list
        for review in soup.find_all('div', {'class': 'reviewmain'})
    ]

    return pd.DataFrame(records, columns=_REVIEW_FIELDS)

In [29]:
# turn every review on every parsed page into one row of reviews_df
reviews_df = soups_to_dataframe(soup_list)

In [24]:
# persist the scraped reviews to CSV (without the index column) so later work does not need a new scrape
reviews_df.to_csv('basenotes_reviews_df.csv', index=False)