In [1]:
import os
import re
from datetime import datetime

import nest_asyncio
import numpy as np
import pandas as pd
from requests_html import HTMLSession, HTML

In [2]:
# Base Audible search URL: keyword query "the great courses" with 50 results
# per page. scrape_great_courses() appends '&page=N' to this string.
url = 'https://www.audible.com/search?pf_rd_p=1d79b443-2f1d-43a3-b1dc-31a2cd242566&pf_rd_r=HK8P1MY097JB8VJ6PRTQ&ref=a_search_c4_pageSize_3&keywords=the+great+courses&pageSize=50'

In [3]:
def _fields_with(text_fields, marker):
    """Return the entries of ``text_fields`` that contain ``marker``, in order."""
    return [field for field in text_fields if marker in field]


def scrape_great_courses(url, pages=15):
    """Scrape Audible search-result pages into a list of course records.

    Parameters
    ----------
    url : str
        Base search URL. ``'&page=N'`` is appended for ``N = 1 .. pages``.
    pages : int, optional
        Number of result pages to request. Defaults to 15, the value that
        used to be hard-coded in the function body.

    Returns
    -------
    list of dict
        One dict per ``li.bc-list-item.productListItem`` element with the
        keys ``title``, ``price`` (float), ``length`` (raw text),
        ``rating`` (float), ``rating_count`` (int), ``link`` and ``series``
        (``'N/A'`` when the listing has no ``Series:`` line).
        ``release_date`` (a ``datetime``) is present only when the listing
        has a ``Release date:`` line.

    Raises
    ------
    IndexError
        If a listing has no price, length, rating or ``/pd/`` link.
    ValueError
        If a price, rating, rating count or release date cannot be parsed.
    """
    data = []
    # The session is closed on exit so pooled connections are not leaked.
    with HTMLSession() as sess:
        for page in range(1, pages + 1):
            r = sess.get(url + '&page=' + str(page))
            for item in r.html.find('li.bc-list-item.productListItem'):
                text_fields = item.text.split('\n')
                # One line carries both values: "<rating> out of 5 stars <count>".
                stars = _fields_with(text_fields, 'stars')[0]
                dict_entry = {
                    'title': text_fields[0],
                    # Built-in float/int: the np.float / np.int aliases were
                    # removed from NumPy and were only ever the built-ins.
                    'price': float(_fields_with(text_fields, 'Regular')[0].split('$')[1]),
                    'length': _fields_with(text_fields, 'Length')[0].split(': ')[1],
                    'rating': float(stars.split(' out')[0]),
                    'rating_count': int(stars.split('stars ')[1].replace(',', '')),
                    'link': 'https://www.audible.com'
                            + [link for link in item.links if '/pd/' in link][0],
                }

                series = _fields_with(text_fields, 'Series:')
                dict_entry['series'] = series[0].split('Series: ')[1] if series else 'N/A'

                released = _fields_with(text_fields, 'Release date:')
                if released:
                    dict_entry['release_date'] = datetime.strptime(
                        released[0].split(': ')[1], '%m-%d-%y')

                data.append(dict_entry)
    return data

In [4]:
# Run the live scrape and build a DataFrame with one row per scraped listing.
df = pd.DataFrame(data=scrape_great_courses(url))

In [5]:
# Preview the first rows of the scraped catalogue.
df.head()

Unnamed: 0,length,link,price,rating,rating_count,release_date,series,title
0,43 hrs and 23 mins,https://www.audible.com/pd/The-History-of-the-...,59.95,4.5,2315,2013-07-08,The Great Courses: Modern History,"The History of the United States, 2nd Edition"
1,12 hrs and 39 mins,https://www.audible.com/pd/Your-Best-Brain-The...,34.95,4.5,2629,2014-11-14,The Great Courses: Psychology,Your Best Brain: The Science of Brain Improvement
2,18 hrs and 15 mins,https://www.audible.com/pd/The-Story-of-Human-...,41.95,4.5,3613,2013-07-08,The Great Courses: Linguistics,The Story of Human Language
3,36 hrs and 34 mins,https://www.audible.com/pd/How-to-Listen-to-an...,59.95,4.5,2337,2013-07-08,The Great Courses: Fine Arts & Music,"How to Listen to and Understand Great Music, 3..."
4,31 hrs and 18 mins,https://www.audible.com/pd/Critical-Business-S...,59.95,4.5,2171,2015-04-08,The Great Courses: Professional,Critical Business Skills for Success


In [6]:
def scrape_sale_courses(pages=5, directory='/Users/tobymanders/Desktop'):
    """Parse locally saved Audible sale pages into a list of course records.

    Reads ``<directory>/1.html`` through ``<directory>/<pages>.html``.

    Parameters
    ----------
    pages : int, optional
        Number of saved pages to read. Defaults to 5, the value that used to
        be hard-coded.
    directory : str, optional
        Folder holding the saved pages. Defaults to the location that used
        to be hard-coded, so existing zero-argument calls are unaffected.

    Returns
    -------
    list of dict
        One dict per ``li.bc-list-item.productListItem`` element with
        ``title`` and ``sale`` (always ``'Yes'``). ``member-price`` (float)
        is added only when the listing has a line containing ``'Member'``.

    Raises
    ------
    OSError
        If one of the page files cannot be opened.
    IndexError, ValueError
        If a ``'Member'`` line does not contain a parseable ``$`` amount.
    """
    data = []
    for page in range(1, pages + 1):
        pageurl = os.path.join(directory, '{}.html'.format(page))
        with open(pageurl) as file:
            html = file.read()
        for item in HTML(html=html).find('li.bc-list-item.productListItem'):
            text_fields = item.text.split('\n')
            dict_entry = {
                'title': text_fields[0],
                'sale': 'Yes',
            }
            member_lines = [s for s in text_fields if 'Member' in s]
            if member_lines:
                # Built-in float: the np.float alias was removed from NumPy.
                # The amount sits between '$' and ' or' on the member line.
                dict_entry['member-price'] = float(
                    member_lines[0].split('$')[1].split(' or')[0])
            data.append(dict_entry)
    return data

In [7]:
# One row per listing found in the locally saved sale pages.
sale_df = pd.DataFrame(data=scrape_sale_courses())

In [8]:
# Attach the sale columns to the catalogue by matching on title; DataFrame.join
# is a left join by default, so courses with no match get NaN in the sale columns.
# NOTE(review): a title that occurs more than once in sale_df would duplicate
# the matching catalogue row - confirm titles are unique.
df_merge = df.join(sale_df.set_index('title'), on='title')

In [9]:
# Preview the merged frame, including the new sale columns.
df_merge.head()

Unnamed: 0,length,link,price,rating,rating_count,release_date,series,title,member-price,sale
0,43 hrs and 23 mins,https://www.audible.com/pd/The-History-of-the-...,59.95,4.5,2315,2013-07-08,The Great Courses: Modern History,"The History of the United States, 2nd Edition",,Yes
1,12 hrs and 39 mins,https://www.audible.com/pd/Your-Best-Brain-The...,34.95,4.5,2629,2014-11-14,The Great Courses: Psychology,Your Best Brain: The Science of Brain Improvement,24.46,Yes
2,18 hrs and 15 mins,https://www.audible.com/pd/The-Story-of-Human-...,41.95,4.5,3613,2013-07-08,The Great Courses: Linguistics,The Story of Human Language,,Yes
3,36 hrs and 34 mins,https://www.audible.com/pd/How-to-Listen-to-an...,59.95,4.5,2337,2013-07-08,The Great Courses: Fine Arts & Music,"How to Listen to and Understand Great Music, 3...",41.96,Yes
4,31 hrs and 18 mins,https://www.audible.com/pd/Critical-Business-S...,59.95,4.5,2171,2015-04-08,The Great Courses: Professional,Critical Business Skills for Success,41.96,Yes


In [10]:
# Courses with no match in the sale pages have NaN here after the join; mark them 'No'.
df_merge['sale'] = df_merge['sale'].fillna('No')

In [11]:
# Preview the frame after filling the 'sale' column.
df_merge.head()

Unnamed: 0,length,link,price,rating,rating_count,release_date,series,title,member-price,sale
0,43 hrs and 23 mins,https://www.audible.com/pd/The-History-of-the-...,59.95,4.5,2315,2013-07-08,The Great Courses: Modern History,"The History of the United States, 2nd Edition",,Yes
1,12 hrs and 39 mins,https://www.audible.com/pd/Your-Best-Brain-The...,34.95,4.5,2629,2014-11-14,The Great Courses: Psychology,Your Best Brain: The Science of Brain Improvement,24.46,Yes
2,18 hrs and 15 mins,https://www.audible.com/pd/The-Story-of-Human-...,41.95,4.5,3613,2013-07-08,The Great Courses: Linguistics,The Story of Human Language,,Yes
3,36 hrs and 34 mins,https://www.audible.com/pd/How-to-Listen-to-an...,59.95,4.5,2337,2013-07-08,The Great Courses: Fine Arts & Music,"How to Listen to and Understand Great Music, 3...",41.96,Yes
4,31 hrs and 18 mins,https://www.audible.com/pd/Critical-Business-S...,59.95,4.5,2171,2015-04-08,The Great Courses: Professional,Critical Business Skills for Success,41.96,Yes


In [12]:
def fix_mins(x):
    """Normalise an Audible length string to ``'HH hrs MM mins'``.

    The hour and minute counts are taken from the digits that directly
    precede ``hr`` and ``min`` in ``x``; a missing component counts as
    ``'00'``. The usual forms ("43 hrs and 23 mins", "45 mins", "3 hrs")
    give the same result as the earlier split-based parsing. Forms that
    used to fail or produce garbage now parse too: "1 hr 5 mins" no longer
    raises, and "Less than 1 minute" becomes "00 hrs 01 mins".

    Parameters
    ----------
    x : str
        Raw length text.

    Returns
    -------
    str
        The value returned by ``make_len`` for the extracted hour and
        minute strings.
    """
    hours = re.search(r'(\d+)\s*hr', x)
    minutes = re.search(r'(\d+)\s*min', x)
    hrs = hours.group(1) if hours else '00'
    mins = minutes.group(1) if minutes else '00'
    return make_len(hrs, mins)

def make_len(hrs, mins):
    """Format hour and minute strings as ``'<hrs> hrs <mins> mins'``.

    A component shorter than two characters gets a single ``'0'`` put in
    front of it; longer components are used unchanged.
    """
    hrs_part = hrs if len(hrs) >= 2 else '0' + hrs
    mins_part = mins if len(mins) >= 2 else '0' + mins
    return hrs_part + ' hrs ' + mins_part + ' mins'

In [13]:
# Rewrite every length string into the zero-padded 'HH hrs MM mins' form.
df_merge['length'] = df_merge['length'].apply(fix_mins)

In [14]:
# Columns, in output order, written by the to_csv exports below.
columns = ['title', 'sale', 'price', 'member-price', 'length', 'rating', 'rating_count', 'release_date', 'series', 'link']

In [15]:
# Export the merged catalogue (ratings as shown on the search-result pages).
df_merge.to_csv('great_courses_all_titles_v2.csv', columns=columns, index=False)

In [16]:
# Product-page URLs, one per row, used to fetch the more precise ratings.
url_list = list(df_merge['link'])

In [17]:
# Row count of the merged frame (equals the number of product pages to fetch).
len(df_merge)

724

In [50]:
def get_accurate_ratings(addresses, rating_dict):
    """Fetch each product page and record the rating parsed from it.

    Parameters
    ----------
    addresses : iterable of str
        Product-page URLs to request.
    rating_dict : dict
        Updated in place: ``rating_dict[address]`` is set to the parsed
        rating as a float, or to NaN when the page cannot be parsed.

    Returns
    -------
    dict
        The same ``rating_dict`` object that was passed in.

    Notes
    -----
    Only parsing is best-effort. An exception raised by the HTTP request
    itself still propagates, exactly as before.
    """
    # One session for the whole batch, closed on exit, instead of a new
    # never-closed session per address.
    with HTMLSession() as sess:
        for address in addresses:
            r = sess.get(address)
            try:
                # The second 'div.bc-row.bc-spacing-small' element is expected
                # to hold "... stars <rating> ..."; keep the first 3 characters.
                summary = r.html.find('div.bc-row.bc-spacing-small')[1].text
                rating_dict[address] = float(summary.split(' stars ')[1][:3])
            except Exception:
                # Deliberate NaN-on-failure. `Exception` rather than a bare
                # except, so KeyboardInterrupt / SystemExit still stop the run.
                rating_dict[address] = np.nan
    return rating_dict

In [51]:
from threading import Thread

def threaded_ratings(nthreads, addresses, rating_dict=None):
    """Run ``get_accurate_ratings`` over ``addresses`` on several threads.

    Thread ``i`` handles the slice ``addresses[i::nthreads]``. All threads
    write into the same dictionary.

    Parameters
    ----------
    nthreads : int
        Number of slices to split ``addresses`` into. No thread is started
        for a slice that turns out to be empty.
    addresses : sequence of str
        Product-page URLs; must support extended slicing and ``len``.
    rating_dict : dict, optional
        Dictionary to fill. A new one is created when omitted.

    Returns
    -------
    dict
        ``rating_dict``, after every started thread has finished.
    """
    if rating_dict is None:
        rating_dict = {}

    threads = []
    for i in range(nthreads):
        add_subset = addresses[i::nthreads]
        if len(add_subset) == 0:
            # More slices requested than addresses: nothing to fetch here.
            continue
        threads.append(Thread(target=get_accurate_ratings, args=(add_subset, rating_dict)))

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    return rating_dict

In [52]:
# Fetch every product page, split across 64 threads; maps URL -> rating (NaN when unparsable).
rating_dict = threaded_ratings(64, url_list)

In [58]:
# Overwrite the search-page rating with the value looked up per product link.
# Raises KeyError if a link is missing from rating_dict.
df_merge['rating'] = df_merge['link'].apply(lambda x: rating_dict[x])

In [59]:
# Preview the frame with the replaced 'rating' column.
df_merge.head()

Unnamed: 0,length,link,price,rating,rating_count,release_date,series,title,member-price,sale
0,43 hrs 23 mins,https://www.audible.com/pd/The-History-of-the-...,59.95,4.7,2315,2013-07-08,The Great Courses: Modern History,"The History of the United States, 2nd Edition",,Yes
1,12 hrs 39 mins,https://www.audible.com/pd/Your-Best-Brain-The...,34.95,4.5,2629,2014-11-14,The Great Courses: Psychology,Your Best Brain: The Science of Brain Improvement,24.46,Yes
2,18 hrs 15 mins,https://www.audible.com/pd/The-Story-of-Human-...,41.95,4.7,3613,2013-07-08,The Great Courses: Linguistics,The Story of Human Language,,Yes
3,36 hrs 34 mins,https://www.audible.com/pd/How-to-Listen-to-an...,59.95,4.7,2337,2013-07-08,The Great Courses: Fine Arts & Music,"How to Listen to and Understand Great Music, 3...",41.96,Yes
4,31 hrs 18 mins,https://www.audible.com/pd/Critical-Business-S...,59.95,4.6,2171,2015-04-08,The Great Courses: Professional,Critical Business Skills for Success,41.96,Yes


In [62]:
# Export the catalogue again, now carrying the per-product-page ratings.
df_merge.to_csv('great_courses_list_v3.csv', columns=columns, index=False)