In [13]:
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import requests as req
import time

In [14]:
# Landing page of the site being scraped; the category table is read from here.
url = "https://wheeloffortuneanswer.com/"

In [15]:
def get_request_with_delay(url, delay, timeout=30):
    """Sleep for ``delay`` seconds, then GET ``url`` and return the body text.

    Parameters
    ----------
    url : str
        Address to request.
    delay : int or float
        Seconds to pause (via ``time.sleep``) before the request is sent.
    timeout : int, float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout the request can
        block indefinitely on an unresponsive server, which would stall the
        whole scrape. Defaults to 30 seconds.

    Returns
    -------
    str
        The decoded response body (``res.text``). The HTTP status code is
        not inspected, so the body of an error page is returned as-is.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (e.g. connection failure or
        timeout).
    """
    time.sleep(delay)
    res = req.get(url, timeout=timeout)
    return res.text
    
    
def html_to_soup(html, parser='html.parser'):
    """Parse an HTML string into a BeautifulSoup tree.

    Parameters
    ----------
    html : str
        Markup to parse.
    parser : str, optional
        Name of the parser handed to BeautifulSoup. Naming it explicitly
        keeps the parse result independent of which optional parsers happen
        to be installed and avoids BeautifulSoup's "no parser was explicitly
        specified" warning. Defaults to the standard-library
        ``'html.parser'``; pass e.g. ``'lxml'`` to use another one.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.
    """
    return bs(html, features=parser)


def get_request_to_soup(url, delay):
    """Download ``url`` after waiting ``delay`` seconds and return it parsed.

    Thin composition of ``get_request_with_delay`` (fetch) and
    ``html_to_soup`` (parse); both arguments are passed straight to the
    fetch step.
    """
    return html_to_soup(get_request_with_delay(url, delay))


def get_wof_category_endpoints(url, delay, table_id='tablepress-73'):
    """Collect the link targets found in the first column of a table.

    Parameters
    ----------
    url : str
        Page that contains the table.
    delay : int or float
        Seconds to wait before requesting ``url``.
    table_id : str, optional
        ``id`` attribute of the table to read. Defaults to
        ``'tablepress-73'``, the id this notebook has always used.

    Returns
    -------
    list of str
        The ``href`` of the ``<a>`` inside each ``td.column-1`` cell of the
        table body, in document order. Cells that contain no link (or a link
        without an ``href``) are skipped.

    Raises
    ------
    ValueError
        If the page has no element with ``table_id`` or that element has no
        ``<tbody>``. Raised explicitly because BeautifulSoup returns ``None``
        for missing elements, which would otherwise surface as an opaque
        ``AttributeError`` on ``None``.
    """
    soup = get_request_to_soup(url, delay)

    table = soup.find(id=table_id)
    if table is None or table.tbody is None:
        raise ValueError(
            "No table body found for id {!r} at {}".format(table_id, url)
        )

    category_endpoints = []
    for td in table.tbody.find_all('td', class_='column-1'):
        link = td.a
        # td.a is None when the cell holds no anchor; .get() guards against
        # an anchor without an href attribute.
        if link is not None and link.get('href'):
            category_endpoints.append(link['href'])

    return category_endpoints

In [16]:
# Read the category links from the landing page, pausing 5 seconds before the request.
category_endpoints = get_wof_category_endpoints(url, delay=5)

In [None]:
# Test Block
# Dry run of the row-parsing logic against a single category page.
# @TODO: loop over all of category_endpoints (the `for` statement below) after
# sample testing has been completed

categories_and_phrases = []

for n in range(1):

    endpoint = category_endpoints[n]
    soup = get_request_to_soup(endpoint, 2)
    category = soup.find('h1', class_='page-title').text
    table = soup.find(id='tablepress-1')
    tbody = table.tbody
    rows = tbody.find_all('tr')

    category_and_phrases = {
        'category': category,
        'phrases': []
    }

    for row in rows:

        tds = row.find_all('td')

        # Text of the first four cells; a row with fewer than four cells is
        # padded with NaN so every record has the same fields. This replaces
        # four bare `except:` clauses that swallowed every exception just to
        # cover the "cell is missing" case.
        cell_texts = [td.text for td in tds[:4]]
        cell_texts += [np.nan] * (4 - len(cell_texts))
        phrase, number_of_words, total_number_of_letters, first_word_letters = cell_texts

        phrase_metadata = {
            'category': category,
            'phrase': phrase,
            'number_of_words': number_of_words,
            'total_number_of_letters': total_number_of_letters,
            'first_word_letters': first_word_letters
        }

        category_and_phrases['phrases'].append(phrase_metadata)

    categories_and_phrases.append(category_and_phrases)

In [17]:
# Actual Script
# Scrape every category page into `categories_and_phrases`: one dict per
# category, holding the category name and a list of per-phrase records.

categories_and_phrases = []
errors = []    # endpoints that failed at least one structural check (listed once each)
success = []   # endpoints whose rows were scraped

for endpoint in category_endpoints:

    soup = get_request_to_soup(endpoint, 2)

    #################################################################################
    # Unit Testing
    #################################################################################
    # BeautifulSoup's find() and tag-attribute access (table.tbody) return None
    # for a missing element instead of raising, so every check below tests the
    # value explicitly. Wrapping the lookups in try/except would let a missing
    # table or tbody "pass" and be blamed on a later check.

    heading = soup.find('h1', class_='page-title')
    table = soup.find(class_='tablepress')
    tbody = table.tbody if table is not None else None
    rows = tbody.find_all('tr') if tbody is not None else []

    # The h1 check verifies that the element exists; its text (possibly empty)
    # becomes the category name.
    category = heading.text if heading is not None else None

    unit_tests = {
        'all endpoints have an h1 tag with text': 'pass' if heading is not None else 'fail',
        'all endpoints have a table with a class of tablepress': 'pass' if table is not None else 'fail',
        'all endpoints with class = tablepress have a tbody': 'pass' if tbody is not None else 'fail',
        'all endpoints have a tbody with rows': 'pass' if rows else 'fail'
    }

    # Checking if all the unit tests passed

    if all(result == 'pass' for result in unit_tests.values()):
        unit_test_result = 'pass'
    else:
        unit_test_result = 'fail'

    if unit_test_result == 'fail':
        # Record the endpoint once, however many checks it failed, and skip it.
        errors.append(endpoint)
        continue

    success.append(endpoint)

    category_and_phrases = {
        'category': category,
        'phrases': []
    }

    #################################################################################
    # Iterating through the rows
    #################################################################################

    for row in rows:

        tds = row.find_all('td')

        # Text of the first four cells; rows with fewer than four cells are
        # padded with NaN so every record has the same fields.
        cell_texts = [td.text for td in tds[:4]]
        cell_texts += [np.nan] * (4 - len(cell_texts))
        phrase, number_of_words, total_number_of_letters, first_word_letters = cell_texts

        phrase_metadata = {
            'category': category,
            'phrase': phrase,
            'number_of_words': number_of_words,
            'total_number_of_letters': total_number_of_letters,
            'first_word_letters': first_word_letters
        }

        category_and_phrases['phrases'].append(phrase_metadata)

    categories_and_phrases.append(category_and_phrases)

In [18]:
# Print the outcome of each structural check held in `unit_tests`.
# The dict is reassigned for every endpoint in the scraping loop, so these are
# the results for the last endpoint processed.
for test_name in (
    'all endpoints have an h1 tag with text',
    'all endpoints have a table with a class of tablepress',
    'all endpoints with class = tablepress have a tbody',
    'all endpoints have a tbody with rows',
):
    print(unit_tests[test_name])

pass
pass
pass
pass


In [49]:
# Flatten the per-category 'phrases' lists into a single list of phrase records.
data = categories_and_phrases
combined_phrases = [
    phrase_record
    for category_entry in categories_and_phrases
    for phrase_record in category_entry['phrases']
]

In [53]:
# One row per phrase record; `df_renamed` carries human-readable column headers.
df = pd.DataFrame(combined_phrases)
df_renamed = df.rename(
    columns={
        'category': 'Category',
        'first_word_letters': 'First Word Letters',
        'number_of_words': 'Number of Words',
        'phrase': 'Phrase',
        'total_number_of_letters': 'Total Number of Letters',
    }
)

In [56]:
# Non-null count per column: a count below the number of rows would mean that
# column has missing values.

df.count(axis='index')

category                   75463
first_word_letters         75463
number_of_words            75463
phrase                     75463
total_number_of_letters    75463
dtype: int64

In [57]:
# Write the renamed frame to an Excel workbook: header row included, row index omitted.
df_renamed.to_excel(
    excel_writer='categories_and_phrases.xlsx',
    header=True,
    index=False,
)