In [3]:
import numpy as np
import pandas as pd
import os
import sys
from collections import defaultdict
from importlib import reload
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Getting Track-Artist-Album-Producer Data from a Wikipedia "List of Songs Produced by PERSON-X" Page

## Step 1: Retrieving a list of URLs from a Wikipedia list page

We want to scrape all of the URLs for the Wikipedia pages on songs produced by George Martin from this aggregate page: https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin. We locate the relevant links and return a list of URL strings.

In [8]:
# Preview the first 1000 characters of the decoded response body for the category page.
requests.get('https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin').content.decode()[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Category:Song recordings produced by George Martin - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"Category","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":14,"wgPageName":"Category:Song_recordings_produced_by_George_Martin","wgTitle":"Song recordings produced by George Martin","wgCurRevisionId":577742163,"wgRevisionId":577742163,"wgArticleId":38853525,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["George Martin","Song recordings by producer"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaul

In [10]:
# Download the George Martin category page and parse it so its elements can be queried.
category_url = 'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin'
html = requests.get(category_url).content
soup = BeautifulSoup(html, features='html.parser')

In [27]:
# Pull the href of the first <a> inside the fourth <li> on the page.
fourth_item = soup.select('li')[3]
fourth_item.select('a')[0]['href']

'/wiki/Act_Naturally'

In [28]:
# The href is site-relative, so the full URL is the Wikipedia domain plus the href,
# e.g. https://en.wikipedia.org/wiki/Act_Naturally

relative_href = soup.select('li')[3].select('a')[0]['href']
url_test = 'https://en.wikipedia.org' + relative_href
url_test

'https://en.wikipedia.org/wiki/Act_Naturally'

The full list of song links sits inside a `<div>` element whose `class` attribute is `"mw-category"`.

In [33]:
domain = 'https://en.wikipedia.org'

# All anchors inside the first <div class="mw-category"> on the parsed category page.
links = soup.find_all('div', class_="mw-category")[0].find_all('a')

# Each href is prefixed with the domain to form the full article URL.
martin_urls = [domain + link['href'] for link in links]

# TODO: NEED TO ADD FUNCTIONALITY FOR MULTIPLE PAGES

In [35]:
# Show the first six collected URLs.
martin_urls[:6]

['https://en.wikipedia.org/wiki/12-Bar_Original',
 'https://en.wikipedia.org/wiki/Across_the_Universe',
 'https://en.wikipedia.org/wiki/Act_Naturally',
 'https://en.wikipedia.org/wiki/Alfie_(Burt_Bacharach_song)',
 'https://en.wikipedia.org/wiki/All_I%27ve_Got_to_Do',
 'https://en.wikipedia.org/wiki/All_My_Loving']

Success! Now on to...

## Step 2: Retrieving Track - Artist - Album - Producer from a Song's Wikipedia Page

In [36]:
song_url = 'https://en.wikipedia.org/wiki/Across_the_Universe'

# Download and parse a single song article.
html_song = requests.get(song_url).content
soup_song = BeautifulSoup(html_song, 'html.parser')

# All three fields are read from the first <table class="infobox vevent">:
# the "summary" header cell and the first two "description" header cells.
infobox = soup_song.find_all('table', class_='infobox vevent')[0]
description_cells = infobox.find_all('th', class_='description')

# Title: drop the double quotes from the summary cell text.
track = infobox.find_all('th', class_='summary')[0].text.replace('"','')

# Artist: strip the leading "Song by " label from the first description cell.
artist = description_cells[0].text.replace('Song by ','')

# Album: strip the leading "from the album " label from the second description cell.
album = description_cells[1].text.replace('from the album ','')

track, artist, album

# TODO: might need to add producer later

In [69]:
def get_wiki_song_info(song_url, timeout=10):
    """
    Extract Track title, Artist Name, and Album Name from a Wikipedia entry for a song.

    All values are read from the first <table class="infobox vevent"> on the page:
      * the first "summary" header cell gives the title (double quotes removed),
      * the first "description" header cell gives the "... by ARTIST" line,
      * the second "description" header cell gives the "from the album ALBUM" line.
    If the second description cell does not start with "from the album ", its
    text is returned unchanged.

    INPUT:
    song_url: STR - url for wikipedia entry for a song
    timeout: NUMBER - seconds to wait for the HTTP response before giving up
             (passed to requests.get; default 10)

    OUTPUT:
    song_info: TUPLE of STR - (track, artist, album)

    RAISES:
    IndexError - if the page has no "infobox vevent" table, or the table lacks
                 a summary cell or two description cells.
    """

    # A timeout keeps one unresponsive page from hanging a whole scraping run.
    html_song = requests.get(song_url, timeout=timeout).content

    soup_song = BeautifulSoup(html_song, 'html.parser')

    # Look the infobox up once and reuse it for every field.
    infoboxes = soup_song.find_all('table', class_='infobox vevent')
    if not infoboxes:
        raise IndexError('no "infobox vevent" table found at ' + song_url)
    infobox = infoboxes[0]

    summary_cells = infobox.find_all('th', class_='summary')
    description_cells = infobox.find_all('th', class_='description')
    if not summary_cells or len(description_cells) < 2:
        raise IndexError(
            'infobox at ' + song_url + ' is missing the summary/description cells')

    track = summary_cells[0].text.replace('"','')

    # Split only on the FIRST 'by ' (the "Song by"/"Single by" label). Splitting on
    # every occurrence truncates artists whose own name contains 'by ', e.g.
    # "Song by Bobby Vee" would yield "Vee".
    artist = description_cells[0].text.split('by ', 1)[-1]

    album = description_cells[1].text.replace('from the album ','')

    return track, artist, album



In [70]:
# Try the function on the article at /wiki/Because_(Beatles_song).
get_wiki_song_info('https://en.wikipedia.org/wiki/Because_(Beatles_song)')

('Because', 'the Beatles', 'Abbey Road')

In [71]:
# Try the function on the article at /wiki/Hail_Caesar_(song).
get_wiki_song_info('https://en.wikipedia.org/wiki/Hail_Caesar_(song)')

('Hail Caesar', 'AC/DC', 'Ballbreaker')

Great! Next...

## Step 3: Make a list of (track, artist, album) tuples for one producer.

In [76]:
# Collect (track, artist, album) for every URL in martin_urls, best-effort:
# pages that cannot be fetched or parsed are skipped, but recorded for review.
martin_song_info = []
martin_failed_urls = []  # (url, repr(error)) for each page that was skipped
for song_url in martin_urls:
    print(song_url)
    try:
        song_info = get_wiki_song_info(song_url)
    except Exception as err:
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) can still stop the run, and keep the reason
        # instead of silently discarding it.
        martin_failed_urls.append((song_url, repr(err)))
    else:
        martin_song_info.append(song_info)
print('Skipped %d of %d pages (see martin_failed_urls)'
      % (len(martin_failed_urls), len(martin_urls)))
martin_song_info

https://en.wikipedia.org/wiki/12-Bar_Original
https://en.wikipedia.org/wiki/Across_the_Universe
https://en.wikipedia.org/wiki/Act_Naturally
https://en.wikipedia.org/wiki/Alfie_(Burt_Bacharach_song)
https://en.wikipedia.org/wiki/All_I%27ve_Got_to_Do
https://en.wikipedia.org/wiki/All_My_Loving
https://en.wikipedia.org/wiki/All_Together_Now
https://en.wikipedia.org/wiki/All_You_Need_Is_Love
https://en.wikipedia.org/wiki/Amber_Cascades
https://en.wikipedia.org/wiki/And_I_Love_Her
https://en.wikipedia.org/wiki/And_Your_Bird_Can_Sing
https://en.wikipedia.org/wiki/Anna_(Go_to_Him)
https://en.wikipedia.org/wiki/Another_Girl
https://en.wikipedia.org/wiki/Any_Time_at_All
https://en.wikipedia.org/wiki/Anyone_Who_Had_a_Heart_(song)
https://en.wikipedia.org/wiki/Ask_Me_Why
https://en.wikipedia.org/wiki/Average_Person
https://en.wikipedia.org/wiki/Baby_It%27s_You
https://en.wikipedia.org/wiki/Baby,_You%27re_a_Rich_Man
https://en.wikipedia.org/wiki/Baby%27s_in_Black
https://en.wikipedia.org/wiki/Back

[('12-Bar Original', 'the Beatles', 'Anthology 2'),
 ('Across the Universe', 'the Beatles', "No One's Gonna Change Our World"),
 ('Act Naturally', 'Buck Owens and the Buckaroos', 'The Best of Buck Owens'),
 ('Alfie', 'Cilla Black', 'Cilla Black singles chronology'),
 ("All I've Got to Do", 'the Beatles', 'With the Beatles'),
 ('All My Loving', 'the Beatles', 'With the Beatles'),
 ('All Together Now', 'the Beatles', 'Yellow Submarine'),
 ('All You Need Is Love', 'the Beatles', 'The Beatles singles chronology'),
 ('Amber Cascades', 'America', 'Hideaway'),
 ('And I Love Her', 'the Beatles', "A Hard Day's Night"),
 ('And Your Bird Can Sing', 'the Beatles', 'Revolver'),
 ('Anna (Go to Him)',
  'Arthur Alexander',
  'Arthur Alexander singles chronology'),
 ('Another Girl', 'the Beatles', 'Help!'),
 ('Any Time at All', 'the Beatles', "A Hard Day's Night"),
 ('Anyone Who Had a Heart', 'Dionne Warwick', 'Anyone Who Had a Heart'),
 ('Ask Me Why', 'the Beatles', 'Please Please Me'),
 ("Baby It's 

Finally, our last step:

## Step 4: Call Spotify API to get audio data on every (track, artist, album) tuple.