In [3]:
import numpy as np
import pandas as pd
import os
import sys
from collections import defaultdict
from importlib import reload
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Getting Track-Artist-Album-Producer Data from a Wikipedia "List of Songs Produced by PERSON-X" Page

## Step 1: Retrieving a list of URLs from a Wikipedia list page

We want to scrape all of the URLs for the Wikipedia pages on songs produced by George Martin from this aggregate page: https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin. We locate the relevant links and return a list of URL strings.

In [8]:
requests.get('https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin').content.decode()[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Category:Song recordings produced by George Martin - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"Category","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":14,"wgPageName":"Category:Song_recordings_produced_by_George_Martin","wgTitle":"Song recordings produced by George Martin","wgCurRevisionId":577742163,"wgRevisionId":577742163,"wgArticleId":38853525,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["George Martin","Song recordings by producer"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaul

In [10]:
html = requests.get('https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin').content
soup = BeautifulSoup(html, 'html.parser')

In [27]:
#select the url for one link
soup.select('li')[3].select('a')[0]['href']

'/wiki/Act_Naturally'

In [28]:
#The full URL is https://en.wikipedia.org/wiki/Act_Naturally

url_test = 'https://en.wikipedia.org' + soup.select('li')[3].select('a')[0]['href']
url_test

'https://en.wikipedia.org/wiki/Act_Naturally'

The full list is in a `<div>` section where `class = "mw-category"`

In [33]:
# The hrefs on the category page are site-relative (e.g. '/wiki/Act_Naturally'),
# so each one is prefixed with the Wikipedia domain to form a full URL.
domain = 'https://en.wikipedia.org'

# Collect every anchor inside the first <div class="mw-category"> on the page.
links = soup.find_all('div', class_="mw-category")[0].find_all('a')
martin_urls = [domain + link['href'] for link in links]

# TODO: NEED TO ADD FUNCTIONALITY FOR MULTIPLE PAGES
# (only the single page already parsed into `soup` is read here)

In [35]:
martin_urls[:6]

['https://en.wikipedia.org/wiki/12-Bar_Original',
 'https://en.wikipedia.org/wiki/Across_the_Universe',
 'https://en.wikipedia.org/wiki/Act_Naturally',
 'https://en.wikipedia.org/wiki/Alfie_(Burt_Bacharach_song)',
 'https://en.wikipedia.org/wiki/All_I%27ve_Got_to_Do',
 'https://en.wikipedia.org/wiki/All_My_Loving']

Success! Now on to...

## Step 2: Retrieving Track - Artist - Album - Producer from a Song's Wikipedia Page

In [36]:
song_url = 'https://en.wikipedia.org/wiki/Across_the_Universe'

html_song = requests.get(song_url).content
soup_song = BeautifulSoup(html_song, 'html.parser')

# All three fields are read from header cells of the first song infobox table.
infobox = soup_song.find_all('table', class_='infobox vevent')[0]
summary_cells = infobox.find_all('th', class_='summary')
description_cells = infobox.find_all('th', class_='description')

# Title: first 'summary' header with any double quotes removed.
track = summary_cells[0].text.replace('"', '')

# Artist: first 'description' header with the 'Song by ' lead-in removed.
artist = description_cells[0].text.replace('Song by ', '')

# Album: second 'description' header with the 'from the album ' lead-in removed.
album = description_cells[1].text.replace('from the album ', '')

track, artist, album

#Might need to add producer later

In [69]:
def get_wiki_song_info(song_url):
    """
    Extract Track title, Artist Name, and Album Name from a Wikipedia entry for a song.

    All three values are read from header cells of the first
    <table class="infobox vevent"> found on the page.

    INPUT:
    song_url: STR - url for wikipedia entry for a song

    OUTPUT:
    song_info: TUPLE of STR - (track, artist, album)

    RAISES:
    IndexError - if the page has no such infobox table, or the infobox does not
        contain the expected 'summary' / 'description' header cells.
    """

    html_song = requests.get(song_url).content

    soup_song = BeautifulSoup(html_song, 'html.parser')

    # Locate the infobox once and reuse it for every field instead of
    # re-searching the whole document three times.
    infobox = soup_song.find_all('table', class_='infobox vevent')[0]
    descriptions = infobox.find_all('th', class_='description')

    # Title: first 'summary' header with any double quotes removed.
    track = infobox.find_all('th', class_='summary')[0].text.replace('"', '')

    # Artist: everything after the FIRST 'by ' (e.g. 'Song by X', 'Single by X').
    # maxsplit=1 matters: splitting on every 'by ' would cut names that contain
    # that sequence themselves ('Toby Keith' -> 'Keith').
    artist = descriptions[0].text.split('by ', 1)[-1]

    # NOTE(review): for some pages this second header is not an album line; the
    # notebook output shows values like 'Cilla Black singles chronology' here.
    album = descriptions[1].text.replace('from the album ', '')

    return track, artist, album


In [70]:
get_wiki_song_info('https://en.wikipedia.org/wiki/Because_(Beatles_song)')

('Because', 'the Beatles', 'Abbey Road')

In [71]:
get_wiki_song_info('https://en.wikipedia.org/wiki/Hail_Caesar_(song)')

('Hail Caesar', 'AC/DC', 'Ballbreaker')

Great! Next...

## Step 3: Make a list of (track, artist, album) tuples for one producer.

In [76]:
# Best-effort scrape: every URL is printed as it is visited, and pages that
# cannot be parsed by get_wiki_song_info are skipped.
martin_song_info = []
for song_url in martin_urls:
    print(song_url)
    try:
        song_info = get_wiki_song_info(song_url)
    except Exception:
        # Catch Exception rather than using a bare `except`, so that
        # interrupting the kernel (KeyboardInterrupt) still stops this loop.
        continue
    martin_song_info.append(song_info)
martin_song_info

https://en.wikipedia.org/wiki/12-Bar_Original
https://en.wikipedia.org/wiki/Across_the_Universe
https://en.wikipedia.org/wiki/Act_Naturally
https://en.wikipedia.org/wiki/Alfie_(Burt_Bacharach_song)
https://en.wikipedia.org/wiki/All_I%27ve_Got_to_Do
https://en.wikipedia.org/wiki/All_My_Loving
https://en.wikipedia.org/wiki/All_Together_Now
https://en.wikipedia.org/wiki/All_You_Need_Is_Love
https://en.wikipedia.org/wiki/Amber_Cascades
https://en.wikipedia.org/wiki/And_I_Love_Her
https://en.wikipedia.org/wiki/And_Your_Bird_Can_Sing
https://en.wikipedia.org/wiki/Anna_(Go_to_Him)
https://en.wikipedia.org/wiki/Another_Girl
https://en.wikipedia.org/wiki/Any_Time_at_All
https://en.wikipedia.org/wiki/Anyone_Who_Had_a_Heart_(song)
https://en.wikipedia.org/wiki/Ask_Me_Why
https://en.wikipedia.org/wiki/Average_Person
https://en.wikipedia.org/wiki/Baby_It%27s_You
https://en.wikipedia.org/wiki/Baby,_You%27re_a_Rich_Man
https://en.wikipedia.org/wiki/Baby%27s_in_Black
https://en.wikipedia.org/wiki/Back

[('12-Bar Original', 'the Beatles', 'Anthology 2'),
 ('Across the Universe', 'the Beatles', "No One's Gonna Change Our World"),
 ('Act Naturally', 'Buck Owens and the Buckaroos', 'The Best of Buck Owens'),
 ('Alfie', 'Cilla Black', 'Cilla Black singles chronology'),
 ("All I've Got to Do", 'the Beatles', 'With the Beatles'),
 ('All My Loving', 'the Beatles', 'With the Beatles'),
 ('All Together Now', 'the Beatles', 'Yellow Submarine'),
 ('All You Need Is Love', 'the Beatles', 'The Beatles singles chronology'),
 ('Amber Cascades', 'America', 'Hideaway'),
 ('And I Love Her', 'the Beatles', "A Hard Day's Night"),
 ('And Your Bird Can Sing', 'the Beatles', 'Revolver'),
 ('Anna (Go to Him)',
  'Arthur Alexander',
  'Arthur Alexander singles chronology'),
 ('Another Girl', 'the Beatles', 'Help!'),
 ('Any Time at All', 'the Beatles', "A Hard Day's Night"),
 ('Anyone Who Had a Heart', 'Dionne Warwick', 'Anyone Who Had a Heart'),
 ('Ask Me Why', 'the Beatles', 'Please Please Me'),
 ("Baby It's 

Let's now make a function that takes the URL of a "List of songs by person X" Wikipedia page and returns a list of (track, artist, album) tuples.

In [77]:
def get_wiki_from_category(category_url):
    """
    Returns a list of (track, artist, album) tuples for every song listed in a Wikipedia Category page such as 
    'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin'

    Only the links inside the first <div class="mw-category"> of the requested
    page are followed. Songs whose page cannot be parsed by get_wiki_song_info
    are skipped.

    INPUTS:
        category_url: STR - path to wikipedia category page

    OUTPUTS:
        song_info_list: LIST of TUPLES of STRINGS - LIST of (track, artist, album) TUPLES for every song linked in a category page.

    RAISES:
        IndexError - if the page contains no <div class="mw-category"> element.
    """

    domain = 'https://en.wikipedia.org'

    html_cat = requests.get(category_url).content
    soup_cat = BeautifulSoup(html_cat, 'html.parser')

    # Read the links from the page fetched for THIS call (soup_cat), not from a
    # notebook-level `soup` variable, so that category_url is actually honoured.
    song_links = soup_cat.find_all('div', class_="mw-category")[0].find_all('a')

    # hrefs are site-relative, so prefix each with the domain.
    song_urls = [domain + link['href'] for link in song_links]

    # TODO: NEED TO ADD FUNCTIONALITY FOR MULTIPLE PAGES

    song_info_list = []

    for song_url in song_urls:
        try:
            song_info = get_wiki_song_info(song_url)
        except Exception:
            # Best-effort: skip pages that cannot be parsed. Exception (not a
            # bare `except`) keeps KeyboardInterrupt able to stop the scrape.
            continue
        song_info_list.append(song_info)

    return song_info_list
    

In [78]:
cat_url = 'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin'

song_info_list = get_wiki_from_category(cat_url)

In [79]:
song_info_list[:10]

[('12-Bar Original', 'the Beatles', 'Anthology 2'),
 ('Across the Universe', 'the Beatles', "No One's Gonna Change Our World"),
 ('Act Naturally', 'Buck Owens and the Buckaroos', 'The Best of Buck Owens'),
 ('Alfie', 'Cilla Black', 'Cilla Black singles chronology'),
 ("All I've Got to Do", 'the Beatles', 'With the Beatles'),
 ('All My Loving', 'the Beatles', 'With the Beatles'),
 ('All Together Now', 'the Beatles', 'Yellow Submarine'),
 ('All You Need Is Love', 'the Beatles', 'The Beatles singles chronology'),
 ('Amber Cascades', 'America', 'Hideaway'),
 ('And I Love Her', 'the Beatles', "A Hard Day's Night")]

Finally, our last step:

## Step 4: Call Spotify API to get audio data on every (track,artist,album) tuple.

In [81]:
# Build the Spotify client (`sp`) used by the cells below.

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials are read from environment variables; a missing one raises KeyError.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [117]:
results = sp.search(q='track:Across the Universe artist:the Beatles', type='track')
song_id = results['tracks']['items'][0]['id']

In [118]:
results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3AAcross+the+Universe+artist%3Athe+Beatles&type=track&offset=0&limit=10',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3WrFJ7ztbogyGnTHbHJFl2'},
       'href': 'https://api.spotify.com/v1/artists/3WrFJ7ztbogyGnTHbHJFl2',
       'id': '3WrFJ7ztbogyGnTHbHJFl2',
       'name': 'The Beatles',
       'type': 'artist',
       'uri': 'spotify:artist:3WrFJ7ztbogyGnTHbHJFl2'}],
     'available_markets': ['AD',
      'AE',
      'AR',
      'AT',
      'AU',
      'BE',
      'BG',
      'BH',
      'BO',
      'BR',
      'CA',
      'CH',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DE',
      'DK',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
      'ES',
      'FI',
      'FR',
      'GB',
      'GR',
      'GT',
      'HK',
      'HN',
      'HU',
      'ID',
      'IE',
      'IL',
      'IN',
      'IS',
   

In [90]:
analysis = sp.audio_analysis(song_id)
analysis

{'meta': {'analyzer_version': '4.0.0',
  'platform': 'Linux',
  'detailed_status': 'OK',
  'status_code': 0,
  'timestamp': 1553384392,
  'analysis_time': 6.37076,
  'input_process': 'libvorbisfile L+R 44100->22050'},
 'track': {'num_samples': 5030340,
  'duration': 228.13333,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 0.34834,
  'start_of_fade_out': 211.05778,
  'loudness': -11.788,
  'tempo': 152.126,
  'tempo_confidence': 0.457,
  'time_signature': 4,
  'time_signature_confidence': 0.847,
  'key': 1,
  'key_confidence': 0.838,
  'mode': 1,
  'mode_confidence': 0.648,
  'codestring': 'eJw1momV5TYMBFNRCLyP_BObquYf7_Pa5JdIEEejAaq3U8s6bX3lu3XuttsaX2_la6OVtfYY3-6LUR3lnN2_WvOjw9YLwzO-U8fdt5f21bnON3bfpa85vrrb-c7iT5_tfvXy8K61NPb6WufZO1lw88DH3-NbBYHqmPfr1ZXKnb331r8-8yuDWjsSXn6dzPFYa9_o_Mqwn733RAB-3WXfe277piJWRCoLaeYq5Tvj9nlvv99UpFpKrZu99rdGb98eyIR47VtrLoelnsl51v49XdasjHmIzRGh9nXZctX13T2QefLy

In [108]:
results['tracks']['items'][0]['name'], results['tracks']['items'][0]['artists'][0]['name']

('Across The Universe - Remastered 2009', 'The Beatles')

In [120]:
# For each scraped (track, artist, album), search Spotify and keep the first hit:
# (track, artist, album, song_id, spotify_track, spotify_artist).
spotify_info = []

for track, artist, album in song_info_list:
    query = 'track:{} artist:{}'.format(track, artist)
    try:
        results = sp.search(q=query, type='track')
        # Only the first search hit is used; an empty 'items' list raises
        # IndexError and the song is skipped below.
        top_hit = results['tracks']['items'][0]
        song_id = top_hit['id']
        spotify_track = top_hit['name']
        spotify_artist = top_hit['artists'][0]['name']
    except Exception:
        # Best-effort: skip songs that cannot be matched. Exception (not a bare
        # `except`) keeps KeyboardInterrupt able to stop the loop.
        continue
    new_song_info = (track, artist, album, song_id, spotify_track, spotify_artist)
    spotify_info.append(new_song_info)

spotify_info

[('12-Bar Original',
  'the Beatles',
  'Anthology 2',
  '2HvTGx5fzFGpHSyRNvXd9T',
  '12 Bar Original - Anthology 2 Version',
  'The Beatles'),
 ('Across the Universe',
  'the Beatles',
  "No One's Gonna Change Our World",
  '4dkoqJrP0L8FXftrMZongF',
  'Across The Universe - Remastered 2009',
  'The Beatles'),
 ('Act Naturally',
  'Buck Owens and the Buckaroos',
  'The Best of Buck Owens',
  '2LClPTK0FNl4AnOfKUJBQw',
  'Act Naturally (Live)',
  'Buck Owens & The Buckaroos'),
 ('Alfie',
  'Cilla Black',
  'Cilla Black singles chronology',
  '2IqtBxwRgNOt7YWMmulrUZ',
  'Alfie - 2003 Remaster',
  'Cilla Black'),
 ("All I've Got to Do",
  'the Beatles',
  'With the Beatles',
  '5tztLBvTlNC15Np2tnQ5Ll',
  "All I've Got To Do - Remastered 2009",
  'The Beatles'),
 ('All My Loving',
  'the Beatles',
  'With the Beatles',
  '4joiWvli4qJVEW6qZV2i2J',
  'All My Loving - Remastered 2009',
  'The Beatles'),
 ('All Together Now',
  'the Beatles',
  'Yellow Submarine',
  '2ck8lFrYAch2GPtdhpTHe3',
  

In [133]:
def get_spotify_info_from_wiki(cat_url):
    """
    Look up on Spotify every song linked from a Wikipedia category page.

    The songs come from get_wiki_from_category(cat_url). Each one is searched
    with the notebook-level Spotify client `sp`, and only the first search hit
    is kept. Songs with no hit, or whose lookup fails, are skipped.

    INPUTS:
        cat_url: STR - url of the wikipedia category page

    OUTPUTS:
        spotify_info: LIST of TUPLES of STRINGS -
            (track, artist, album, song_id, spotify_track, spotify_artist),
            where the first three come from Wikipedia and the last three are
            the id, name and first artist name of the Spotify hit.
    """
    spotify_info = []

    for track, artist, album in get_wiki_from_category(cat_url):
        query = 'track:{} artist:{}'.format(track, artist)
        try:
            results = sp.search(q=query, type='track')
            # An empty 'items' list raises IndexError -> song is skipped.
            top_hit = results['tracks']['items'][0]
            new_song_info = (
                track,
                artist,
                album,
                top_hit['id'],
                top_hit['name'],
                top_hit['artists'][0]['name'],
            )
        except Exception:
            # Best-effort: skip this song. Exception (not a bare `except`)
            # keeps KeyboardInterrupt able to stop a long run.
            continue
        spotify_info.append(new_song_info)

    return spotify_info

In [135]:
spotify_info = get_spotify_info_from_wiki(cat_url)

In [136]:
spotify_info

[('12-Bar Original',
  'the Beatles',
  'Anthology 2',
  '2HvTGx5fzFGpHSyRNvXd9T',
  '12 Bar Original - Anthology 2 Version',
  'The Beatles'),
 ('Across the Universe',
  'the Beatles',
  "No One's Gonna Change Our World",
  '4dkoqJrP0L8FXftrMZongF',
  'Across The Universe - Remastered 2009',
  'The Beatles'),
 ('Act Naturally',
  'Buck Owens and the Buckaroos',
  'The Best of Buck Owens',
  '2LClPTK0FNl4AnOfKUJBQw',
  'Act Naturally (Live)',
  'Buck Owens & The Buckaroos'),
 ('Alfie',
  'Cilla Black',
  'Cilla Black singles chronology',
  '2IqtBxwRgNOt7YWMmulrUZ',
  'Alfie - 2003 Remaster',
  'Cilla Black'),
 ("All I've Got to Do",
  'the Beatles',
  'With the Beatles',
  '5tztLBvTlNC15Np2tnQ5Ll',
  "All I've Got To Do - Remastered 2009",
  'The Beatles'),
 ('All My Loving',
  'the Beatles',
  'With the Beatles',
  '4joiWvli4qJVEW6qZV2i2J',
  'All My Loving - Remastered 2009',
  'The Beatles'),
 ('All Together Now',
  'the Beatles',
  'Yellow Submarine',
  '2ck8lFrYAch2GPtdhpTHe3',
  

In [123]:
# Print each Wikipedia (track, artist) pair above the Spotify pair it was
# matched to, with a blank line between songs, for a visual sanity check.
for track, artist, album, song_id, spotify_track, spotify_artist in spotify_info:
    for title, performer in ((track, artist), (spotify_track, spotify_artist)):
        print(title, '\t\t\t', performer)
    print('')

12-Bar Original 			 the Beatles
12 Bar Original - Anthology 2 Version 			 The Beatles

Across the Universe 			 the Beatles
Across The Universe - Remastered 2009 			 The Beatles

Act Naturally 			 Buck Owens and the Buckaroos
Act Naturally (Live) 			 Buck Owens & The Buckaroos

Alfie 			 Cilla Black
Alfie - 2003 Remaster 			 Cilla Black

All I've Got to Do 			 the Beatles
All I've Got To Do - Remastered 2009 			 The Beatles

All My Loving 			 the Beatles
All My Loving - Remastered 2009 			 The Beatles

All Together Now 			 the Beatles
All Together Now - Remastered 2009 			 The Beatles

All You Need Is Love 			 the Beatles
All You Need Is Love - Remastered 2009 			 The Beatles

Amber Cascades 			 America
Amber Cascades 			 America

And I Love Her 			 the Beatles
And I Love Her - Remastered 2009 			 The Beatles

And Your Bird Can Sing 			 the Beatles
And Your Bird Can Sing - Remastered 2009 			 The Beatles

Another Girl 			 the Beatles
Another Girl - Remastered 2009 			 The Beatles

Any T

## Creating an entry for a Mongo DB

We need to create an entry for a Mongo DB with the following features:

    track: The title of the track
    artist: The name of the artist
    album: The name of the album
    producer: The name of the producer (OUR TARGET)
    audio_analysis: Spotify Audio Analysis
    audio_features: Spotify Audio Features

In [127]:
sp.track(song_id)

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3WrFJ7ztbogyGnTHbHJFl2'},
    'href': 'https://api.spotify.com/v1/artists/3WrFJ7ztbogyGnTHbHJFl2',
    'id': '3WrFJ7ztbogyGnTHbHJFl2',
    'name': 'The Beatles',
    'type': 'artist',
    'uri': 'spotify:artist:3WrFJ7ztbogyGnTHbHJFl2'}],
  'available_markets': ['AD',
   'AE',
   'AR',
   'AT',
   'AU',
   'BE',
   'BG',
   'BH',
   'BO',
   'BR',
   'CA',
   'CH',
   'CL',
   'CO',
   'CR',
   'CY',
   'CZ',
   'DE',
   'DK',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FR',
   'GB',
   'GR',
   'GT',
   'HK',
   'HN',
   'HU',
   'ID',
   'IE',
   'IL',
   'IN',
   'IS',
   'IT',
   'JO',
   'JP',
   'KW',
   'LB',
   'LI',
   'LT',
   'LU',
   'LV',
   'MA',
   'MC',
   'MT',
   'MX',
   'MY',
   'NI',
   'NL',
   'NO',
   'NZ',
   'OM',
   'PA',
   'PE',
   'PH',
   'PL',
   'PS',
   'PT',
   'PY',
   'QA',
   'RO',
   'SA',
   'SE',
   'SG',
   'SK',


In [137]:
cat_url

'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin'

### Extra: Extract producer name from category heading on Wikipedia

In [144]:
url = 'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin'
# Request the page through `url` instead of repeating the literal.
html = requests.get(url).content
soup = BeautifulSoup(html, 'html.parser')
# The producer name is whatever follows the first 'by ' in the page heading.
# maxsplit=1 keeps names that themselves contain 'by ' intact.
soup.find_all('h1', {'id':"firstHeading"})[0].text.split('by ', 1)[-1]

'George Martin'