I want to write a script that takes a list of Wikipedia category pages for record producers and, for every song listed on them, loads the featurized audio data and metadata into a MongoDB database.

In [1]:
# Standard Imports

import numpy as np
import pandas as pd
import os
import sys
from collections import defaultdict
from importlib import reload
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load MongoDB

from pymongo import MongoClient
# Connect with the driver's default settings (no arguments are passed)
client = MongoClient()
# Access/initiate the database that holds the producer data
db = client.get_database('producer_db')
# Access/initiate the collection of song documents
tab = db.get_collection('songs')

In [3]:
# Authorize Spotify API

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Read the Spotify API credentials from the environment (never hard-coded)
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

# Build the credentials manager, then the API client that uses it
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [4]:
# Import Custom Webscraping Modules

import src.wiki_scraping as wiki_scraping

In [10]:
# Re-import the scraping module so that edits made to it since the first import
# are picked up without restarting the kernel.
reload(wiki_scraping)

<module 'src.wiki_scraping' from '/Users/Maxwell/galvanize/production-value/src/wiki_scraping.py'>

In [11]:
# Wikipedia category pages to scrape, one per record producer
cat_url_list = ['https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_George_Martin',
                'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_Dr._Dre',
                'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_Rick_Rubin',
                'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_Brian_Eno',
                'https://en.wikipedia.org/wiki/Category:Song_recordings_produced_by_Stock_Aitken_Waterman']

# Write into the 'songs' collection that was opened as `tab` in the MongoDB
# setup cell. `db.tab` would NOT do that: attribute access on a pymongo
# Database looks up a collection literally named "tab".
collection = tab

In [12]:
def load_producers(cat_url_list, collection, sp):
    """Scrape producer category pages and store each song's Spotify data.

    For every category URL the producer name is read from the page heading,
    the page's songs are gathered with
    ``wiki_scraping.get_spotify_info_from_wiki``, each song is searched for on
    Spotify, and one document per matched song is inserted into
    ``collection``.

    Parameters
    ----------
    cat_url_list : iterable of str
        Wikipedia category page URLs, one per producer (or per page of a
        paginated category).
    collection : object
        Target store exposing ``insert_one(document)``; a pymongo collection
        in this notebook.
    sp : object
        Spotify client exposing ``search``, ``track``, ``audio_analysis`` and
        ``audio_features``; a ``spotipy.Spotify`` instance in this notebook.

    Returns
    -------
    list
        The value returned by every ``collection.insert_one`` call, in
        insertion order, accumulated across ALL category pages. Empty when
        ``cat_url_list`` is empty or nothing could be matched.
    """
    # Initialised once, outside the loop, so results from every producer are
    # kept and an empty ``cat_url_list`` returns [] instead of raising.
    idx_list = []

    for cat_url in cat_url_list:

        #Extract Producer Name
        html = requests.get(cat_url).content
        soup = BeautifulSoup(html, 'html.parser')
        heading = soup.find_all('h1', {'id':"firstHeading"})[0].text
        # Split on the FIRST 'by ' only: producer names may themselves contain
        # 'by ' (e.g. "Toby Gad"), which a split on every occurrence truncates.
        producer = heading.split('by ', 1)[-1]

        print('-'*20)
        print('PRODUCER: {}'.format(producer))
        print('-'*20)
        print()

        #Scrape Wikipedia Page for Songs and get spotify track id's
        print('Scraping Wikipedia')
        spotify_info = wiki_scraping.get_spotify_info_from_wiki(cat_url, sp)

        print('Example data:')
        # Slice rather than index so pages with fewer than five songs are fine
        for example in spotify_info[:5]:
            print(example)
        print()

        print('Extracting Audio Analysis...')
        print()

        #Use SpotiPy to access song featurized data
        for track, artist, album, song_id, spotify_track, spotify_artist in spotify_info:
            print('Importing {} by {}...'.format(track,artist))
            query = 'track:{} artist:{}'.format(track,artist)
            result = sp.search(q=query, type='track')
            matches = result['tracks']['items']
            if not matches:
                # Nothing to index into; report and move on instead of
                # aborting the whole import with an IndexError.
                print('No Spotify match found, skipping.')
                print()
                continue

            # NOTE(review): this replaces the id already supplied by the
            # scraper with the top hit of a fresh search - confirm the
            # second lookup is intended.
            song_id = matches[0]['id']
            song_info = sp.track(song_id)
            song_analysis = sp.audio_analysis(song_id)
            song_features = sp.audio_features(song_id)

            #Add featurized data to MongoDB
            new_entry = {'track':track,
                         'artist':artist,
                         'album':album,
                         'producer':producer,
                         'spotify_id':song_id,
                         'track_info':song_info,
                         'audio_analysis':song_analysis,
                         'audio_features':song_features}

            idx = collection.insert_one(new_entry)
            idx_list.append(idx)

            print('Import Complete.')
            print()

    return idx_list

In [13]:
# Scrape every category page in the list and load its songs into the collection
idx_list = load_producers(
    cat_url_list=cat_url_list,
    collection=collection,
    sp=sp,
)

--------------------
PRODUCER: George Martin
--------------------

Scraping Wikipedia
Scraping Wikipedia

Querying Spotify API

Example data:
('12-Bar Original', 'the Beatles', 'Anthology 2', '2HvTGx5fzFGpHSyRNvXd9T', '12 Bar Original - Anthology 2 Version', 'The Beatles')
('Across the Universe', 'the Beatles', "No One's Gonna Change Our World", '4dkoqJrP0L8FXftrMZongF', 'Across The Universe - Remastered 2009', 'The Beatles')
('Act Naturally', 'Buck Owens and the Buckaroos', 'The Best of Buck Owens', '2LClPTK0FNl4AnOfKUJBQw', 'Act Naturally (Live)', 'Buck Owens & The Buckaroos')
('Alfie', 'Cilla Black', 'Cilla Black singles chronology', '2IqtBxwRgNOt7YWMmulrUZ', 'Alfie - 2003 Remaster', 'Cilla Black')
("All I've Got to Do", 'the Beatles', 'With the Beatles', '5tztLBvTlNC15Np2tnQ5Ll', "All I've Got To Do - Remastered 2009", 'The Beatles')

Extracting Audio Analysis...

Importing 12-Bar Original by the Beatles...
Import Complete.

Importing Across the Universe by the Beatles...
Import Co

In [21]:
# Total number of stored documents (an empty filter matches every document)
collection.count_documents({})

467

In [29]:
# Number of stored songs per producer
list(
    collection.aggregate([
        {
            "$group": {
                "_id": '$producer',
                "count": {"$sum": 1},
            },
        },
    ])
)

[{'_id': 'Stock Aitken Waterman', 'count': 62},
 {'_id': 'George Martin', 'count': 130},
 {'_id': 'Dr. Dre', 'count': 43},
 {'_id': 'Rick Rubin', 'count': 155},
 {'_id': 'Brian Eno', 'count': 77}]

It looks like a good number of Dr. Dre and Stock Aitken Waterman (SAW) songs are missing... I wonder why...

In [30]:
# Load a later page of the George Martin category: the `pagefrom` parameter
# starts the listing at "Ob-La-Di, Ob-La-Da".
idx_list = load_producers(
    ['https://en.wikipedia.org/w/index.php?title=Category:Song_recordings_produced_by_George_Martin&pagefrom=Ob-La-Di%2C+Ob-La-Da#mw-pages'],
    collection,
    sp,
)

--------------------
PRODUCER: George Martin
--------------------

Scraping Wikipedia
Scraping Wikipedia

Querying Spotify API

Example data:
('Ob-La-Di, Ob-La-Da', 'the Beatles', 'The Beatles', '1gFNm7cXfG1vSMcxPpSxec', 'Ob-La-Di, Ob-La-Da - Remastered 2009', 'The Beatles')
("Octopus's Garden", 'the Beatles', 'Abbey Road', '47Yv8Plq2mBXwdGq4PVFDR', "Octopus' Garden", 'The Beatles Revival Band & Orchestra')
('Oh! Darling', 'the Beatles', 'Abbey Road', '2mxByJWOajjiVsLWjNXvDJ', 'Oh! Darling - Remastered 2009', 'The Beatles')
('Only a Northern Song', 'the Beatles', 'Yellow Submarine', '4phXWLTSlC1BEogSWhJMDD', 'Only A Northern Song - Remastered 2009', 'The Beatles')
('P.S. I Love You', 'the Beatles', 'Please Please Me', '7Msq9qojB7yORuJvz49iUy', 'P.S. I Love You - Remastered 2009', 'The Beatles')

Extracting Audio Analysis...

Importing Ob-La-Di, Ob-La-Da by the Beatles...
Import Complete.

Importing Octopus's Garden by the Beatles...
Import Complete.

Importing Oh! Darling by the Beatle