# Load Libraries

In [1]:
import pandas as pd
import os
import h5py
import glob
import hdf5_getters
from bs4 import BeautifulSoup
import requests

import functions

# 1 MillionSongSubset Dataset

The first datset we will download is the [MillionSongSubset dataset](http://millionsongdataset.com/pages/getting-dataset/)

In [2]:
# Define function to get all song titles and artists

def get_all_songs(basedir,ext='.h5') :
    titles = []
    artists = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            titles.append( hdf5_getters.get_title(h5) )
            artists.append( hdf5_getters.get_artist_name(h5) )
            h5.close()
    return titles, artists

In [3]:
# Call function to collect all titles and artists
all_titles, all_artists = get_all_songs('./MillionSongSubset', 'h5')

In [4]:
# Create a DataFrame with all titles and artists
mss_df = pd.DataFrame({'title': all_titles, 'artist': all_artists})

# Show first 5 rows & shape
print(mss_df.shape)
mss_df.head()

(10000, 2)


Unnamed: 0,title,artist
0,b'Je Sais Que La Terre Est Plate',b'Rapha\xc3\xabl'
1,b'On Efface',b'Julie Zenatti'
2,b'Howells Delight',b'The Baltimore Consort'
3,b'Martha Served',b'I Hate Sally'
4,b'Zip-A-Dee-Doo-Dah (Song of the South)',b'Orlando Pops Orchestra'


In [5]:
# Define column type as string
mss_df['title'] = mss_df['title'].astype(str)
mss_df['artist'] = mss_df['artist'].astype(str)

# Clean titles and artists by replacing extra characters: "'" and "b"
mss_df['title'] = mss_df['title'].str.replace('b|\'', '', regex = True)
mss_df['artist'] = mss_df['artist'].str.replace('b|\'', '', regex = True)

# Show first 5 rows
mss_df.head()

Unnamed: 0,title,artist
0,Je Sais Que La Terre Est Plate,Rapha\xc3\xal
1,On Efface,Julie Zenatti
2,Howells Delight,The Baltimore Consort
3,Martha Served,I Hate Sally
4,Zip-A-Dee-Doo-Dah (Song of the South),Orlando Pops Orchestra


### Remove duplicate rows from Hot 100 Songs

In [19]:
# Load Hot 100 songs dataset
hot_songs_df = functions.scrape_hot100()

# Merge the two DataFrames based on their columns to identify the common rows
merged_df = pd.merge(mss_df, hot_songs_df, indicator=True, how='outer')

# Filter out the rows that are present in both DataFrames
mss_filtered_df = merged_df[merged_df['_merge'] == 'left_only']

# Check if any rows were removed
display(mss_filtered_df.shape)

# Drop the _merge column
mss_clean_df = mss_filtered_df.drop(columns='_merge')

# Show clean df
mss_clean_df.head()

(10000, 3)

Unnamed: 0,title,artist
0,Je Sais Que La Terre Est Plate,Rapha\xc3\xal
1,On Efface,Julie Zenatti
2,Howells Delight,The Baltimore Consort
3,Martha Served,I Hate Sally
4,Zip-A-Dee-Doo-Dah (Song of the South),Orlando Pops Orchestra


In [20]:
# Save data as csv
mss_clean_df.to_csv('million_song_subset.csv',index = False, sep=";")

# 2 Wikipedia Web Scraping Dataset (WIP)

In [None]:
artists_url = 'https://en.wikipedia.org/wiki/Lists_of_songs'
songs_url = 'https://en.wikipedia.org/wiki/List_of_songs_recorded_by_{}'

artists = []

In [None]:
response = requests.get(artists_url)
response.status_code 

In [None]:
soup = BeautifulSoup(response.content, "html.parser")


artist_divs = soup.select('#mw-content-text > div')

print(len(artist_divs))

for index, div in enumerate(artist_divs):
    print('INDEX:', index, div)