# Imports

In [1]:
import sys
import requests
import json
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


root = '../..' 
sys.path.append(root)
from src.tools import paths
from src.tools import data_tools as dt

# Tools

In [2]:
def gen_id(wiki_url):
    ''' Generate an 8-digit id (int) for based on the unique wikipedia url'''
    return hash(wiki_url) % 10**8

def clean_text(string):
    ''' Clean common cell text from a <td> tag '''
    return string.replace('\xa0', ' ').replace('\n',' ').strip()

def missing_link(name, soup):
    ''' Find a hpyerlink for a given name '''
    if soup.find('a', text=name):
        return soup.find('a', text=name).attrs.get('href')
    else:
        return None

def get_hyper_list(td, soup):
    '''
        Extract names and URLs from horizontal lists of hyperlinks
        
        Inputs:
            td (bs4 tag)
    '''
    result_list = []
    if not td.find('ul'):
        return [{'name': td.text, 'url':None, 'order': 0}]
    else:
        result_list = []
        for i, li in enumerate(td.find_all('li')):
            name = li.text
            url = li.find('a').attrs.get('href') if li.find('a') else missing_link(name, soup)
            result_list.append({'name': name, 'url': url, 'order': i})
            
        return result_list
                
  

# Starting point
The List of Billboard Hot 100 top-ten singles

In [3]:
url_list_of_lists = 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles'
wiki_base_url = 'http://en.wikipedia.org'

In [4]:
soup = BeautifulSoup(requests.get(url_list_of_lists).content, 'html.parser')
table = soup.select('#Top-ten_singles_by_year')[0].parent.next_sibling.next_sibling
a_tags = table.findAll('a')

In [5]:
year_links = [wiki_base_url + link.attrs['href'] for link in a_tags]
year_links

['http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1958',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1959',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1960',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1961',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1962',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1963',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1964',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1965',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1966',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1967',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1968',
 'http://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1969',
 'ht

# Year page

In [6]:
def pull_year_page_table(yp_url, out=True, write_path='../../data/wiki-top-10/'):
    
    def tidy_cell(cell, col=None, year=None):
        cell_text = clean_text(cell.text)
        if col in ['entry_date', 'peak_date']:
            cell_text += f' {year}'
        return cell_text

    # Get year from url
    year = yp_url.split('_')[-1]
    
    # Get html from wikipedia
    soup = BeautifulSoup(requests.get(yp_url).content, 'html.parser')
    
    # Assuming it's always the first table
    if not soup.select('table.wikitable'):
        return
    table = soup.select('table.wikitable')[0]
    n_cols = len(table.find('tr').find_all('th'))

    rowspan_fix = []
    output_rows=[]
    
    for i, row in enumerate(table.find_all('tr')):
        output_row = []
        
        for j, cell in enumerate(row.find_all('td')):
            rowspan = cell.attrs.get('rowspan')
            if rowspan:
                rowspan_fix += [(k,j, cell) for k in range(i+1, i+int(rowspan))]

            output_row.append(cell)
        output_rows.append(output_row)

    for (i,j,cell) in rowspan_fix:
        output_rows[i].insert(j, cell)

    # Build table of tag elelement cells
    df = pd.DataFrame(output_rows)
    df = df[df.columns[:6]].copy() #Drop ref column if it's there
    df.columns = ['entry_date', 'single', 'artist', 'peak', 'peak_date', 'weeks_in_top']
    df.dropna(inplace=True)


    # Tidy up cells, extract data
    for col in [x for x in df.columns if x!='single']:
        df[col] = df[col].apply(lambda x: tidy_cell(x, col, year))

    df['single_url'] = df.single.apply(lambda x: x.find('a').attrs['href']  if x.find('a') else None)
    df.single = df.single.apply(lambda x: tidy_cell(x))
    
    # Write table
    if out:
        df.to_csv(f'{write_path}{year}.csv',index=False)
    return df

In [7]:
for yl in year_links:
    pull_year_page_table(yl)

In [9]:
df = pull_year_page_table(year_links[-2])
df

Unnamed: 0,entry_date,single,artist,peak,peak_date,weeks_in_top,single_url
2,December 30(2017) 2019,"""All I Want For Christmas Is You""[U]",Mariah Carey,1,December 21 2019,10,/wiki/All_I_Want_For_Christmas_Is_You
3,October 20 2019,"""Shallow""[C]",Lady Gaga and Bradley Cooper,1,March 9 2019,7,/wiki/Shallow_(Lady_Gaga_and_Bradley_Cooper_song)
4,October 20 2019,"""Happier"" (#6)",Marshmello and Bastille,2,February 16 2019,27,/wiki/Happier_(Marshmello_and_Bastille_song)
5,November 3 2019,"""Sunflower"" (#2)[J][Q] ◁",Post Malone and Swae Lee,1,January 19 2019,33,/wiki/Sunflower_(Post_Malone_and_Swae_Lee_song)
6,November 10 2019,"""Without Me"" (#3)",Halsey,1,January 12 2019,29,/wiki/Without_Me_(Halsey_song)
7,November 24 2019,"""High Hopes""",Panic! at the Disco,4,January 26 2019,15,/wiki/High_Hopes_(Panic!_at_the_Disco_song)
9,January 5 2019,"""Rockin' Around the Christmas Tree""[U]",Brenda Lee,2,December 28 2019,5,/wiki/Rockin%27_Around_the_Christmas_Tree
10,January 19 2019,"""Eastside""[B]","Benny Blanco, Halsey and Khalid",9,January 19 2019,2,/wiki/Eastside_(song)
11,January 26 2019,"""Wow"" (#5)",Post Malone,2,April 6 2019,24,/wiki/Wow_(Post_Malone_song)
12,February 2 2019,"""7 Rings"" (#7) ◁",Ariana Grande,1,February 2 2019,16,/wiki/7_Rings


# Song page

In [10]:
test_url = df.single_url.iloc[:5]
test_url

2                /wiki/All_I_Want_For_Christmas_Is_You
3    /wiki/Shallow_(Lady_Gaga_and_Bradley_Cooper_song)
4         /wiki/Happier_(Marshmello_and_Bastille_song)
5      /wiki/Sunflower_(Post_Malone_and_Swae_Lee_song)
6                       /wiki/Without_Me_(Halsey_song)
Name: single_url, dtype: object

In [12]:
def pull_song_page_info(sp_url, out=True, write_path=paths.DATA_PATH/'songs'):
    '''
        Pull all information about a song from a wikipedia 'song page'
        Inputs:
            - sp_url (str): a song page url, a relative url (/wiki/...) to a song page
            - out (bool): whether to output the file
            - write_path (str): the path to write the file to if out is True
        Outputs:
            Dictionary of song data
    '''
    try:
        # Generate id
        song_id = gen_song_id(sp_url)

        # Get page html
        soup = BeautifulSoup(requests.get(wiki_base_url + sp_url).content, 'html.parser')
        table = soup.select('table.infobox')[0]

        song_data = {'id': song_id}
        song_data['title'] = table.find('tr').text.strip('"')
        song_data['url'] = sp_url

        # Loop through rows (tr) and identify by heading (th)
        for row in table.find_all('tr'):
            th = row.find('th')
            if th:
                # Artists
                if th.text.startswith('Single by'):
                    links = row.find_all('a')
                    if len(links)==1:
                        song_data.update({'artist':row.text.split('Single by ')[1], 'url':None})
                    else:
                        song_data['artists'] =[{'artist':artist.text, 
                                                'url':artist.attrs['href'], 
                                                'order':i} 
                                               for i, artist in enumerate(links[1:])]
                # Release Date
                elif th.text.startswith('Released'):
                    song_data.update({'release_date': clean_text(row.find('td').text)})

                # Genres
                elif th.text.startswith('Genre'):
                    song_data['genres'] = [clean_text(x.text) for x in row.find('td').find_all('a') if not re.match(re_ref,x.text)]

                # Length
                elif th.text.startswith('Length'):
                    song_data.update({'length': clean_text(row.find('td').text)})

                # Songwriters
                elif th.text.startswith('Songwriter'):
                    song_data['songwriters'] = get_hyper_list(row.find('td'), soup)

                # Producers
                elif th.text.startswith('Producer'):
                    song_data['producers'] = get_hyper_list(row.find('td'), soup)
    
        if out:
            with open(write_path/f'{song_id}.json', 'w') as wfile:
                json.dump(song_data, wfile)
            
        return song_data
    
    except IndexError:
        print(sp_url)
    
    

AttributeError: module 'src.tools.paths' has no attribute 'DATA_PATH'

In [390]:
pull_song_page_info(test_url.iloc[2], True)

{'artists': [{'artist': 'Marshmello', 'order': 0, 'url': '/wiki/Marshmello'},
  {'artist': 'Bastille', 'order': 1, 'url': '/wiki/Bastille_(band)'}],
 'genres': ['Dance-pop'],
 'id': 77162175,
 'length': '3:34',
 'producers': [{'name': 'Marshmello', 'url': None}],
 'release_date': 'August 17, 2018 (2018-08-17)',
 'songwriters': [{'name': 'Christopher Comstock', 'url': '/wiki/Marshmello'},
  {'name': 'Dan Smith', 'url': '/wiki/Dan_Smith_(singer)'},
  {'name': 'Steve Mac', 'url': '/wiki/Steve_Mac'}],
 'title': 'Happier'}

In [428]:
df.single_url.apply(lambda url: pull_song_page_info(url))

2     {'id': 67299063, 'title': 'All I Want for Chri...
3     {'id': 5946564, 'title': 'Shallow', 'url': '/w...
4     {'id': 77162175, 'title': 'Happier', 'url': '/...
5     {'id': 12497455, 'title': 'Sunflower', 'url': ...
6     {'id': 53890801, 'title': 'Without Me', 'url':...
7     {'id': 49790729, 'title': 'High Hopes', 'url':...
9     {'id': 27265974, 'title': 'Rockin' Around the ...
10    {'id': 90355098, 'title': 'Eastside', 'url': '...
11    {'id': 6130013, 'title': 'Wow.', 'url': '/wiki...
12    {'id': 4461425, 'title': '7 Rings', 'url': '/w...
13    {'id': 18979570, 'title': 'Middle Child', 'url...
14    {'id': 80016038, 'title': 'Break Up with Your ...
15    {'id': 11731290, 'title': 'Please Me', 'url': ...
16    {'id': 84218161, 'title': 'Thotiana', 'url': '...
17    {'id': 97778190, 'title': 'Sucker', 'url': '/w...
18    {'id': 60422908, 'title': 'Old Town Road', 'ur...
19    {'id': 13736376, 'title': 'Bad Guy', 'url': '/...
20    {'id': 82367964, 'title': 'Better', 'url':

In [437]:
df.single_url.apply(lambda url: gen_id(url))

2     67299063
3      5946564
4     77162175
5     12497455
6     53890801
7     49790729
9     27265974
10    90355098
11     6130013
12     4461425
13    18979570
14    80016038
15    11731290
16    84218161
17    97778190
18    60422908
19    13736376
20    82367964
21     8681010
22    65767323
23    19292924
24    90984281
25    14558672
26    98667084
27    89252462
28    17397613
29    64267869
30    80584227
31    53605876
32    95652930
33    72965531
34    20007765
35    42448484
36    11022441
37    23068890
38    43023752
39    41930197
40    98146605
41    11785515
42    81058939
43    88464997
44    63587066
45    56113991
46    31329164
47    94929658
48    66921307
49    69190510
50    61316978
51    19372734
52    24797159
Name: single_url, dtype: int64

In [451]:
pd.read_csv(list((settings.DATA_PATH/'wiki-top-10/').iterdir())[0],usecols=['single_url'])

Unnamed: 0,single_url
0,/wiki/Poor_Little_Fool
1,/wiki/Patricia_(Perez_Prado_song)
2,/wiki/Splish_Splash_(song)
3,/wiki/Hard_Headed_Woman
4,/wiki/When_(The_Kalin_Twins_song)
5,/wiki/Rebel_%27Rouser
6,/wiki/Yakety_Yak
7,/wiki/My_True_Love
8,/wiki/Willie_and_the_Hand_Jive
9,/wiki/Fever_(Little_Willie_John_song)


In [464]:
len(list((paths.DATA/'songs').glob('*')))

1838

In [469]:
song_idx = [int(x.stem) for x in(settings.DATA_PATH/'songs').glob('*')]

for year_csv in tqdm((settings.DATA_PATH/'wiki-top-10/').iterdir(), total=len(list((settings.DATA_PATH/'wiki-top-10/').glob('*'))) ):
    df = pd.read_csv(year_csv, usecols=['single_url'])
    for j, sp_url in df[df.single_url.notna()].single_url.iteritems():
        if gen_id(sp_url) not in song_idx:
            pull_song_page_info(sp_url)

HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))

/wiki/Are_You_Really_Mine%3F
/wiki/Topsy_(instrumental)
/wiki/The_All_American_Boy
/wiki/Tall_Paul
/wiki/Petite_Fleur
/wiki/The_Deck_of_Cards
/wiki/My_Empty_Arms
/wiki/The_Writing_on_the_Wall_(Adam_Wade_song)
/wiki/I_Understand_(Just_How_You_Feel)
/wiki/Tonight_(1956_song)
/wiki/More_(Theme_from_Mondo_Cane)
/wiki/GTO_(Ronny_%26_the_Daytonas_song)
/wiki/The_Clapping_Song
/wiki/Lara%27s_Theme
/wiki/Desiderata
/wiki/Jesu,_Joy_of_Man%27s_Desiring
/wiki/The_Americans_(commentary)
/wiki/Nadia%27s_Theme
/wiki/This_Could_Be_the_Night_(Loverboy_song)
/wiki/Scream/Childhood
/wiki/Auld_Lang_Syne
/wiki/Rain_on_Me
/wiki/Work_from_Home



### jcole

In [423]:
pull_song_page_info(jcole)

li text ['Jermaine Cole']
li text ['Allan Felder']
li text ['Norman Harris']
li text ['Tyler Williams']
li text ['T-Minus']
li text ['Cole']


{'artists': [{'artist': 'J. Cole', 'order': 0, 'url': '/wiki/J._Cole'}],
 'genres': ['Hip hop'],
 'id': 18979570,
 'length': '3:33',
 'producers': [{'name': 'T-Minus',
   'order': 0,
   'url': '/wiki/T-Minus_(record_producer)'},
  {'name': 'Cole', 'order': 1, 'url': None}],
 'release_date': 'January 23, 2019',
 'songwriters': [{'name': 'Jermaine Cole',
   'order': 0,
   'url': '/wiki/Jermaine_Cole'},
  {'name': 'Allan Felder', 'order': 1, 'url': '/wiki/Allan_Felder'},
  {'name': 'Norman Harris',
   'order': 2,
   'url': '/wiki/Norman_Harris_(musician)'},
  {'name': 'Tyler Williams',
   'order': 3,
   'url': '/wiki/T-Minus_(record_producer)'}],
 'title': 'Middle Child',
 'url': '/wiki/Middle_Child_(J._Cole_song)'}

# Testing

In [17]:
from importlib import reload
import sys
sys.path.append('..')
from scr import scrape_songpage as ssp

In [20]:
reload(ssp)
ssp.pull_info("/wiki/Poor_Little_Fool", False)

{'artists': [{'artist': 'Ricky Nelson',
   'order': 0,
   'url': '/wiki/Ricky_Nelson'}],
 'genres': ['Country'],
 'id': 42552041,
 'length': '2:32',
 'producers': [{'name': 'Ricky Nelson, Ozzie Nelson, Jimmie Haskell',
   'order': 0,
   'url': None}],
 'release_date': 'June 23, 1958',
 'songwriters': [{'name': 'Sharon Sheeley', 'order': 0, 'url': None}],
 'title': 'Poor Little Fool',
 'url': '/wiki/Poor_Little_Fool'}

In [8]:
dt.load_charts(2020)

Unnamed: 0,entry_date,single,artist,peak,peak_date,weeks_in_top,single_url
0,December 29(2018) 2020,"""It's the Most Wonderful Time of the Year""[A]",Andy Williams,7,January 4 2020,2,/wiki/It%27s_the_Most_Wonderful_Time_of_the_Year
1,January 5 2020,"""Jingle Bell Rock""",Bobby Helms,3,January 4 2020,3,/wiki/Jingle_Bell_Rock
2,January 5 2020,"""Holly Jolly Christmas""",Burl Ives,4,January 4 2020,4,/wiki/Holly_Jolly_Christmas
3,November 16 2020,"""Memories""",Maroon 5,2,January 11 2020,18,/wiki/Memories_(Maroon_5_song)
4,December 21 2020,"""Dance Monkey""[B]",Tones and I,4,February 29 2020,11,/wiki/Dance_Monkey
5,January 11 2020,"""The Box""",Roddy Ricch,1,January 18 2020,25,/wiki/The_Box_(Roddy_Ricch_song)
6,January 18 2020,"""Yummy"" ◁",Justin Bieber,2,January 18 2020,2,/wiki/Yummy_(Justin_Bieber_song)
7,January 25 2020,"""Life Is Good""[I] ◁",Future featuring Drake,2,January 25 2020,17,/wiki/Life_Is_Good_(song)
8,February 1 2020,"""Godzilla"" ◁",Eminem featuring Juice Wrld,3,February 1 2020,1,/wiki/Godzilla_(Eminem_song)
9,February 8 2020,"""Don't Start Now""",Dua Lipa,2,March 21 2020,20,/wiki/Don%27t_Start_Now_(Dua_Lipa_song)
