## Scrape data about "hot songs" from popvortex.com

In [144]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
from random import randint

In [2]:
url = "http://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
response = requests.get(url)
response.status_code

200

In [11]:
soup = BeautifulSoup(response.content, "html.parser")

In [70]:
soup_top100=soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper")[0].select("p")

In [67]:
title=[]
artist=[]
for song in soup_top100:
    if len(song.select("cite"))!=0:
        title.append(song.select("cite")[0].text)
    if len(song.select("em"))!=0:
        artist.append(song.select("em")[0].text)

In [139]:
df_pop_vortex_2022 = pd.DataFrame({"title":title,
                       "artist":artist
                      })

df_pop_vortex_2022['year']=2022

df_pop_vortex_2022.head()

Unnamed: 0,title,artist,year
0,Separate Ways (Worlds Apart) [feat. Lzzy Hale],Daughtry,2022
1,Unholy,Sam Smith & Kim Petras,2022
2,Heart Like A Truck,Lainey Wilson,2022
3,Anti-Hero,Taylor Swift,2022
4,Son Of A Sinner,Jelly Roll,2022


## Expand the project

### Defining functions for web scraping playbackfm song tables

In [151]:
def sleep_random(max_sec):
    random_sec=randint(1,max_sec*1000)/1000
    sleep(random_sec)
    return random_sec

In [164]:
def web_scrap_playbackfm(base_link,year):
    url=base_link+str(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    soup_top100=soup.select("#myTable")[0].select("a")
    ignore_index_lst1=range(0,400,4)
    ignore_index_lst2=range(3,400,4)
    cnt=0
    lst_title_artist=[]
    for song in soup_top100:
        if cnt in ignore_index_lst1 or cnt in ignore_index_lst2:
            cnt+=1
            continue
        else:
            lst_title_artist.append(song.text.replace("►","").strip())
        cnt+=1
    pop_songs_titles=[]
    pop_songs_artists=[]

    i=0

    while i+1< len(lst_title_artist):
        pop_songs_artists.append(lst_title_artist[i])
        pop_songs_titles.append(lst_title_artist[i+1])
        i+=2
    df_pop_songs = pd.DataFrame({"title":pop_songs_titles,
                           "artist":pop_songs_artists
                          })
    df_pop_songs['year']=year
    return df_pop_songs

### Extracting POP songs from 1900 to 2021 from playback.fm

In [158]:
df_pop_songs=[]
base_link="https://playback.fm/charts/top-100-songs/"
for year in range(1900,2022,1):
    print("Web Scraping Pop Songs for Year: ",year)
    df_pop_songs.append(web_scrap_playbackfm(base_link,year))
    print("Sleeping for ",sleep_random(4)," seconds..")
    

Web Scraping Pop Songs for Year:  1900
Sleeping for  1.962  seconds..
Web Scraping Pop Songs for Year:  1901
Sleeping for  0.765  seconds..
Web Scraping Pop Songs for Year:  1902
Sleeping for  2.67  seconds..
Web Scraping Pop Songs for Year:  1903
Sleeping for  0.72  seconds..
Web Scraping Pop Songs for Year:  1904
Sleeping for  3.414  seconds..
Web Scraping Pop Songs for Year:  1905
Sleeping for  1.305  seconds..
Web Scraping Pop Songs for Year:  1906
Sleeping for  2.545  seconds..
Web Scraping Pop Songs for Year:  1907
Sleeping for  0.915  seconds..
Web Scraping Pop Songs for Year:  1908
Sleeping for  3.055  seconds..
Web Scraping Pop Songs for Year:  1909
Sleeping for  0.863  seconds..
Web Scraping Pop Songs for Year:  1910
Sleeping for  1.494  seconds..
Web Scraping Pop Songs for Year:  1911
Sleeping for  3.263  seconds..
Web Scraping Pop Songs for Year:  1912
Sleeping for  1.906  seconds..
Web Scraping Pop Songs for Year:  1913
Sleeping for  1.249  seconds..
Web Scraping Pop Songs

Sleeping for  1.535  seconds..
Web Scraping Pop Songs for Year:  2018
Sleeping for  3.428  seconds..
Web Scraping Pop Songs for Year:  2019
Sleeping for  1.483  seconds..
Web Scraping Pop Songs for Year:  2020
Sleeping for  2.343  seconds..
Web Scraping Pop Songs for Year:  2021
Sleeping for  3.976  seconds..


In [161]:
df_pop_songs_all=pd.DataFrame({'title' : [], 'artist' : [], 'year' : []})

for df in df_pop_songs:
    df_pop_songs_all=pd.concat([df_pop_songs_all,df],axis=0)

df_pop_songs_all['genre']='pop'
df_pop_songs_all.to_csv("output_files/df_pop_songs.csv",index=False)

### Extracting ROCK songs from 1955 to 2021 from playback.fm

In [167]:
df_rock_songs=[]
base_link="https://playback.fm/charts/rock/"
for year in range(1955,2022,1):
    print("Web Scraping Rock Songs for Year: ",year)
    df_rock_songs.append(web_scrap_playbackfm(base_link,year))
    print("Sleeping for ",sleep_random(4)," seconds..")

Web Scraping Rock Songs for Year:  1955
Sleeping for  1.383  seconds..
Web Scraping Rock Songs for Year:  1956
Sleeping for  2.665  seconds..
Web Scraping Rock Songs for Year:  1957
Sleeping for  1.956  seconds..
Web Scraping Rock Songs for Year:  1958
Sleeping for  1.169  seconds..
Web Scraping Rock Songs for Year:  1959
Sleeping for  1.11  seconds..
Web Scraping Rock Songs for Year:  1960
Sleeping for  0.925  seconds..
Web Scraping Rock Songs for Year:  1961
Sleeping for  3.136  seconds..
Web Scraping Rock Songs for Year:  1962
Sleeping for  0.884  seconds..
Web Scraping Rock Songs for Year:  1963
Sleeping for  3.302  seconds..
Web Scraping Rock Songs for Year:  1964
Sleeping for  0.928  seconds..
Web Scraping Rock Songs for Year:  1965
Sleeping for  3.721  seconds..
Web Scraping Rock Songs for Year:  1966
Sleeping for  3.451  seconds..
Web Scraping Rock Songs for Year:  1967
Sleeping for  2.31  seconds..
Web Scraping Rock Songs for Year:  1968
Sleeping for  3.054  seconds..
Web Scra

In [168]:
df_rock_songs_all=pd.DataFrame({'title' : [], 'artist' : [], 'year' : []})

for df in df_rock_songs:
    df_rock_songs_all=pd.concat([df_rock_songs_all,df],axis=0)

df_rock_songs_all['genre']='rock'    
df_rock_songs_all.to_csv("output_files/df_rock_songs.csv",index=False)

### Extracting COUNTRY songs from 1944 to 2021 from playback.fm

In [171]:
df_country_songs=[]
base_link="https://playback.fm/charts/country/"
for year in range(1944,2022,1):
    print("Web Scraping Country Songs for Year: ",year)
    df_country_songs.append(web_scrap_playbackfm(base_link,year))
    print("Sleeping for ",sleep_random(4)," seconds..")

Web Scraping Country Songs for Year:  1944
Sleeping for  2.333  seconds..
Web Scraping Country Songs for Year:  1945
Sleeping for  2.117  seconds..
Web Scraping Country Songs for Year:  1946
Sleeping for  0.184  seconds..
Web Scraping Country Songs for Year:  1947
Sleeping for  1.926  seconds..
Web Scraping Country Songs for Year:  1948
Sleeping for  3.848  seconds..
Web Scraping Country Songs for Year:  1949
Sleeping for  1.094  seconds..
Web Scraping Country Songs for Year:  1950
Sleeping for  3.635  seconds..
Web Scraping Country Songs for Year:  1951
Sleeping for  2.745  seconds..
Web Scraping Country Songs for Year:  1952
Sleeping for  2.582  seconds..
Web Scraping Country Songs for Year:  1953
Sleeping for  2.935  seconds..
Web Scraping Country Songs for Year:  1954
Sleeping for  1.946  seconds..
Web Scraping Country Songs for Year:  1955
Sleeping for  2.166  seconds..
Web Scraping Country Songs for Year:  1956
Sleeping for  2.739  seconds..
Web Scraping Country Songs for Year:  

In [210]:
df_country_songs_all=pd.DataFrame({'title' : [], 'artist' : [], 'year' : []})

for df in df_country_songs:
    df_country_songs_all=pd.concat([df_country_songs_all,df],axis=0)

df_country_songs_all['genre']='country'    
df_country_songs_all.to_csv("output_files/df_country_songs.csv",index=False)

### Extracting R&B songs from 1942 to 2021 from playback.fm

In [211]:
df_rnb_songs=[]
base_link="https://playback.fm/charts/rnb/"
for year in range(1942,2022,1):
    print("Web Scraping RnB Songs for Year: ",year)
    df_rnb_songs.append(web_scrap_playbackfm(base_link,year))
    print("Sleeping for ",sleep_random(4)," seconds..")

Web Scraping RnB Songs for Year:  1942
Sleeping for  3.303  seconds..
Web Scraping RnB Songs for Year:  1943
Sleeping for  0.503  seconds..
Web Scraping RnB Songs for Year:  1944
Sleeping for  0.888  seconds..
Web Scraping RnB Songs for Year:  1945
Sleeping for  2.99  seconds..
Web Scraping RnB Songs for Year:  1946
Sleeping for  0.29  seconds..
Web Scraping RnB Songs for Year:  1947
Sleeping for  0.841  seconds..
Web Scraping RnB Songs for Year:  1948
Sleeping for  0.018  seconds..
Web Scraping RnB Songs for Year:  1949
Sleeping for  1.133  seconds..
Web Scraping RnB Songs for Year:  1950
Sleeping for  2.648  seconds..
Web Scraping RnB Songs for Year:  1951
Sleeping for  0.515  seconds..
Web Scraping RnB Songs for Year:  1952
Sleeping for  3.931  seconds..
Web Scraping RnB Songs for Year:  1953
Sleeping for  2.417  seconds..
Web Scraping RnB Songs for Year:  1954
Sleeping for  1.241  seconds..
Web Scraping RnB Songs for Year:  1955
Sleeping for  1.098  seconds..
Web Scraping RnB Songs

In [220]:
df_rnb_songs_all=pd.DataFrame({'title' : [], 'artist' : [], 'year' : []})

for df in df_rnb_songs:
    df_rnb_songs_all=pd.concat([df_rnb_songs_all,df],axis=0)

df_rnb_songs_all['genre']='rnb'    
df_rnb_songs_all.to_csv("output_files/df_rnb_songs.csv",index=False)

In [221]:
df_all_songs=pd.concat([df_pop_songs_all,df_rock_songs_all,df_country_songs_all,df_rnb_songs_all],axis=0)
df_all_songs.to_csv("output_files/df_all_songs.csv",index=False)

## Practice web scraping

Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'

In [172]:
url = 'https://en.wikipedia.org/wiki/Python'
response = requests.get(url)
display(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

200

In [209]:
link_list=[]
for element in soup.find_all("a"):
    link=element.get("href")
    if link is not None:
        if ("/wiki" in link):
            if link.startswith('/wiki'):
                link_list.append(url+link)
            else:
                link_list.append(link)
display(link_list)

['https://en.wiktionary.org/wiki/Python',
 'https://en.wiktionary.org/wiki/python',
 'https://en.wikipedia.org/wiki/Python/wiki/Pythonidae',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(genus)',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(mythology)',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(programming_language)',
 'https://en.wikipedia.org/wiki/Python/wiki/CMU_Common_Lisp',
 'https://en.wikipedia.org/wiki/Python/wiki/PERQ#PERQ_3',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_of_Aenus',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(painter)',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_of_Byzantium',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_of_Catana',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_Anghelo',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(Efteling)',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(Busch_Gardens_Tampa_Bay)',
 'https://en.wikipedia.org/wiki/Python/wiki/Python_(Coney_Island,_Cincinnati,_O

Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [258]:
def isfloat(num):
    try:
        float(num)
        return True
    except ValueError:
        return False

In [212]:
url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'
response = requests.get(url)
display(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

200

In [260]:
languages=[]
for element in soup.select("table")[0].select("tr td"):
    if len(element.select(".mw-redirect"))>0:
        languages.append(element.select(".mw-redirect")[0].text.strip())
    
display(languages)

27

In [261]:
population=[]
for element in soup.select("table")[0].select("tr td"):
    if element.text.strip().isnumeric() or isfloat(element.text.strip()):
        population.append(element.text.strip())
display(population)

27

In [262]:
df_popular_languages = pd.DataFrame({"language":languages,
                       "population":population
                      })
df_popular_languages.head()

Unnamed: 0,language,population
0,Mandarin Chinese,920
1,Spanish,475
2,English,373
3,Hindi,344
4,Bengali,234


In [265]:
df_popular_languages = df_popular_languages.astype({'population':'float'})

In [266]:
df_popular_languages.sort_values(by=['population'],ascending=False).head(10)

Unnamed: 0,language,population
0,Mandarin Chinese,920.0
1,Spanish,475.0
2,English,373.0
3,Hindi,344.0
4,Bengali,234.0
5,Portuguese,232.0
6,Russian,154.0
7,Japanese,125.0
8,Yue Chinese,85.2
9,Vietnamese,84.6
