# GNOD PROJECT

### Goals:
> To improve our music recommender Gnoosic by giving users two new possibilities when searching for recommendations:\
>\
> 1-) Songs that are actually similar to the ones they picked from an acoustic point of view.\
> 2-) Songs that are popular around the world right now, independently from their tastes.
>
### How:
> 1-) Collect new data sources for songs: APIs and/or web scraping.\
> 2-) Create clusters of songs that are similar to each other.\
> 3-) Prioritize the recommendations of songs from that same group.

## GNOD PROJECT PART 1

In [1]:
import requests

from bs4 import BeautifulSoup

import pandas as pd

In [2]:
# Download and parse the PopVortex top-100 songs chart.
url = 'https://www.popvortex.com/music/charts/top-100-songs.php'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

# Sanity check: the first chart entry, using the selector copied from the browser.
soup.select("#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p")


[<p class="title-artist"><cite class="title">Last Night</cite><em class="artist">Morgan Wallen</em></p>]

In [3]:
# Each chart entry is a <p> element; the sample output for chart position 1
# shows it holding a <cite> (song title) and an <em> (artist).
row_selector = 'div.chart-content.col-xs-12.col-sm-8 > p'
rows = soup.select(row_selector)

# Number of candidate chart rows found on the page.
num_iter = len(rows)

title = []
artist = []

# Read both fields from the same row so that a title can never be paired with
# the artist of a different entry.
for row in rows:
    # recursive=False restricts the search to direct children, matching the
    # "p > cite" / "p > em" selectors.
    title_tag = row.find('cite', recursive=False)
    artist_tag = row.find('em', recursive=False)
    if title_tag is None or artist_tag is None:
        # Not a complete title/artist entry; skipping it keeps both lists aligned.
        continue
    title.append(title_tag.get_text())
    artist.append(artist_tag.get_text())

In [4]:
# Combine the scraped titles and artists into a two-column table.
top100 = pd.DataFrame(dict(song=title, artist=artist))
top100

Unnamed: 0,song,artist
0,Last Night,Morgan Wallen
1,Flowers,Miley Cyrus
2,Fast Car,Luke Combs
3,Like Crazy (Deep House Remix),Jimin
4,Like Crazy (UK Garage Remix),Jimin
...,...,...
95,Head Above Water,Avril Lavigne
96,10:35,Tiësto & Tate McRae
97,Trump Won,Natasha Owens
98,I'm Good (Blue),David Guetta & Bebe Rexha


#### END OF PART 1

#### Expanding

> <b>Top 100 songs in 2021</b>

In [5]:
# Download and parse the playback.fm top-100 chart for 2021.
url3 = 'https://playback.fm/charts/top-100-songs/2021'

# A timeout keeps the cell from hanging forever on a stalled connection.
response3 = requests.get(url3, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response3.raise_for_status()

soup3 = BeautifulSoup(response3.content, "html.parser")

# The chart table is looked up by its CSS class.
my_table = soup3.find('table', {'class': 'chartTbl'})

In [6]:
#my_table

In [7]:
# All links inside the chart table; find_all is the current name of the
# legacy findAll alias.
songs = my_table.find_all('a')

In [8]:
#songs

In [9]:
# Query the document once instead of re-running both selectors on every
# loop iteration.
song_tags = soup3.select('.song')
artist_tags = soup3.select('.artist')

# Every second `.song` match is paired with consecutive `.artist` matches
# (presumably each chart row exposes two `.song` elements -- TODO confirm
# against the page markup).
# A comprehension is used so the `top100` DataFrame and the `artist` list
# built in Part 1 are not overwritten by loop variables.
top100_2021 = pd.DataFrame([
    {'Artist': artist_tags[pos].text.strip(), 'Song': song_tag.text.strip()}
    for pos, song_tag in enumerate(song_tags[::2])
])

In [10]:
top100_2021

Unnamed: 0,Artist,Song
0,Dua Lipa & DaBaby,Levitating
1,Olivia Rodrigo,Drivers License
2,The Weeknd & Ariana Grande,Save Your Tears
3,Lil Nas X,Montero (Call Me by Your Name)
4,The Weeknd,Blinding Lights
...,...,...
95,Marshmello & Jonas Brothers,Leave Before You Love Me
96,Maneskin,Beggin
97,Chris Young + Kane Brown,Famous Friends
98,Nelly & Florida Georgia Line,Lil Bit


> <b>Top 100 - 2020</b>

In [11]:
# Download and parse the playback.fm top-100 chart for 2020.
url4 = 'https://playback.fm/charts/top-100-songs/2020'

# A timeout keeps the cell from hanging forever on a stalled connection.
response4 = requests.get(url4, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response4.raise_for_status()

soup4 = BeautifulSoup(response4.content, "html.parser")

# NOTE(review): my_table2 and songs are not read again in this cell; they are
# kept in case other cells rely on them.
my_table2 = soup4.find('table', {'class': 'chartTbl'})
songs = my_table2.find_all('a')

# Query the document once instead of on every loop iteration.
song_tags = soup4.select('.song')
artist_tags = soup4.select('.artist')

# Every second `.song` match is paired with consecutive `.artist` matches
# (presumably each chart row exposes two `.song` elements -- TODO confirm).
# The comprehension keeps loop variables from overwriting the `artist` list
# built in Part 1.
top100_2 = [
    {'Artist': artist_tags[pos].text.strip(), 'Song': song_tag.text.strip()}
    for pos, song_tag in enumerate(song_tags[::2])
]

top100_2020 = pd.DataFrame(top100_2)

top100_2020

Unnamed: 0,Artist,Song
0,The Weeknd,Blinding Lights
1,Post Malone,Circles
2,Cardi B featuring Megan Thee Stallion,Wap
3,Dua Lipa,Don't Start Now
4,Roddy Ricch,The Box
...,...,...
95,Ava Max,Kings & Queens
96,AJR,Bang!
97,Juice Wrld & YoungBoy Never Broke Again,Bandit
98,Dua Lipa,Physical


> <b>Top 100 - 2019</b>

In [12]:
# Download and parse the playback.fm top-100 chart for 2019.
url5 = 'https://playback.fm/charts/top-100-songs/2019'

# A timeout keeps the cell from hanging forever on a stalled connection.
response5 = requests.get(url5, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response5.raise_for_status()

soup5 = BeautifulSoup(response5.content, "html.parser")

# NOTE(review): my_table3 and songs are not read again in this cell; they are
# kept in case other cells rely on them.
my_table3 = soup5.find('table', {'class': 'chartTbl'})
songs = my_table3.find_all('a')

# Query the document once instead of on every loop iteration.
song_tags = soup5.select('.song')
artist_tags = soup5.select('.artist')

# Every second `.song` match is paired with consecutive `.artist` matches
# (presumably each chart row exposes two `.song` elements -- TODO confirm).
# The comprehension keeps loop variables from overwriting the `artist` list
# built in Part 1.
top100_3 = [
    {'Artist': artist_tags[pos].text.strip(), 'Song': song_tag.text.strip()}
    for pos, song_tag in enumerate(song_tags[::2])
]

top100_2019 = pd.DataFrame(top100_3)

top100_2019


Unnamed: 0,Artist,Song
0,Lil Nas X featuring Billy Ray Cyrus,Old Town Road
1,Billie Eilish,Bad Guy
2,Post Malone & Swae Lee,Sunflower (Spider-Man: Into The Spider-Verse)
3,Ariana Grande,7 Rings
4,Halsey,Without Me
...,...,...
95,Blackbear,Hot Girl Bummer
96,Luke Combs,Even Though I'm Leaving
97,Summer Walker,Playing Games
98,Post Malone & Ozzy Osbourne & Travi$ Scott,Take What You Want


In [13]:
# Download and parse the playback.fm top-100 chart for 2018.
url6 = 'https://playback.fm/charts/top-100-songs/2018'

# A timeout keeps the cell from hanging forever on a stalled connection.
response6 = requests.get(url6, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response6.raise_for_status()

soup6 = BeautifulSoup(response6.content, "html.parser")

# NOTE(review): my_table4 and songs are not read again in this cell; they are
# kept in case other cells rely on them.
my_table4 = soup6.find('table', {'class': 'chartTbl'})
songs = my_table4.find_all('a')

# Query the document once instead of on every loop iteration.
song_tags = soup6.select('.song')
artist_tags = soup6.select('.artist')

# Every second `.song` match is paired with consecutive `.artist` matches
# (presumably each chart row exposes two `.song` elements -- TODO confirm).
# The comprehension keeps loop variables from overwriting the `artist` list
# built in Part 1.
top100_4 = [
    {'Artist': artist_tags[pos].text.strip(), 'Song': song_tag.text.strip()}
    for pos, song_tag in enumerate(song_tags[::2])
]

top100_2018 = pd.DataFrame(top100_4)

top100_2018

Unnamed: 0,Artist,Song
0,Drake,God's Plan
1,Ed Sheeran,Perfect
2,Maroon 5 featuring Cardi B,Girls Like You
3,Bebe Rexha & Florida Georgia Line,Meant To Be
4,Camila Cabello featuring Young Thug,Havana
...,...,...
95,Demi Lovato,Sorry Not Sorry
96,MAX featuring gnash,Lights Down Low
97,"Jonas Blue, Jack & Jack",Rise
98,Khalid,Young Dumb & Broke


#### Wrapping the scrape in a function that takes the year at the end of the URL as a parameter should work better...

In [14]:
def get_top100(start_year, end_year):
    """Scrape the playback.fm top-100 chart for every year in a range.

    Args:
        start_year: First chart year to download.
        end_year: Last chart year to download (inclusive).

    Returns:
        A DataFrame with ``Artist``, ``Song`` and ``Year`` columns. Each
        year's rows keep their own 0-based index, so index values repeat
        across years. An empty DataFrame is returned when the range
        contains no years.

    Raises:
        requests.HTTPError: If a chart page answers with an error status.
        IndexError: If a page has fewer ``.artist`` matches than the
            ``.song`` matches require.
    """
    yearly_frames = []
    for year in range(start_year, end_year + 1):
        url = f"https://playback.fm/charts/top-100-songs/{year}"
        # A timeout keeps one stalled request from blocking the whole range.
        response = requests.get(url, timeout=30)
        # Stop on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Query the document once per page rather than twice per row.
        song_tags = soup.select('.song')
        artist_tags = soup.select('.artist')

        # Every second `.song` match is paired with consecutive `.artist`
        # matches (presumably each chart row exposes two `.song` elements --
        # TODO confirm against the page markup).
        top100 = [
            {'Artist': artist_tags[pos].text.strip(),
             'Song': song_tag.text.strip()}
            for pos, song_tag in enumerate(song_tags[::2])
        ]
        top100_df = pd.DataFrame(top100)
        top100_df['Year'] = year
        yearly_frames.append(top100_df)

    # pd.concat rejects an empty list, so an empty range is handled here.
    if not yearly_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(yearly_frames)
  
# Any span from 1900 to 2021 can be requested here, but a wide range takes a
# very long time to load, so the range is limited.
top100_all_years = get_top100(1990, 2017)
top100_all_years

Unnamed: 0,Artist,Song,Year
0,Sinead O'Connor,Nothing Compares 2 U,1990
1,Madonna,Vogue,1990
2,Londonbeat,I've Been Thinking About You,1990
3,MC Hammer,U Can't Touch This,1990
4,Vanilla Ice,Ice Ice Baby,1990
...,...,...,...
95,Kane Brown featuring Lauren Alaina,What Ifs,2017
96,Luke Combs,Hurricane,2017
97,Playboi Carti,Magnolia,2017
98,Camila Cabello,Never Be the Same,2017


In [29]:
# A modified version of the function that fetches just one year. It also
# accepts an optional end year, so the two-argument call used with the
# earlier definition of the same name keeps working after this cell runs.
def _scrape_top100_year(year):
    """Return one year's playback.fm chart as an Artist/Song DataFrame."""
    url = f"https://playback.fm/charts/top-100-songs/{year}"
    # A timeout keeps a stalled request from blocking the cell forever.
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Query the document once per page rather than twice per row.
    song_tags = soup.select('.song')
    artist_tags = soup.select('.artist')

    # Every second `.song` match is paired with consecutive `.artist` matches
    # (presumably each chart row exposes two `.song` elements -- TODO confirm
    # against the page markup).
    return pd.DataFrame([
        {'Artist': artist_tags[pos].text.strip(),
         'Song': song_tag.text.strip()}
        for pos, song_tag in enumerate(song_tags[::2])
    ])


def get_top100(year, end_year=None):
    """Scrape the playback.fm top-100 chart for one year or a range of years.

    Args:
        year: Chart year to download, or the first year of the range when
            ``end_year`` is given.
        end_year: Optional last year of the range (inclusive).

    Returns:
        With ``end_year`` omitted, a DataFrame with ``Artist`` and ``Song``
        columns for ``year``. With ``end_year`` given, the charts of all
        years in the range stacked in one DataFrame with an extra ``Year``
        column; an empty DataFrame when the range contains no years.

    Raises:
        requests.HTTPError: If a chart page answers with an error status.
        IndexError: If a page has fewer ``.artist`` matches than the
            ``.song`` matches require.
    """
    if end_year is None:
        return _scrape_top100_year(year)

    yearly_frames = []
    for chart_year in range(year, end_year + 1):
        frame = _scrape_top100_year(chart_year)
        frame['Year'] = chart_year
        yearly_frames.append(frame)

    # pd.concat rejects an empty list, so an empty range is handled here.
    if not yearly_frames:
        return pd.DataFrame()
    return pd.concat(yearly_frames)

# Single-year example: fetch and display the chart for 1900.
top100_1900 = get_top100(1900)
top100_1900

Unnamed: 0,Artist,Song
0,Arthur Collins,Ma Tiger Lily
1,Albert Campbell,Ma Blushin' Rosie
2,George J Gaskin,When Cloe Sings a Song
3,Haydn Quartet,Because
4,George J Gaskin,When You Were Sweet Sixteen
5,Louis Bradfield,I Want to Be a Military Man
6,Arthur Collins,Mandy Lee
7,Jere Mahoney,When You Were Sweet Sixteen
8,Len Spencer,Ma Tiger Lily
9,Jere Mahoney,A Bird in a Gilded Cage


> Top 10 Cheating Songs of all time... for the comic relief

In [15]:
# Download and parse the playback.fm "top songs about cheating" page.
url2 = 'https://playback.fm/top-songs-about-cheating'

# A timeout keeps the cell from hanging forever on a stalled connection.
response2 = requests.get(url2, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response2.raise_for_status()

soup2 = BeautifulSoup(response2.content, "html.parser")

# Sanity check: the first song heading, using the selector copied from the browser.
soup2.select("body > div.wrappercenter > div.wrapperbody.cf > div.playlist.post > div:nth-child(2) > h2")

[<h2>
 Your Cheatin’ Heart by Hank Williams
 </h2>]

In [16]:
# Select every <h2> on the page; besides the song headings, the result also
# contains section headings that are not songs.
soup2.select("h2")

[<h2>
 Your Cheatin’ Heart by Hank Williams
 </h2>,
 <h2>
 Lyin’ Eyes by Eagles
 </h2>,
 <h2>
 Pale Blue Eyes by the Velvet Underground
 </h2>,
 <h2>
 Me and Mrs. Jones by Billy Paul
 </h2>,
 <h2>
 You Make Loving Fun by Fleetwood Mac
 </h2>,
 <h2>
 You Know I'm No Good by Amy Winehouse
 </h2>,
 <h2>
 You Oughta Know by Alanis Morissette
 </h2>,
 <h2>
 The Weakness in Me by Joan Armatrading
 </h2>,
 <h2>
 I Heard It Through the Grapevine by Marvin Gaye
 </h2>,
 <h2>
 Jolene by Dolly Parton
 </h2>,
 <h2>Top 10 Country Songs in 1958</h2>,
 <h2 class="margin top cf">Some cool stuff...</h2>,
 <h2>You might also like</h2>]

In [17]:
# Collect the text of every <h2> heading on the page. The selection is made
# once and iterated directly instead of being indexed by position.
cheating_list = soup2.select("h2")
num_iter = len(cheating_list)
top_cheating = [heading.get_text() for heading in cheating_list]
    

In [18]:
top_cheating

['\nYour Cheatin’ Heart by Hank Williams\n',
 '\nLyin’ Eyes by Eagles\n',
 '\nPale Blue Eyes by the Velvet Underground\n',
 '\nMe and Mrs. Jones by Billy Paul\n',
 '\nYou Make Loving Fun by Fleetwood Mac\n',
 "\nYou Know I'm No Good by Amy Winehouse\n",
 '\nYou Oughta Know by Alanis Morissette\n',
 '\nThe Weakness in Me by Joan Armatrading\n',
 '\nI Heard It Through the Grapevine by Marvin Gaye\n',
 '\nJolene by Dolly Parton\n',
 'Top 10 Country Songs in 1958',
 'Some cool stuff...',
 'You might also like']

In [19]:
# Headings that share the <h2> tag with the songs but are not songs.
non_song_headings = (
    'Top 10 Country Songs in 1958',
    'Some cool stuff...',
    'You might also like',
)

# Drop the non-song headings, then trim the surrounding whitespace.
cleaned_list = [
    heading.strip()
    for heading in top_cheating
    if heading not in non_song_headings
]

In [20]:
cleaned_list 

['Your Cheatin’ Heart by Hank Williams',
 'Lyin’ Eyes by Eagles',
 'Pale Blue Eyes by the Velvet Underground',
 'Me and Mrs. Jones by Billy Paul',
 'You Make Loving Fun by Fleetwood Mac',
 "You Know I'm No Good by Amy Winehouse",
 'You Oughta Know by Alanis Morissette',
 'The Weakness in Me by Joan Armatrading',
 'I Heard It Through the Grapevine by Marvin Gaye',
 'Jolene by Dolly Parton']

In [21]:
# Split each "<song> by <artist>" heading into its two parts.
song_artist_list = []
for item in cleaned_list:
    # Split on the last " by " (with spaces): a bare "by" also matches inside
    # words such as "Baby", and a song title may itself contain " by ".
    song_title, performer = item.rsplit(" by ", 1)
    song_artist_list.append(
        {"song": song_title.strip(), "artist": performer.strip()}
    )

In [22]:
song_artist_list 

[{'song': 'Your Cheatin’ Heart', 'artist': 'Hank Williams'},
 {'song': 'Lyin’ Eyes', 'artist': 'Eagles'},
 {'song': 'Pale Blue Eyes', 'artist': 'the Velvet Underground'},
 {'song': 'Me and Mrs. Jones', 'artist': 'Billy Paul'},
 {'song': 'You Make Loving Fun', 'artist': 'Fleetwood Mac'},
 {'song': "You Know I'm No Good", 'artist': 'Amy Winehouse'},
 {'song': 'You Oughta Know', 'artist': 'Alanis Morissette'},
 {'song': 'The Weakness in Me', 'artist': 'Joan Armatrading'},
 {'song': 'I Heard It Through the Grapevine', 'artist': 'Marvin Gaye'},
 {'song': 'Jolene', 'artist': 'Dolly Parton'}]

In [23]:
# Build a table with one row per song dict.
top10_cheating=pd.DataFrame(song_artist_list)

In [24]:
top10_cheating

Unnamed: 0,song,artist
0,Your Cheatin’ Heart,Hank Williams
1,Lyin’ Eyes,Eagles
2,Pale Blue Eyes,the Velvet Underground
3,Me and Mrs. Jones,Billy Paul
4,You Make Loving Fun,Fleetwood Mac
5,You Know I'm No Good,Amy Winehouse
6,You Oughta Know,Alanis Morissette
7,The Weakness in Me,Joan Armatrading
8,I Heard It Through the Grapevine,Marvin Gaye
9,Jolene,Dolly Parton


In [25]:
# Reverse the row order; the original index labels travel with their rows.
top10_cheating = top10_cheating.iloc[::-1]

In [26]:
top10_cheating

Unnamed: 0,song,artist
9,Jolene,Dolly Parton
8,I Heard It Through the Grapevine,Marvin Gaye
7,The Weakness in Me,Joan Armatrading
6,You Oughta Know,Alanis Morissette
5,You Know I'm No Good,Amy Winehouse
4,You Make Loving Fun,Fleetwood Mac
3,Me and Mrs. Jones,Billy Paul
2,Pale Blue Eyes,the Velvet Underground
1,Lyin’ Eyes,Eagles
0,Your Cheatin’ Heart,Hank Williams
