# GNOD PROJECT

### Goals:
> To improve our music recommender Gnoosic by giving users two new possibilities when searching for recommendations:\
>\
> 1-) Songs that are actually similar to the ones they picked from an acoustic point of view.\
> 2-) Songs that are popular around the world right now, independently from their tastes.
>
### How:
> 1-) Collect new data sources for songs; APIs and/or web scrapping.\
> 2-) Create clusters of songs that are similar to each other.\
> 3-) Prioritize the recommendations of songs from that same group.

## GNOD PROJECT PART 1

In [1]:
import requests

from bs4 import BeautifulSoup

import pandas as pd

In [2]:
url='https://www.popvortex.com/music/charts/top-100-songs.php'

response = requests.get(url)
response.status_code 

soup = BeautifulSoup(response.content, "html.parser")
#soup

soup.select("#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p")  # copy SELECTOR


[<p class="title-artist"><cite class="title">Last Night</cite><em class="artist">Morgan Wallen</em></p>]

In [3]:
# Creating empty lists
title = []
artist = []

# Assigning the whole selection of Song & Artist
num_iter = len(soup.select('div.chart-content.col-xs-12.col-sm-8 > p'))



num_iter

# Separating the song and artist
artist_list = soup.select('div.chart-content.col-xs-12.col-sm-8 > p > em')
title_list = soup.select('div.chart-content.col-xs-12.col-sm-8 > p > cite')



# iterate through the result set and retrive all the data
for i in range(num_iter):
    artist.append(artist_list[i].get_text())
    title.append(title_list[i].get_text())

#print(title)
#print(artist)

In [4]:
# Storing it to a dataframe
top100_2023 = pd.DataFrame({"Artist":artist, "Song":title})
top100_2023

Unnamed: 0,Artist,Song
0,Morgan Wallen,Last Night
1,Miley Cyrus,Flowers
2,Luke Combs,Fast Car
3,Jimin,Like Crazy (Deep House Remix)
4,Jimin,Like Crazy (UK Garage Remix)
...,...,...
95,Luke Combs,"Going, Going, Gone"
96,Russ,NASTY
97,Bad Omens,Just Pretend
98,Rema,Calm Down


In [5]:
year=2023

In [6]:
top100_2023['Year']=year

In [7]:
top100_2023

Unnamed: 0,Artist,Song,Year
0,Morgan Wallen,Last Night,2023
1,Miley Cyrus,Flowers,2023
2,Luke Combs,Fast Car,2023
3,Jimin,Like Crazy (Deep House Remix),2023
4,Jimin,Like Crazy (UK Garage Remix),2023
...,...,...,...
95,Luke Combs,"Going, Going, Gone",2023
96,Russ,NASTY,2023
97,Bad Omens,Just Pretend,2023
98,Rema,Calm Down,2023


#### Expanding

In [8]:

from time import sleep
from random import randint


In [9]:
def get_top100(start_year, end_year):
    all_top100 = pd.DataFrame()
    for year in range(start_year, end_year+1):
        url = f"https://playback.fm/charts/top-100-songs/{year}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        my_table = soup.find('table', {'class': 'chartTbl'})
        songs = my_table.findAll('a')
        top100 = []
        for i in range(0, len(soup.select('.song')), 2):
            artist = soup.select('.artist')[i//2].text.strip()
            song = soup.select('.song')[i].text.strip()
            top100.append({'Artist': artist, 'Song': song})
        top100_df = pd.DataFrame(top100)
        top100_df['Year'] = year
        all_top100 = pd.concat([all_top100, top100_df])
    for i in range(8):
        print(i)
        wait_time = randint(1,4000)
        print("I will sleep for " + str(wait_time/1000) + " seconds.")
        sleep(wait_time/1000)
    return all_top100
  
top100_all_years = get_top100(2010, 2017)  # i can select here from 1900 to 2021 but it takes ages to load. Limiting a bit
top100_all_years

0
I will sleep for 2.374 seconds.
1
I will sleep for 2.798 seconds.
2
I will sleep for 0.948 seconds.
3
I will sleep for 2.321 seconds.
4
I will sleep for 2.688 seconds.
5
I will sleep for 1.422 seconds.
6
I will sleep for 2.438 seconds.
7
I will sleep for 2.95 seconds.


Unnamed: 0,Artist,Song,Year
0,Eminem & Rihanna,Love The Way You Lie,2010
1,Bruno Mars,Just the Way You Are,2010
2,Katy Perry & Snoop Dogg,California Gurls,2010
3,Katy Perry,Firework,2010
4,Rihanna,Only girl (in the world),2010
...,...,...,...
95,Kane Brown featuring Lauren Alaina,What Ifs,2017
96,Luke Combs,Hurricane,2017
97,Playboi Carti,Magnolia,2017
98,Camila Cabello,Never Be the Same,2017


In [None]:
top100_all_years=top100_all_years.iloc[::-1]  # inverting the dataframe

#### Disclaimer: I have used a combination of stackoverflow and chatGPT to get the final codes

> Top 10 Cheating Songs of all time... for the comic relief - for now I am commeting it out

# url2='https://playback.fm/top-songs-about-cheating'

response2 = requests.get(url2)
response2.status_code 

soup2 = BeautifulSoup(response2.content, "html.parser")
#soup

soup2.select("body > div.wrappercenter > div.wrapperbody.cf > div.playlist.post > div:nth-child(2) > h2")  # copy SELECTOR

soup2.select("h2") # copy SELECTOR

top_cheating=[]
num_iter = len(soup2.select('h2'))
cheating_list=soup2.select("h2")
for i in range(num_iter):
    top_cheating.append(cheating_list[i].get_text())
    

top_cheating

cleaned_list = [song.strip() for song in top_cheating if song not in ['Top 10 Country Songs in 1958', 'Some cool stuff...', 'You might also like']]

cleaned_list 

song_artist_list = []
for item in cleaned_list:
    song, artist = item.split("by")
    song_artist_dict = {"song": song.strip(), "artist": artist.strip()}
    song_artist_list.append(song_artist_dict)

song_artist_list 

top10_cheating=pd.DataFrame(song_artist_list)

top10_cheating

top10_cheating = top10_cheating.iloc[::-1]

top10_cheating

## END OF PART 1 / START OF PART 2

##### Bringing the list of songs together. For now I have just selected Top 100 from the past 10 years

In [10]:
df=pd.concat([top100_2023, top100_all_years], axis=0).reset_index(drop=True)
df

Unnamed: 0,Artist,Song,Year
0,Morgan Wallen,Last Night,2023
1,Miley Cyrus,Flowers,2023
2,Luke Combs,Fast Car,2023
3,Jimin,Like Crazy (Deep House Remix),2023
4,Jimin,Like Crazy (UK Garage Remix),2023
...,...,...,...
893,Kane Brown featuring Lauren Alaina,What Ifs,2017
894,Luke Combs,Hurricane,2017
895,Playboi Carti,Magnolia,2017
896,Camila Cabello,Never Be the Same,2017


In [11]:
df['Artist'] = df['Artist'].str.lower()
df['Song'] = df['Song'].str.lower()

In [12]:
df.shape

(898, 3)

In [None]:
#creating a input built into a function to generate a random choice of songs that are NOT the one just added by the user

In [41]:
def random_music(df):
    preferred_song = input("Enter a song you like: ")
    if preferred_song in df['Song'].values:
        random_song = df[df['Song'] != preferred_song].sample()
        recommended_song = random_song['Song'].item()
        recommended_artist = random_song['Artist'].item()
        print('Here is a recommendation of a song you might dig: {} by {}'.format(recommended_song, recommended_artist))
    else:
        print('Sorry, we don´t have a recommendation for that song')

In [42]:
random_music(df)

Enter a song you like: hurricane
Here is a recommendation of a song you might dig: get lucky by daft punk
