In [1]:
import pandas as pd
import numpy as np
import requests
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import os
from datetime import datetime, timedelta
from time import sleep
from bs4 import BeautifulSoup
from typing import Tuple, List, NamedTuple
from db_management.db import SongInfo, IDSongInfo, SongsContainer, SongsDB, DBException

In [2]:
CLIENT_ID = os.environ["SPOTIFY_CLIENT_ID"]
CLIENT_SECRET = os.environ["SPOTIFY_CLIENT_SECRET"]

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, redirect_uri='http://example.com', scope="playlist-modify-public"))

track_name = "Save Your Tears"
artist_name = "Weeknd"

#a = sp.search(q=f"track: {track_name} artist: {artist_name}", type='track', limit=1)

In [3]:
b = sp.playlist("4ZuX2YvKAlym0a8VozqV1U")

In [3]:
ENDPOINT_CHARTS = "https://www.officialcharts.com/charts/singles-chart/%s/7501/"

def generate_dates(week_gap: int = 2, years_back: int = 50):

    today = datetime.now()
    start_date = today - timedelta(days=365*years_back)
    print(len([n for n in range(int((today-start_date).days//(week_gap*7)))]))
    
    for n in range(int((today - start_date).days//(week_gap*7))):
        yield start_date + timedelta(days=n*week_gap*7)

In [4]:
def retrieve_top_songs(date: str, container: SongsContainer) -> SongsContainer:
    try:
        response = requests.get(ENDPOINT_CHARTS % date)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all("div", {"class": "chart-item"})

        for item in items:
            try:
                song = item.find("a", {"class": "chart-name"}).find("span", {"class": None})
                artist = item.find("a", {"class": "chart-artist"}).find("span", {"class": None})
    
                container.add_song(SongInfo(artist.text, song.text))
            except:
                continue
        return container
        
    except Exception as exception:
        print(exception)

In [107]:
songs = SongsContainer()
for idx, single_date in enumerate(generate_dates(week_gap=1, years_back=60)):
    date = single_date.strftime("%Y%m%d")
    retrieve_top_songs(date, songs)
    if idx % 101 == 100:
        print(idx, date)

3128
100 19660615
201 19680522
302 19700429
403 19720405
504 19740313
605 19760218
706 19780125
807 19800102
908 19811209
1009 19831116
1110 19851023
1211 19870930
1312 19890906
1413 19910814
1514 19930721
1615 19950628
1716 19970604
1817 19990512
1918 20010418
2019 20030326
2120 20050302
2221 20070207
2322 20090114
2423 20101222
2524 20121128
2625 20141105
2726 20161012
2827 20180919
2928 20200826
3029 20220803


In [108]:
len(songs)

43220

In [109]:
#with open("songs69.csv", "w", encoding="utf-8") as file:
#    file.write(songs.get_csv())

In [6]:
#songs = SongsContainer()
#songs.from_csv("songs2.csv")

In [5]:
database = SongsDB()

In [6]:
database.songs_populate_csv("songs2.csv")

In [6]:
for idx, title, artist in database.get_scraped_songs()[:5]:
    print(title, artist)

HOUSE OF THE RISING SUN THE ANIMALS
IT'S ALL OVER NOW THE ROLLING STONES
HOLD ME P J PROBY
IT'S OVER ROY ORBISON
RAMONA THE BACHELORS


In [12]:
database.close_connection()

In [7]:
def get_song_id(s: SongInfo) -> IDSongInfo:
    ext_info = sp.search(q=f"track: {s.song} artist: {s.artist}", type='track', limit=1)
    item = ext_info["tracks"]["items"][0]
    
    song_id = item["id"]
    album_id = item["album"]["id"]
    artist_id = item["artists"][0]["id"]
    title = item["name"]
    release_date = item["album"]["release_date"]
    featured = int(len(item["artists"]) > 1)
    popularity = item["popularity"]

    return IDSongInfo(song_id, album_id, artist_id, title, release_date, featured, popularity)

In [8]:
start = datetime.now()

skipped = 0

for idx, title, artist in database.get_scraped_songs()[24400:]:
    try:
        database.songs_insert(get_song_id(SongInfo(title, artist)))
        if idx % 50 == 0 and idx != 0:
            print(f"{idx}: cooldown 5s... TIME:{datetime.now()-start} SKIPPED: {skipped}")
            sleep(5)
    except DBException as exception:
        skipped += 1
        continue
    except Exception as gen_exception:
        print(gen_exception)
        sleep(10)

24450: cooldown 5s... TIME:0:00:17.342091 SKIPPED: 39
24500: cooldown 5s... TIME:0:00:40.385582 SKIPPED: 79
24650: cooldown 5s... TIME:0:01:36.293843 SKIPPED: 192
24850: cooldown 5s... TIME:0:02:49.205855 SKIPPED: 349
24950: cooldown 5s... TIME:0:03:27.405053 SKIPPED: 427
25000: cooldown 5s... TIME:0:03:50.034166 SKIPPED: 461
25100: cooldown 5s... TIME:0:04:28.863586 SKIPPED: 492
25150: cooldown 5s... TIME:0:04:52.000217 SKIPPED: 503
25200: cooldown 5s... TIME:0:05:14.106779 SKIPPED: 519
25250: cooldown 5s... TIME:0:05:37.031791 SKIPPED: 532
25300: cooldown 5s... TIME:0:05:59.165291 SKIPPED: 544
25400: cooldown 5s... TIME:0:06:40.137510 SKIPPED: 570
25450: cooldown 5s... TIME:0:07:03.065331 SKIPPED: 583
25500: cooldown 5s... TIME:0:07:28.264610 SKIPPED: 596
25550: cooldown 5s... TIME:0:07:53.641632 SKIPPED: 608
25850: cooldown 5s... TIME:0:09:48.652726 SKIPPED: 679
25950: cooldown 5s... TIME:0:10:31.560313 SKIPPED: 708
26000: cooldown 5s... TIME:0:10:55.938681 SKIPPED: 722
26050: coold

In [33]:
songs_id.save_to_csv("songs_test.csv")

In [10]:
playlists_names = [f"{year} hits" for year in range(1950, 2010, 10)]
playlists_names

['1950 hits', '1960 hits', '1970 hits', '1980 hits', '1990 hits', '2000 hits']

In [20]:
def get_playlists_songs(query: str):
    response = sp.search(q=query, type='playlist', limit=10)
    for it in res["playlists"]["items"]:
        name = it["name"].lower()
        if "polsk" not in name and "lat" not in name and "hity" not in name:
            id = it["id"]
            playlist = sp.playlist(id)
            
            
    return response

In [29]:
res = get_playlists_songs("1950 hits")
res

{'playlists': {'href': 'https://api.spotify.com/v1/search?query=1950+hits&type=playlist&offset=0&limit=10',
  'items': [{'collaborative': False,
    'description': 'The biggest songs of the 1950s. Cover: Elvis Presley',
    'external_urls': {'spotify': 'https://open.spotify.com/playlist/37i9dQZF1DWSV3Tk4GO2fq'},
    'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DWSV3Tk4GO2fq',
    'id': '37i9dQZF1DWSV3Tk4GO2fq',
    'images': [{'height': None,
      'url': 'https://i.scdn.co/image/ab67706f00000003f3fdf2c92b1c70814b31af06',
      'width': None}],
    'name': 'All Out 50s',
    'owner': {'display_name': 'Spotify',
     'external_urls': {'spotify': 'https://open.spotify.com/user/spotify'},
     'href': 'https://api.spotify.com/v1/users/spotify',
     'id': 'spotify',
     'type': 'user',
     'uri': 'spotify:user:spotify'},
    'primary_color': None,
    'public': None,
    'snapshot_id': 'MTcxNjU5Njc5OCwwMDAwMDAwMDM0N2M4NTZiZGQ4NmQzZjFkMmYzYzQzNDQ4MmY5ZGZk',
    'tracks': {'href

In [30]:
playlist = sp.playlist("37i9dQZF1DWSV3Tk4GO2fq")

In [43]:
for it in playlist["tracks"]["items"][:5]:
    it = it["track"]
    album_id = it["album"]["id"]
    artist_id = it["artists"][0]["id"]
    song_id = it["id"]
    popularity = it["popularity"]
    duration = it["duration_ms"]
    featured = int(len(it["artists"]) > 1)
    title = it["name"]
    release_date = it["album"]["release_date"]

    print(album_id, artist_id, song_id, popularity, duration, featured, title, release_date)

0C3t1htEDTFKcg7F2rNbek 43ZHCT0cAZBISjO8DG9PnE 4gphxUgq0JSFv2BCLhNDiE 66 146480 0 Jailhouse Rock 1958-03-21
2Qw0tZfNYrKgAqqyrwDUkY 2iSZ41LUUYVcz5At9Xuz7T 6Et6uBEIIDEforjX21VLZi 42 147320 0 Sh-Boom 1954-02-18
6U60FpmscwzTJjc9gmZcKl 6bR0cgMtkCVpm0I5yrDNzO 5Oc0vLGWdEWeCqIU8zyELt 55 181000 0 Hound Dog 1989-01-01
6mmv0gwumlFGWDGJXF4yEv 19eLuQmk9aCobbVDHc6eek 3HuJDcOWx0gE9Yng2uWY7K 64 196840 0 Dream A Little Dream Of Me 1968-01-01
16BF4fvjjxlzrdKrUl4k5F 602DnpaSXJB4b9DZrvxbDc 2CeqxyOZEyiL6pTDYZ9gPH 54 201333 0 Fever 1988-01-01


In [44]:
!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   dataset-creator.ipynb
	modified:   db_management/__pycache__/db.cpython-311.pyc
	modified:   db_management/db.py

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	db_management/data/
	db_management/other/

no changes added to commit (use "git add" and/or "git commit -a")


'vim' is not recognized as an internal or external command,
operable program or batch file.
