In [1]:
import sys; sys.path.append('..')
import os
from dotenv import load_dotenv
import pandas as pd
from tqdm.notebook import tqdm

# Load variables from a .env file, then change the working directory to
# SRC_DIR before the project's `src` imports and the relative data paths below.
# TODO: consider moving this setup into a config file.
load_dotenv()
src_dir = os.getenv('SRC_DIR')
# Fail with an actionable message when the variable is missing or empty,
# instead of a bare AssertionError with no explanation.
assert src_dir, "SRC_DIR is not set; define it in the environment or in a .env file"
os.chdir(src_dir)

from src.spotify import Spotify
import src.file_utils 

# Shared Spotify client; the cells below call spot.get_track_info() on it.
spot = Spotify()

In [2]:
# Smoke test of the Spotify client: look up a single track by its share URL
# and print whatever get_track_info returns for it.
test_song = 'https://open.spotify.com/track/7CRPXJt3IVC7V0VcM0PSQi?si=79e0c2a7a20d49eb'

info = spot.get_track_info(test_song)
print(info)


{'track_name': 'Diddy Bop', 'track_pop': 59, 'artist': 'Noname', 'artist_pop': 52, 'album': 'Telefone', 'length': 208000, 'track_uri': 'spotify:track:7CRPXJt3IVC7V0VcM0PSQi', 'danceability': 0.637, 'energy': 0.503, 'key': 1, 'loudness': -6.927, 'mode': 1, 'speechiness': 0.336, 'acousticness': 0.713, 'instrumentalness': 0.000113, 'liveness': 0.109, 'valence': 0.661, 'tempo': 81.061, 'type': 'audio_features', 'id': '7CRPXJt3IVC7V0VcM0PSQi', 'uri': 'spotify:track:7CRPXJt3IVC7V0VcM0PSQi', 'track_href': 'https://api.spotify.com/v1/tracks/7CRPXJt3IVC7V0VcM0PSQi', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7CRPXJt3IVC7V0VcM0PSQi', 'duration_ms': 208000, 'time_signature': 4}


In [5]:
# Both paths are relative, so they resolve against the working directory set
# by os.chdir(src_dir) in the setup cell.

# Directory holding the original Chartex data (read as page{N}.json files).
DATAPATH = '../data/chartex'

# Directory that receives the processed data (written as page{N}.csv files).
OUTPATH = '../data/chartex_processed'

In [3]:
def chartex_to_csv(page_path, out_path):
    '''Add Spotify data to one page of Chartex data and save the result as CSV.

    Args:
        page_path: path to a Chartex JSON page; its 'results' entry is the
            list of track records to enrich.
        out_path: destination CSV path. Missing parent directories are created.

    Records without a 'spotify_platform_id' (missing key or empty value) are
    skipped. For every other record the Spotify track info is merged with the
    Chartex record; on a key clash the Chartex value wins.

    Raises:
        AssertionError: if nothing could be loaded from page_path.
    '''
    page_data = src.file_utils.load_json(page_path)
    assert page_data, f'no data loaded from {page_path}'

    tracks = []
    for chartex_track_info in tqdm(page_data['results'], desc='tracks'):
        # .get() so a record lacking the key is skipped like one with an empty id
        spotify_id = chartex_track_info.get('spotify_platform_id')
        if not spotify_id:
            continue
        track_info = spot.get_track_info(spotify_id)
        # Build a new dict rather than mutating the one the client returned;
        # Chartex fields are applied last, so they override Spotify fields.
        tracks.append({**track_info, **chartex_track_info})

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame(tracks).to_csv(out_path, index=False)

# Fetch Spotify data for every Chartex page and write one CSV per page.
# N_PAGES is 0, so this loop currently does nothing (matching the "0it"
# progress output); presumably set that way to avoid re-fetching from the
# API -- TODO confirm, and set it to the real page count for a full run.
N_PAGES = 0
for page in tqdm(range(N_PAGES), desc='pages'):
    page_path = f'{DATAPATH}/page{page}.json'
    out_path = f'{OUTPATH}/page{page}.csv'
    chartex_to_csv(page_path, out_path)


pages: 0it [00:00, ?it/s]

In [6]:
# Combine the per-page CSV files from OUTPATH into a single CSV at merged_path
# (presumably every CSV in that directory -- confirm against
# src.file_utils.merge_csv).
merged_path = '../data/chartex_merged.csv'
src.file_utils.merge_csv(OUTPATH, merged_path)

In [7]:
# Produce the cleaned dataset: remove the columns listed in to_drop from the
# merged CSV, then order the rows by video count.
clean_path = "../data/chartex_clean.csv"
to_drop = [
    "type",
    "uri",
    "track_uri",
    "track_href",
    "rank",
    "youtube_platform_id",
    "tiktok_platform_id",
    "itunes_platform_id",
    "spotify_platform_id",
    "itunes_album_platform_id",
    "song_guid",
    "analysis_url",
]

# Read, drop, and sort in one chain; rows end up ordered by
# "number_of_videos" from highest to lowest.
df = (
    pd.read_csv(merged_path)
    .drop(columns=to_drop)
    .sort_values(by="number_of_videos", ascending=False)
)

# Write the result without the index column.
df.to_csv(clean_path, index=False)