In [9]:
import sys
sys.path.append('..')

import os
from dotenv import load_dotenv
import pandas as pd
from tqdm.notebook import tqdm

from src.spotify import Spotify
import src.file_utils 

# Shared Spotify client; chartex_to_csv calls spot.get_track_info on it.
spot = Spotify()

In [4]:
# Directory holding the original Chartex pages, read below as page<N>.json.
DATAPATH = '../data/chartex'

# Directory receiving the processed per-page CSVs, written below as page<N>.csv.
OUTPATH = '../data/chartex_processed'

In [5]:
def chartex_to_csv(page_path, out_path):
    '''Add Spotify data to one page of Chartex data and save it as CSV.

    Loads the JSON page at ``page_path``, iterates its ``'results'`` list and,
    for every track that carries a Spotify ID, fetches the track info through
    ``spot.get_track_info``. The Chartex fields are then laid over the Spotify
    fields (Chartex values win on key clashes) and one row per track is
    written to ``out_path``.

    Args:
        page_path: path of the Chartex JSON page to read.
        out_path: path of the CSV file to write; a missing parent directory
            is created.

    Raises:
        ValueError: if the loaded page is empty/falsy.
    '''
    page_data = src.file_utils.load_json(page_path)
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not page_data:
        raise ValueError(f'no data loaded from {page_path}')

    tracks = []
    for chartex_track_info in tqdm(page_data['results'], desc='tracks'):
        # A missing key is treated like an empty ID: the track cannot be
        # looked up on Spotify, so it is skipped instead of raising KeyError.
        spotify_id = chartex_track_info.get('spotify_platform_id')
        if not spotify_id:
            continue
        track_info = spot.get_track_info(spotify_id)
        # Chartex fields overwrite Spotify fields that share the same name.
        track_info.update(chartex_track_info)
        tracks.append(track_info)

    # to_csv does not create directories; make sure the target one exists.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(tracks)
    df.to_csv(out_path, index=False)

# Getting Spotify data for all pages: each {DATAPATH}/page<N>.json is
# converted to {OUTPATH}/page<N>.csv by chartex_to_csv.
# NOTE(review): range(0) is empty, so this loop currently processes no pages
# (the recorded cell output shows 0 iterations). Presumably the count was
# zeroed to avoid re-querying Spotify on re-runs -- confirm, and set the real
# number of pages to regenerate the CSVs.
for page in tqdm(range(0), desc='pages'):
    page_path = f'{DATAPATH}/page{page}.json'
    out_path = f'{OUTPATH}/page{page}.csv'
    chartex_to_csv(page_path, out_path)


pages: 0it [00:00, ?it/s]

In [7]:
# Combining all CSV files under OUTPATH into one file.
# merged_path is read back by the clean-up step further down.
merged_path = '../data/chartex_merged.csv'
src.file_utils.merge_csv(OUTPATH, merged_path)

In [8]:
# Clean-up step: drop the identifier/URL columns that are not needed
# downstream, order the tracks by video count, and write the result.
clean_path = "../data/chartex_clean.csv"
to_drop = [
    "type",
    "uri",
    "track_uri",
    "track_href",
    "rank",
    "youtube_platform_id",
    "tiktok_platform_id",
    "itunes_platform_id",
    "spotify_platform_id",
    "itunes_album_platform_id",
    "song_guid",
    "analysis_url",
]

# Read the merged file, drop the columns, and sort with the highest
# number_of_videos first -- all in one chain.
df = (
    pd.read_csv(merged_path)
    .drop(columns=to_drop)
    .sort_values(by="number_of_videos", ascending=False)
)

# Persist the cleaned table without the index column.
df.to_csv(clean_path, index=False)