# Loading Data and Getting Spotify Features

### Setup

In [1]:
import sys
sys.path.append('..')
import os
import pandas as pd
from tqdm.notebook import tqdm
from src.utils.spotify import Spotify
from src.utils.file_utils import merge_csv, create_dirs_if_not_exist, load_json

Initializing Spotify object

In [15]:
# Shared Spotify helper (imported from src/utils/spotify.py); the cells below use it to look up track info.
# NOTE(review): presumably it picks up API credentials from its own configuration — confirm in src/utils/spotify.py.
spot = Spotify()

Testing the Spotify API on a single track

In [16]:
# Smoke test: look up one known track by its share URL before processing the whole dataset.
test_song = 'https://open.spotify.com/track/7CRPXJt3IVC7V0VcM0PSQi?si=79e0c2a7a20d49eb'

# The returned dict holds track/artist fields together with the track's audio features (see the printed output).
info = spot.get_track_info(test_song)
print(info)

{'track_name': 'Diddy Bop', 'track_pop': 58, 'artist': 'Noname', 'artist_pop': 55, 'album': 'Telefone', 'length': 208000, 'track_uri': 'spotify:track:7CRPXJt3IVC7V0VcM0PSQi', 'danceability': 0.637, 'energy': 0.503, 'key': 1, 'loudness': -6.927, 'mode': 1, 'speechiness': 0.336, 'acousticness': 0.713, 'instrumentalness': 0.000113, 'liveness': 0.109, 'valence': 0.661, 'tempo': 81.061, 'type': 'audio_features', 'id': '7CRPXJt3IVC7V0VcM0PSQi', 'uri': 'spotify:track:7CRPXJt3IVC7V0VcM0PSQi', 'track_href': 'https://api.spotify.com/v1/tracks/7CRPXJt3IVC7V0VcM0PSQi', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7CRPXJt3IVC7V0VcM0PSQi', 'duration_ms': 208000, 'time_signature': 4}


Defining data paths

In [2]:
# directory of the original Chartex JSON pages
JSON_PATH = '../data/chartex_json'

# directory of the per-page CSV files
CSV_PATH = '../data/chartex_csv'

# file path of the merged raw data (all CSV pages combined)
MERGED_PATH = '../data/csv_files/chartex_merged.csv'

# file path of the data with only the relevant columns
CLEAN_PATH = '../data/csv_files/chartex_clean.csv'

# file path of the data with duplicate songs merged
FINAL_PATH = '../data/csv_files/chartex_final.csv'

### Getting metadata from Spotify for all tracks and saving it to CSV

In [None]:
def chartex_to_csv(page_path, out_path):
    '''Add Spotify data to one page of Chartex data and save the result to csv.

    Uses the notebook-level `spot` object for the Spotify lookups.

    Args:
        page_path: path of a Chartex JSON page; its 'results' entry is the
            list of per-track dicts that gets processed.
        out_path: path of the csv file to write.

    Raises:
        AssertionError: if the loaded page data is empty/falsy.
    '''

    # ensure out path exists
    create_dirs_if_not_exist(out_path)

    # load page data
    page_data = load_json(page_path)
    assert page_data, f'no data loaded from {page_path}'

    # get spotify metadata for each track
    tracks = []
    for chartex_track_info in tqdm(page_data['results'], desc='tracks'):
        # entries without a Spotify id (key missing or value empty) cannot be looked up
        spotify_id = chartex_track_info.get('spotify_platform_id')
        if not spotify_id:
            continue
        track_info = spot.get_track_info(spotify_id)
        # Chartex fields are applied last, so they win on any key collision
        track_info.update(chartex_track_info)
        tracks.append(track_info)

    # save to csv
    raw_df = pd.DataFrame(tracks)
    raw_df.to_csv(out_path, index=False)

# getting pages; os.listdir returns names in arbitrary order, so sort them to keep
# the json file -> page number mapping the same on every run
json_files = sorted(file for file in os.listdir(JSON_PATH) if file.endswith('.json'))

# getting spotify data for all pages
for page, json_file in enumerate(json_files):
    json_path = os.path.join(JSON_PATH, json_file)
    csv_path = os.path.join(CSV_PATH, f'page{page}.csv')
    chartex_to_csv(json_path, csv_path)

# Combining all csv files into one
merge_csv(CSV_PATH, MERGED_PATH)

### Cleaning up the data


In [None]:
# columns that are not needed for the rest of the analysis
to_drop = [
    "type",
    "uri",
    "track_uri",
    "track_href",
    "rank",
    "youtube_platform_id",
    "tiktok_platform_id",
    "itunes_platform_id",
    "spotify_platform_id",
    "itunes_album_platform_id",
    "song_guid",
    "analysis_url",
    "length",
]

# load the merged raw data and strip those columns
merged_df = pd.read_csv(MERGED_PATH)
clean_df = merged_df.drop(columns=to_drop)
print(f'Total num rows: {len(clean_df)}')

# keep only the rows that have an id
clean_df = clean_df[clean_df['id'].notna()]
print(f'Num rows after dropping rows with no id: {len(clean_df)}')

# keep only the rows that have audio features ('valence' is used as the indicator column)
clean_df = clean_df[clean_df['valence'].notna()]
print(f'Num rows after dropping rows with no audio features: {len(clean_df)}')

# write the cleaned data to csv
clean_df.to_csv(CLEAN_PATH, index=False)

Merging duplicate tracks

In [None]:
# Load the clean CSV file into a DataFrame
clean_df = pd.read_csv(CLEAN_PATH)
print(f'The dataset has {clean_df.shape[0]} rows and {clean_df.shape[1]} columns.')

# Count every row whose track name occurs more than once (keep=False marks all copies)
duplicates = clean_df.duplicated(subset=['track_name'], keep=False)
num_duplicates = duplicates.sum()
print(f'There are {num_duplicates} duplicate tracks.')

# example of duplicate tracks
example_track = 'Laxed – Siren Beat'
example_duplicates = clean_df.loc[clean_df['track_name'] == example_track, ['track_name', 'number_of_videos']]
print(f'\nExample of duplicated tracks:\n{example_duplicates}\n')

# Merge rows that share a track name: sum 'number_of_videos' and keep the first value of every other column.
# NOTE(review): grouping on the name alone also merges different songs that happen to share a title,
# and groupby leaves out rows whose track_name is missing — confirm this is intended.
grouped_df = clean_df.groupby('track_name', as_index=False)
grouped_df = grouped_df.agg({col: 'first' if col != 'number_of_videos' else 'sum' for col in clean_df.columns})
# as_index=False already yields a fresh 0..n-1 index; calling reset_index() here would only
# add a meaningless 'index' column that ends up in the final csv
print(f'without duplicates the DataFrame has {grouped_df.shape[0]} rows.')

# example of merged track
example_merged = grouped_df.loc[grouped_df['track_name'] == example_track, ['track_name', 'number_of_videos']]
print(f'\nExample of merged tracks:\n{example_merged}\n')

# sort by popularity (most videos first)
sorted_df = grouped_df.sort_values(by="number_of_videos", ascending=False)

# save to csv
sorted_df.to_csv(FINAL_PATH, index=False)