In [None]:
# Imports

import os
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta

# Setup

In [None]:
# Absolute path of the current working directory; every dataset path below
# is derived from it
absPath = os.path.abspath(os.getcwd())
datasetsPath = os.path.join(absPath, "datasets")

# Create the dataset directory if it does not exist yet.
# exist_ok=True makes this a single call that is safe to re-run and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(datasetsPath, exist_ok=True)

# Dataset file paths: the full Spotify charts CSV and the reduced charts CSV
spotifyChartsPath = os.path.join(datasetsPath, "spotifyCharts.csv")
spotifyReducedChartsPath = os.path.join(datasetsPath, "reducedSpotifyCharts.csv")


## Open data file

In [None]:
# Read the Spotify charts CSV (its first column becomes the index) and
# discard every row that contains at least one NaN value
trackCharts = pd.read_csv(
    spotifyChartsPath,
    index_col=0,
    sep=",",
).dropna()

# Summarize the cleaned DataFrame (columns, non-null counts, dtypes)
trackCharts.info()


# Sample selector

In [None]:
def selectSamples(trackCharts, onlyFirst=-1, daysRange=7):
    """Down-sample the charts by keeping one chart every ``daysRange`` days.

    The date range is read from the first and the last row of ``trackCharts``.
    Sampling starts at the earlier of those two dates and advances in steps of
    ``daysRange`` days. The later date is an exclusive upper bound, so the
    chart of that day is never selected.

    Parameters
    ----------
    trackCharts : pandas.DataFrame
        Charts with a ``date`` column holding ``dd/mm/YYYY`` strings and, when
        ``onlyFirst`` is used, a ``position`` column. Rows are expected to be
        sorted by date (newest-first or oldest-first), because only the first
        and last rows are inspected to find the date range.
    onlyFirst : int, default -1
        Keep only the rows whose ``position`` is less than or equal to this
        value. For example, ``onlyFirst=50`` keeps the top 50 songs of every
        sampled day. Values <= 0 disable the filter.
    daysRange : int, default 7
        Number of days between two sampled charts. For example,
        ``daysRange=7`` selects one chart per week. Must be positive.

    Returns
    -------
    pandas.DataFrame
        The selected rows, grouped by sampled date in chronological order,
        with a fresh ``0..n-1`` index. When nothing is selected the result is
        empty but keeps the input columns.

    Raises
    ------
    ValueError
        If ``daysRange`` is not positive (the date would never advance).
    """
    dateFormat = "%d/%m/%Y"

    if daysRange <= 0:
        raise ValueError("daysRange must be a positive number of days")

    # Nothing to sample from: return an empty frame with the same columns
    if len(trackCharts) == 0:
        return trackCharts.iloc[0:0].reset_index(drop=True)

    # Boundary dates come from the first and last rows; sorting them makes the
    # function work for both newest-first and oldest-first files
    actualDate, endDate = sorted(
        datetime.strptime(trackCharts.iloc[row]["date"], dateFormat).date()
        for row in (0, -1)
    )

    # Apply the position filter once instead of once per sampled date
    candidates = trackCharts
    if onlyFirst > 0:
        candidates = trackCharts.loc[trackCharts["position"] <= onlyFirst]

    # Collect the per-date slices and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all rows every iteration
    sampledCharts = []
    # NOTE(review): the end date is excluded (strict "<"); kept as is because
    # existing outputs depend on it -- confirm whether "<=" was intended
    while actualDate < endDate:
        dayChart = candidates.loc[
            candidates["date"] == actualDate.strftime(dateFormat)
        ]
        # Skip days without a chart so pd.concat only sees non-empty frames
        if not dayChart.empty:
            sampledCharts.append(dayChart)
        actualDate += timedelta(days=daysRange)

    if not sampledCharts:
        return candidates.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampledCharts, ignore_index=True)


In [None]:
# Down-sample the charts: rows with position <= 100, sampled every 7 days
reducedTrackCharts = selectSamples(
    trackCharts,
    onlyFirst=100,
    daysRange=7,
)

# Summarize the reduced DataFrame
reducedTrackCharts.info()


In [None]:
# Write the reduced charts to their CSV file (the index is written as well)
reducedTrackCharts.to_csv(path_or_buf=spotifyReducedChartsPath)