In [1]:
# Imports

import os
import time
from pathlib import Path
import pandas as pd
import json
import requests
import base64
from datetime import datetime, timedelta

# Setup

In [2]:
# Get absolute path
absPath = str(Path(os.path.abspath(os.getcwd())).absolute())
datasetsPath = absPath + "/datasets"

# Create dataset directory if not exists
if not os.path.exists(datasetsPath):
    os.mkdir(datasetsPath)

# Setup datasets paths
genresPath = datasetsPath + "/genres.csv"
tracksPath = datasetsPath + "/tracks.csv"
albumsPath = datasetsPath + "/albums.csv"
artistsPath = datasetsPath + "/artists.csv"
peoplePath = datasetsPath + "/people.csv"

## Get tracks

In [3]:
# Load Spotify Charts
trackCharts = pd.read_csv(os.path.join(datasetsPath, "spotifyCharts.csv"), sep=",")

# Drop NaN columns
trackCharts = trackCharts.dropna()

# Print track charts info
trackCharts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9791359 entries, 0 to 9806999
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   country     object 
 2   date        object 
 3   position    float64
 4   uri         object 
 5   track       object 
 6   title       object 
 7   artist      object 
dtypes: float64(1), int64(1), object(6)
memory usage: 672.3+ MB


In [4]:
# Get only uris
spotifyTrackLinks = pd.DataFrame(trackCharts["uri"].drop_duplicates())

# Print tracks uris info
spotifyTrackLinks.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 65459 entries, 0 to 9806983
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     65459 non-null  object
dtypes: object(1)
memory usage: 1022.8+ KB


In [5]:
trackCharts["date"].drop_duplicates()

0         05/11/2020
200       04/11/2020
400       03/11/2020
600       02/11/2020
800       01/11/2020
             ...    
279400    04/01/2017
279600    03/01/2017
279800    02/01/2017
280000    01/01/2017
456200    09/06/2018
Name: date, Length: 1401, dtype: object

# Un sample a Settimana

In [6]:
reducedTrackCharts = pd.DataFrame()

#Starting and final date in the csv
firstDateStr = '01/01/2017'
endDateStr = '05/11/2020'

#Initialize for the while
actualDateStr = firstDateStr
actualDateObj = datetime.strptime(actualDateStr, "%d/%m/%Y").date()
endDateObj = datetime.strptime(endDateStr, "%d/%m/%Y").date()

while(actualDateObj < endDateObj):
  
    reducedTrackCharts = pd.concat([reducedTrackCharts, trackCharts.loc[trackCharts['date'] == actualDateStr]])

    print(reducedTrackCharts)
   
    actualDateObj = actualDateObj + timedelta(days=7)
    actualDateStr = actualDateObj.strftime("%d/%m/%Y")


         Unnamed: 0 country        date  position  \
280000       280000  Global  01/01/2017       1.0   
280001       280001  Global  01/01/2017       2.0   
280002       280002  Global  01/01/2017       3.0   
280003       280003  Global  01/01/2017       4.0   
280004       280004  Global  01/01/2017       5.0   
...             ...     ...         ...       ...   
9806995     9806995  Taiwan  01/01/2017     196.0   
9806996     9806996  Taiwan  01/01/2017     197.0   
9806997     9806997  Taiwan  01/01/2017     198.0   
9806998     9806998  Taiwan  01/01/2017     199.0   
9806999     9806999  Taiwan  01/01/2017     200.0   

                                                       uri  \
280000   https://open.spotify.com/track/5aAx2yezTd8zXrk...   
280001   https://open.spotify.com/track/7BKLCZ1jbUBVqRi...   
280002   https://open.spotify.com/track/4pdPtRcBmOSQDlJ...   
280003   https://open.spotify.com/track/5knuzwU65gJK7IF...   
280004   https://open.spotify.com/track/1xznGGDReH1oQ

In [7]:
# Get only uris
spotifyTrackLinks = pd.DataFrame(reducedTrackCharts["uri"].drop_duplicates())

# Print tracks uris info
spotifyTrackLinks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45015 entries, 280000 to 9527779
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     45015 non-null  object
dtypes: object(1)
memory usage: 703.4+ KB


# Un solo sample al mese

In [8]:
reducedTrackCharts = pd.DataFrame()

#Starting and final date in the csv
firstDateStr = '01/01/2017'
endDateStr = '05/11/2020'

#Initialize for the while
actualDateStr = firstDateStr
actualDateObj = datetime.strptime(actualDateStr, "%d/%m/%Y").date()
endDateObj = datetime.strptime(endDateStr, "%d/%m/%Y").date()

while(actualDateObj < endDateObj):
  
    reducedTrackCharts = pd.concat([reducedTrackCharts, trackCharts.loc[trackCharts['date'] == actualDateStr]])

    print(reducedTrackCharts)
   
    actualDateObj = actualDateObj + timedelta(days=30)
    actualDateStr = actualDateObj.strftime("%d/%m/%Y")


         Unnamed: 0 country        date  position  \
280000       280000  Global  01/01/2017       1.0   
280001       280001  Global  01/01/2017       2.0   
280002       280002  Global  01/01/2017       3.0   
280003       280003  Global  01/01/2017       4.0   
280004       280004  Global  01/01/2017       5.0   
...             ...     ...         ...       ...   
9806995     9806995  Taiwan  01/01/2017     196.0   
9806996     9806996  Taiwan  01/01/2017     197.0   
9806997     9806997  Taiwan  01/01/2017     198.0   
9806998     9806998  Taiwan  01/01/2017     199.0   
9806999     9806999  Taiwan  01/01/2017     200.0   

                                                       uri  \
280000   https://open.spotify.com/track/5aAx2yezTd8zXrk...   
280001   https://open.spotify.com/track/7BKLCZ1jbUBVqRi...   
280002   https://open.spotify.com/track/4pdPtRcBmOSQDlJ...   
280003   https://open.spotify.com/track/5knuzwU65gJK7IF...   
280004   https://open.spotify.com/track/1xznGGDReH1oQ

In [9]:
reducedTrackCharts.info()

# Get only uris
spotifyTrackLinks = pd.DataFrame(reducedTrackCharts["uri"].drop_duplicates())

# Print tracks uris info
spotifyTrackLinks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 321166 entries, 280000 to 9531799
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  321166 non-null  int64  
 1   country     321166 non-null  object 
 2   date        321166 non-null  object 
 3   position    321166 non-null  float64
 4   uri         321166 non-null  object 
 5   track       321166 non-null  object 
 6   title       321166 non-null  object 
 7   artist      321166 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 22.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30205 entries, 280000 to 9531796
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     30205 non-null  object
dtypes: object(1)
memory usage: 472.0+ KB


# Un sample a settimana ma solo top 50

In [10]:
reducedTrackCharts = pd.DataFrame()

#Starting and final date in the csv
firstDateStr = '01/01/2017'
endDateStr = '05/11/2020'

#Initialize for the while
actualDateStr = firstDateStr
actualDateObj = datetime.strptime(actualDateStr, "%d/%m/%Y").date()
endDateObj = datetime.strptime(endDateStr, "%d/%m/%Y").date()

while(actualDateObj < endDateObj):
  
    reducedTrackCharts = pd.concat([reducedTrackCharts, trackCharts.loc[(trackCharts['date'] == actualDateStr) & (trackCharts['position']<= 50)]])

    print(reducedTrackCharts)
   
    actualDateObj = actualDateObj + timedelta(days=7)
    
    actualDateStr = actualDateObj.strftime("%d/%m/%Y")

         Unnamed: 0 country        date  position  \
280000       280000  Global  01/01/2017       1.0   
280001       280001  Global  01/01/2017       2.0   
280002       280002  Global  01/01/2017       3.0   
280003       280003  Global  01/01/2017       4.0   
280004       280004  Global  01/01/2017       5.0   
...             ...     ...         ...       ...   
9806845     9806845  Taiwan  01/01/2017      46.0   
9806846     9806846  Taiwan  01/01/2017      47.0   
9806847     9806847  Taiwan  01/01/2017      48.0   
9806848     9806848  Taiwan  01/01/2017      49.0   
9806849     9806849  Taiwan  01/01/2017      50.0   

                                                       uri  \
280000   https://open.spotify.com/track/5aAx2yezTd8zXrk...   
280001   https://open.spotify.com/track/7BKLCZ1jbUBVqRi...   
280002   https://open.spotify.com/track/4pdPtRcBmOSQDlJ...   
280003   https://open.spotify.com/track/5knuzwU65gJK7IF...   
280004   https://open.spotify.com/track/1xznGGDReH1oQ

In [11]:
reducedTrackCharts.info()

# Get only uris
spotifyTrackLinks = pd.DataFrame(reducedTrackCharts["uri"].drop_duplicates())

# Print tracks uris info
spotifyTrackLinks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351091 entries, 280000 to 9527649
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  351091 non-null  int64  
 1   country     351091 non-null  object 
 2   date        351091 non-null  object 
 3   position    351091 non-null  float64
 4   uri         351091 non-null  object 
 5   track       351091 non-null  object 
 6   title       351091 non-null  object 
 7   artist      351091 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 24.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15451 entries, 280000 to 9527649
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     15451 non-null  object
dtypes: object(1)
memory usage: 241.4+ KB


# Un sample a settimana ma top 100

In [12]:
reducedTrackCharts = pd.DataFrame()

#Starting and final date in the csv
firstDateStr = '01/01/2017'
endDateStr = '05/11/2020'

#Initialize for the while
actualDateStr = firstDateStr
actualDateObj = datetime.strptime(actualDateStr, "%d/%m/%Y").date()
endDateObj = datetime.strptime(endDateStr, "%d/%m/%Y").date()

while(actualDateObj < endDateObj):
  
    reducedTrackCharts = pd.concat([reducedTrackCharts, trackCharts.loc[(trackCharts['date'] == actualDateStr) & (trackCharts['position']<= 100)]])

    print(reducedTrackCharts)
   
    actualDateObj = actualDateObj + timedelta(days=7)
    
    actualDateStr = actualDateObj.strftime("%d/%m/%Y")

         Unnamed: 0 country        date  position  \
280000       280000  Global  01/01/2017       1.0   
280001       280001  Global  01/01/2017       2.0   
280002       280002  Global  01/01/2017       3.0   
280003       280003  Global  01/01/2017       4.0   
280004       280004  Global  01/01/2017       5.0   
...             ...     ...         ...       ...   
9806895     9806895  Taiwan  01/01/2017      96.0   
9806896     9806896  Taiwan  01/01/2017      97.0   
9806897     9806897  Taiwan  01/01/2017      98.0   
9806898     9806898  Taiwan  01/01/2017      99.0   
9806899     9806899  Taiwan  01/01/2017     100.0   

                                                       uri  \
280000   https://open.spotify.com/track/5aAx2yezTd8zXrk...   
280001   https://open.spotify.com/track/7BKLCZ1jbUBVqRi...   
280002   https://open.spotify.com/track/4pdPtRcBmOSQDlJ...   
280003   https://open.spotify.com/track/5knuzwU65gJK7IF...   
280004   https://open.spotify.com/track/1xznGGDReH1oQ

In [13]:
reducedTrackCharts.info()

# Get only uris
spotifyTrackLinks = pd.DataFrame(reducedTrackCharts["uri"].drop_duplicates())

# Print tracks uris info
spotifyTrackLinks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 702182 entries, 280000 to 9527699
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  702182 non-null  int64  
 1   country     702182 non-null  object 
 2   date        702182 non-null  object 
 3   position    702182 non-null  float64
 4   uri         702182 non-null  object 
 5   track       702182 non-null  object 
 6   title       702182 non-null  object 
 7   artist      702182 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 48.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26809 entries, 280000 to 9527699
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     26809 non-null  object
dtypes: object(1)
memory usage: 418.9+ KB


In [14]:
reducedTrackCharts.info()

reducedTrackChartsPath = datasetsPath + "/reducedSpotifyCharts.csv"

reducedTrackCharts.to_csv(reducedTrackChartsPath)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 702182 entries, 280000 to 9527699
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  702182 non-null  int64  
 1   country     702182 non-null  object 
 2   date        702182 non-null  object 
 3   position    702182 non-null  float64
 4   uri         702182 non-null  object 
 5   track       702182 non-null  object 
 6   title       702182 non-null  object 
 7   artist      702182 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 48.2+ MB
