# Fujii Kaze's tracks extraction and transformation

Extract track, analysis, feature, and album dataset
Transform and merge each dataset togeter
Clean and transform the final dataset

In [268]:
# import library
import pandas as pd
# call myFunctions.py
%run myFunctions.py
token = get_token()

## Get track data

### Extract [Spotify API]

In [2]:
# call function get_song() to extract track data of the artist via Spotify API
searched_name = "Fujii kaze"
songs = get_song(token, searched_name)

# store the retrieved data in the dataframe
columns = list(songs[0].keys())
rows = []
for row in songs:
    rows.append(list(row.values()))
df = pd.DataFrame(data=rows, columns=columns)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   album          50 non-null     object
 1   artists        50 non-null     object
 2   disc_number    50 non-null     int64 
 3   duration_ms    50 non-null     int64 
 4   explicit       50 non-null     bool  
 5   external_ids   50 non-null     object
 6   external_urls  50 non-null     object
 7   href           50 non-null     object
 8   id             50 non-null     object
 9   is_local       50 non-null     bool  
 10  is_playable    50 non-null     bool  
 11  name           50 non-null     object
 12  popularity     50 non-null     int64 
 13  preview_url    0 non-null      object
 14  track_number   50 non-null     int64 
 15  type           50 non-null     object
 16  uri            50 non-null     object
dtypes: bool(3), int64(4), object(10)
memory usage: 5.7+ KB


In [4]:
# explore each column's values
df.iloc[0]

album            {'album_type': 'single', 'artists': [{'externa...
artists          [{'external_urls': {'spotify': 'https://open.s...
disc_number                                                      1
duration_ms                                                 310997
explicit                                                     False
external_ids                              {'isrc': 'JPPO02400480'}
external_urls    {'spotify': 'https://open.spotify.com/track/4h...
href             https://api.spotify.com/v1/tracks/4hsEFcCA7AL5...
id                                          4hsEFcCA7AL5hlKG4PRp2Z
is_local                                                     False
is_playable                                                   True
name                                   Michi Teyu Ku (Overflowing)
popularity                                                      69
preview_url                                                   None
track_number                                                  

In [5]:
# explore the album object which is another entity
df["album"][0]

{'album_type': 'single',
 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6bDWAcdtVR3WHz2xtiIPUi'},
   'href': 'https://api.spotify.com/v1/artists/6bDWAcdtVR3WHz2xtiIPUi',
   'id': '6bDWAcdtVR3WHz2xtiIPUi',
   'name': 'Fujii Kaze',
   'type': 'artist',
   'uri': 'spotify:artist:6bDWAcdtVR3WHz2xtiIPUi'}],
 'external_urls': {'spotify': 'https://open.spotify.com/album/06qRSgIo9l4Gf7ACV6JZQq'},
 'href': 'https://api.spotify.com/v1/albums/06qRSgIo9l4Gf7ACV6JZQq',
 'id': '06qRSgIo9l4Gf7ACV6JZQq',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab67616d0000b27336f72e16774f67f6a4df173d',
   'width': 640},
  {'height': 300,
   'url': 'https://i.scdn.co/image/ab67616d00001e0236f72e16774f67f6a4df173d',
   'width': 300},
  {'height': 64,
   'url': 'https://i.scdn.co/image/ab67616d0000485136f72e16774f67f6a4df173d',
   'width': 64}],
 'is_playable': True,
 'name': 'Michi Teyu Ku (Overflowing)',
 'release_date': '2024-03-14',
 'release_date_precision': 'day',

In [6]:
# explore the artist object which is another entity
df["artists"][0]

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/6bDWAcdtVR3WHz2xtiIPUi'},
  'href': 'https://api.spotify.com/v1/artists/6bDWAcdtVR3WHz2xtiIPUi',
  'id': '6bDWAcdtVR3WHz2xtiIPUi',
  'name': 'Fujii Kaze',
  'type': 'artist',
  'uri': 'spotify:artist:6bDWAcdtVR3WHz2xtiIPUi'}]

### Transform

In [7]:
# copy data from df to track_data, so we don't have to recall API
track_data = df.copy()

In [8]:
# extract only the album_id of each track from the album object
# this album_id will be used to retrieve album data from Spotify API
# we don't used all attributes from album object here since it doesn't contain all attributes we want
albums_id = []
for i in range(len(track_data)):
    albums_id.append(track_data.iloc[i]["album"]["id"])
album_id_series = pd.Series(albums_id , name="album_id")
track_data["album_id"] = album_id_series


In [9]:
# this artist_id will be used to retrieve some data of the artist
artist_id = track_data.iloc[0]["artists"][0]["id"]
artist_id 

'6bDWAcdtVR3WHz2xtiIPUi'

In [10]:
# rename the id and name to track_id and track_name
track_data = track_data.rename(columns={"id": "track_id", "name": "track_name"})

**Selected columns [details of each are available in README.md file]**
- track_id	
- track_name
- href
- popularity	
- uri	
- album_id

In [11]:
# selected columns
track_data = track_data[["track_id", "track_name","href","popularity","uri", "album_id"]]

In [12]:
track_data.head()

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id
0,4hsEFcCA7AL5hlKG4PRp2Z,Michi Teyu Ku (Overflowing),https://api.spotify.com/v1/tracks/4hsEFcCA7AL5...,69,spotify:track:4hsEFcCA7AL5hlKG4PRp2Z,06qRSgIo9l4Gf7ACV6JZQq
1,0o9zmvc5f3EFApU52PPIyW,Shinunoga E-Wa,https://api.spotify.com/v1/tracks/0o9zmvc5f3EF...,78,spotify:track:0o9zmvc5f3EFApU52PPIyW,1OojCidx0eoPKch2M0Kz31
2,6HehZX0GOVt48CK3UEUoTg,まつり,https://api.spotify.com/v1/tracks/6HehZX0GOVt4...,46,spotify:track:6HehZX0GOVt48CK3UEUoTg,1DeciVpwShHj82dFZJiO0N
3,53GYyXuqqC1ZBLBvgXA1QW,Workin' Hard,https://api.spotify.com/v1/tracks/53GYyXuqqC1Z...,45,spotify:track:53GYyXuqqC1ZBLBvgXA1QW,1DeciVpwShHj82dFZJiO0N
4,3ShRJ5ZRMAqHPumvt9m7Cj,何なんw,https://api.spotify.com/v1/tracks/3ShRJ5ZRMAqH...,45,spotify:track:3ShRJ5ZRMAqHPumvt9m7Cj,1DeciVpwShHj82dFZJiO0N


In [13]:
# check duplications
track_data.duplicated().sum()

0

In [14]:
# check duplications in track name
track_data.duplicated(subset="track_name").sum()

4

**Note: We're aware that there are duplicated tracks (e.g. the same track from a single and an album or the same track recorded in different languages) Also, those tracks' popularity are rated independently.
However, this issue will be addressed later after we combine all datasets.**

## Get the track's audio analysis

### Extract [Spotify API]

In [15]:
# call function get_track_analysis() to extract track's audio analysis data
track_analysis = []
for track_id in track_data["track_id"]:
    ta = get_track_analysis(token, track_id)
    track_analysis.append(ta)

columns = list(track_analysis[0].keys())
rows = []
for row in track_analysis:
    rows.append(list(row.values()))

track_analysis_data = pd.DataFrame(data=rows, columns=columns)

In [16]:
track_analysis_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   num_samples                50 non-null     int64  
 1   duration                   50 non-null     float64
 2   sample_md5                 50 non-null     object 
 3   offset_seconds             50 non-null     int64  
 4   window_seconds             50 non-null     int64  
 5   analysis_sample_rate       50 non-null     int64  
 6   analysis_channels          50 non-null     int64  
 7   end_of_fade_in             50 non-null     float64
 8   start_of_fade_out          50 non-null     float64
 9   loudness                   50 non-null     float64
 10  tempo                      50 non-null     float64
 11  tempo_confidence           50 non-null     float64
 12  time_signature             50 non-null     int64  
 13  time_signature_confidence  50 non-null     float64
 

In [17]:
# explore each column's value
track_analysis_data.iloc[0]

num_samples                                                            6857486
duration                                                              310.9971
sample_md5                                                                    
offset_seconds                                                               0
window_seconds                                                               0
analysis_sample_rate                                                     22050
analysis_channels                                                            1
end_of_fade_in                                                         0.45306
start_of_fade_out                                                    306.22476
loudness                                                                 -7.91
tempo                                                                   143.99
tempo_confidence                                                         0.319
time_signature                                      

### Transform

**Selected columns [details of each are available in README.md file]**
- duration	
- loudness	
- tempo	
- time_signature	
- key	
- mode

In [18]:
# selected columns
track_analysis_data = track_analysis_data[["duration", "loudness", "tempo", "time_signature", "key", "mode"]]

In [19]:
track_analysis_data.head()

Unnamed: 0,duration,loudness,tempo,time_signature,key,mode
0,310.9971,-7.91,143.99,4,9,1
1,185.57333,-6.124,158.078,4,6,0
2,225.92,-7.335,97.025,4,1,1
3,239.07098,-6.106,90.0,4,2,0
4,320.64,-7.113,180.065,4,7,1


In [20]:
# check duplications 
track_analysis_data.duplicated().sum()

10

**Note: Again, we're aware that there are duplicated tracks, including their audio analysis.
However, this issue will be addressed later after we combine all datasets.**

In [21]:
# concat analysis data to the track data
track_data_concat = pd.concat([track_data, track_analysis_data] , axis=1)

## Get the track's audio features

### Extract [Spotify API]

In [22]:
# call function get_track_feature() to extract track's feature data
track_feature = []
for track_id in track_data_concat["track_id"]:
    ta = get_track_feature(token, track_id)
    track_feature.append(ta)

# store the retrived data in the dataframe
columns = list(track_feature[0].keys())
rows = []
for row in track_feature:
    rows.append(list(row.values()))

track_feature_data = pd.DataFrame(data=rows, columns=columns)

In [23]:
track_feature_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      50 non-null     float64
 1   energy            50 non-null     float64
 2   key               50 non-null     int64  
 3   loudness          50 non-null     float64
 4   mode              50 non-null     int64  
 5   speechiness       50 non-null     float64
 6   acousticness      50 non-null     float64
 7   instrumentalness  50 non-null     float64
 8   liveness          50 non-null     float64
 9   valence           50 non-null     float64
 10  tempo             50 non-null     float64
 11  type              50 non-null     object 
 12  id                50 non-null     object 
 13  uri               50 non-null     object 
 14  track_href        50 non-null     object 
 15  analysis_url      50 non-null     object 
 16  duration_ms       50 non-null     int64  
 17 

In [24]:
# explore each column's value
track_feature_data.iloc[0]

danceability                                                    0.565
energy                                                          0.591
key                                                                 9
loudness                                                        -7.91
mode                                                                1
speechiness                                                    0.0336
acousticness                                                    0.473
instrumentalness                                               0.0523
liveness                                                        0.107
valence                                                         0.429
tempo                                                          143.99
type                                                   audio_features
id                                             4hsEFcCA7AL5hlKG4PRp2Z
uri                              spotify:track:4hsEFcCA7AL5hlKG4PRp2Z
track_href          

### Transform

In [25]:
# rename the id to track_id
track_feature_data = track_feature_data.rename(columns={"id": "track_id"})

**Selected columns [details of each are available in README.md file]** 
- track_id
- acousticness	
- energy	
- danceability	

In [26]:
# selected columns
track_feature_data = track_feature_data[["track_id", "acousticness", "energy", "danceability"]]

In [27]:
track_feature_data.head()

Unnamed: 0,track_id,acousticness,energy,danceability
0,4hsEFcCA7AL5hlKG4PRp2Z,0.473,0.591,0.565
1,0o9zmvc5f3EFApU52PPIyW,0.166,0.76,0.6
2,6HehZX0GOVt48CK3UEUoTg,0.00892,0.731,0.663
3,53GYyXuqqC1ZBLBvgXA1QW,0.0336,0.664,0.74
4,3ShRJ5ZRMAqHPumvt9m7Cj,0.394,0.729,0.577


In [28]:
# check duplications
track_feature_data.duplicated().sum()

0

In [284]:
# merge track feature data with track and analysis data by track_id
track_data_merge = pd.merge(track_data_concat, track_feature_data, on="track_id")

## Get the track's album

### Extract [Spotify API]

In [285]:
# call function get_track_analysis() to extract track' album data using album_id
albums = []
for track_id in track_data_merge ["album_id"]:
    a = get_album(token, track_id)
    albums.append(a)

columns = list(albums[0].keys())
rows = []
for row in albums:
    rows.append(list(row.values()))

album_data = pd.DataFrame(data=rows, columns=columns)

In [286]:
album_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              50 non-null     object
 1   artists                 50 non-null     object
 2   available_markets       50 non-null     object
 3   copyrights              50 non-null     object
 4   external_ids            50 non-null     object
 5   external_urls           50 non-null     object
 6   genres                  50 non-null     object
 7   href                    50 non-null     object
 8   id                      50 non-null     object
 9   images                  50 non-null     object
 10  label                   50 non-null     object
 11  name                    50 non-null     object
 12  popularity              50 non-null     int64 
 13  release_date            50 non-null     object
 14  release_date_precision  50 non-null     object
 15  total_tr

In [287]:
# explore each column's value
album_data.iloc[0]

album_type                                                           single
artists                   [{'external_urls': {'spotify': 'https://open.s...
available_markets         [AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...
copyrights                [{'text': '© 2024 UNIVERSAL MUSIC LLC', 'type'...
external_ids                                      {'upc': '00602465244298'}
external_urls             {'spotify': 'https://open.spotify.com/album/06...
genres                                                                   []
href                      https://api.spotify.com/v1/albums/06qRSgIo9l4G...
id                                                   06qRSgIo9l4Gf7ACV6JZQq
images                    [{'height': 640, 'url': 'https://i.scdn.co/ima...
label                                                   Universal Music LLC
name                                            Michi Teyu Ku (Overflowing)
popularity                                                               57
release_date

### Transform

In [288]:
# rename the id, name, popularity, uri, and href to album_id, alum_name, album_popularity, album_uri, and albun_href
# prevent the ambiguity between track and album attributes
album_data = album_data.rename(columns={"id": "album_id", "name": "album_name", "popularity": "album_popularity", "uri": "album_uri", "href": "album_href"})

**Selected columns [details of each are available in README.md file]**
- album_id	
- album_name
- album_type
- album_popularity
- release_date	
- total_tracks	
- album_uri
- album_href

In [289]:
album_data = album_data[["album_id", "album_name", "album_type", "album_popularity", "release_date", "total_tracks", "album_uri", "album_href"]]

In [290]:
album_data.head()

Unnamed: 0,album_id,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
0,06qRSgIo9l4Gf7ACV6JZQq,Michi Teyu Ku (Overflowing),single,57,2024-03-14,1,spotify:album:06qRSgIo9l4Gf7ACV6JZQq,https://api.spotify.com/v1/albums/06qRSgIo9l4G...
1,1OojCidx0eoPKch2M0Kz31,HELP EVER HURT NEVER,album,67,2020-05-20,11,spotify:album:1OojCidx0eoPKch2M0Kz31,https://api.spotify.com/v1/albums/1OojCidx0eoP...
2,1DeciVpwShHj82dFZJiO0N,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
3,1DeciVpwShHj82dFZJiO0N,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
4,1DeciVpwShHj82dFZJiO0N,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...


In [291]:
# there must be the duplications in album as we retrieved the data by using track_id
# drop the duplications and recheck
album_data = album_data.drop_duplicates(subset=None)
album_data.duplicated().sum()

0

In [292]:
# check duplications in album name
album_data.duplicated(subset="album_name").sum()

1

In [293]:
# find the album name of duplcations
album_data["album_name"][album_data.duplicated(subset="album_name")]

40    Hana
Name: album_name, dtype: object

In [294]:
album_data[album_data["album_name"]=="Hana"]

Unnamed: 0,album_id,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
14,0v6vQ9tMopUTccn0wUvzQD,Hana,single,50,2023-10-12,1,spotify:album:0v6vQ9tMopUTccn0wUvzQD,https://api.spotify.com/v1/albums/0v6vQ9tMopUT...
40,3IGw7uRv2y5lTMlIe8PLeT,Hana,single,34,2023-11-02,4,spotify:album:3IGw7uRv2y5lTMlIe8PLeT,https://api.spotify.com/v1/albums/3IGw7uRv2y5l...


**Note: Both albums named 'Hana' have the album type 'single'. However, one contains 1 track, while the other contains 4 tracks. If we follow the album URI, the one with 4 tracks includes 'Hana', 'Hana - Instrumental', 'Hana - Ballad', and 'Hana - Demo', which are the same song in different versions.
Notably, the album popularities are different (33 vs. 50). This might cause errors during analysis. To prevent any potential issues, we will revisit this after combining with the track data.**

In [328]:
# merge album data with track data by album_id
track_album_data = pd.merge(track_data_merge, album_data, on="album_id")

## Transform final dataset

In [329]:
# inspect the number of tracks retreived from each album
track_album_data.groupby("album_name")["track_id"].count()

album_name
Best of Fujii Kaze 2020-2024     10
HELP EVER HURT COVER              3
HELP EVER HURT NEVER             11
Hana                              2
Kirari Remixes (Asia Edition)     2
LOVE ALL COVER ALL                6
LOVE ALL SERVE ALL               11
Michi Teyu Ku (Overflowing)       1
Workin' Hard                      1
grace                             1
へでもねーよ                            1
まつり                               1
Name: track_id, dtype: int64

### Explore album

In [330]:
track_album_data["album_name"][track_album_data["album_type"] == "album"].unique()

array(['HELP EVER HURT NEVER', 'LOVE ALL SERVE ALL',
       'Kirari Remixes (Asia Edition)', 'LOVE ALL COVER ALL',
       'HELP EVER HURT COVER'], dtype=object)

In [331]:
# check the actual total tracks of each album
# we can see that some album data are incomplete
album_data[["album_name","total_tracks"]][album_data["album_type"] == "album"]

Unnamed: 0,album_name,total_tracks
1,HELP EVER HURT NEVER,11
12,LOVE ALL SERVE ALL,11
13,Kirari Remixes (Asia Edition),9
35,LOVE ALL COVER ALL,11
36,HELP EVER HURT COVER,10


In [332]:
# drop all cover albums (the data retrieved is not complete as well)
track_album_data = track_album_data[track_album_data["album_name"]!="HELP EVER HURT COVER"]
track_album_data = track_album_data[track_album_data["album_name"]!="LOVE ALL COVER ALL"]

In [333]:
# we are aware that the track in album Kirari Remixes (Asia Edition) is incomplete due to the limitation of retrieving data
# however, we are going to use these 2 tracks (the original track and the most popular remix)
track_album_data[track_album_data["album_name"] == "Kirari Remixes (Asia Edition)"]

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
13,51oc6MEsXTpnPn6GOw5VuP,きらり,https://api.spotify.com/v1/tracks/51oc6MEsXTpn...,69,spotify:track:51oc6MEsXTpnPn6GOw5VuP,2OXwORzPU4tm1Skiv6l9KT,231.88295,-6.833,116.982,4,...,0.00803,0.764,0.746,Kirari Remixes (Asia Edition),album,58,2022-01-14,9,spotify:album:2OXwORzPU4tm1Skiv6l9KT,https://api.spotify.com/v1/albums/2OXwORzPU4tm...
41,1zzJfOdraIBJtCAatSYzCt,Kirari - Daul Remix,https://api.spotify.com/v1/tracks/1zzJfOdraIBJ...,50,spotify:track:1zzJfOdraIBJtCAatSYzCt,2OXwORzPU4tm1Skiv6l9KT,245.33043,-6.709,114.99,4,...,0.0511,0.684,0.827,Kirari Remixes (Asia Edition),album,58,2022-01-14,9,spotify:album:2OXwORzPU4tm1Skiv6l9KT,https://api.spotify.com/v1/albums/2OXwORzPU4tm...


In [334]:
# there's also the track named 'Kirari' in English from album LOVE ALL SERVE ALL
track_album_data[track_album_data["track_name"] == 'Kirari']

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
29,7nhXN2H8h3i2PaHPXb6FO3,Kirari,https://api.spotify.com/v1/tracks/7nhXN2H8h3i2...,50,spotify:track:7nhXN2H8h3i2PaHPXb6FO3,7Ip9X7pnkhJ4cwDoBnvneD,227.89333,-6.842,116.975,4,...,0.00704,0.773,0.752,LOVE ALL SERVE ALL,album,61,2022-03-23,11,spotify:album:7Ip9X7pnkhJ4cwDoBnvneD,https://api.spotify.com/v1/albums/7Ip9X7pnkhJ4...


In [335]:
# change the track named きらり to English and add 'Asia Edition' to avoid duplication in track name
track_album_data.loc[track_album_data["track_name"] == "きらり", "track_name"] = "Kirari (Asia Edition)"

### Explore compilation

In [336]:
track_album_data["album_name"][track_album_data["album_type"] == "compilation"].unique()

array(['Best of Fujii Kaze 2020-2024'], dtype=object)

In [337]:
# explore tracks in 'Best of Fujii Kaze 2020-2024' album
track_album_data[track_album_data["album_name"] == "Best of Fujii Kaze 2020-2024"].head()

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
2,6HehZX0GOVt48CK3UEUoTg,まつり,https://api.spotify.com/v1/tracks/6HehZX0GOVt4...,46,spotify:track:6HehZX0GOVt48CK3UEUoTg,1DeciVpwShHj82dFZJiO0N,225.92,-7.335,97.025,4,...,0.00892,0.731,0.663,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
3,53GYyXuqqC1ZBLBvgXA1QW,Workin' Hard,https://api.spotify.com/v1/tracks/53GYyXuqqC1Z...,45,spotify:track:53GYyXuqqC1ZBLBvgXA1QW,1DeciVpwShHj82dFZJiO0N,239.07098,-6.106,90.0,4,...,0.0336,0.664,0.74,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
4,3ShRJ5ZRMAqHPumvt9m7Cj,何なんw,https://api.spotify.com/v1/tracks/3ShRJ5ZRMAqH...,45,spotify:track:3ShRJ5ZRMAqHPumvt9m7Cj,1DeciVpwShHj82dFZJiO0N,320.64,-7.113,180.065,4,...,0.394,0.729,0.577,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
5,0oz4Oh0zx4XRZnS6Z1rr0M,Kirari (Asia Edition),https://api.spotify.com/v1/tracks/0oz4Oh0zx4XR...,44,spotify:track:0oz4Oh0zx4XRZnS6Z1rr0M,1DeciVpwShHj82dFZJiO0N,231.88295,-6.833,116.982,4,...,0.00803,0.764,0.746,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...
6,150mkwkCKc2GVDJ3UcIm0S,満ちてゆく,https://api.spotify.com/v1/tracks/150mkwkCKc2G...,43,spotify:track:150mkwkCKc2GVDJ3UcIm0S,1DeciVpwShHj82dFZJiO0N,310.9971,-7.91,143.99,4,...,0.473,0.591,0.565,Best of Fujii Kaze 2020-2024,compilation,49,2024-05-28,10,spotify:album:1DeciVpwShHj82dFZJiO0N,https://api.spotify.com/v1/albums/1DeciVpwShHj...


In [338]:
# the album "Best of Fujii Kaze 2020-2024" is a compilation that contains top tracks overtime. 
# they are the duplicated version of popular songs
# this would significantly affect the overall analysis, so we have decided to drop this album
track_album_data = track_album_data[track_album_data["album_name"] != "Best of Fujii Kaze 2020-2024"]

### Explore single

In [339]:
track_album_data["album_name"][track_album_data["album_type"] == "single"]

0     Michi Teyu Ku (Overflowing)
14                           Hana
18                   Workin' Hard
21                          grace
28                         へでもねーよ
40                           Hana
42                            まつり
Name: album_name, dtype: object

In [340]:
# revisit issue with Hana album
track_album_data[track_album_data["track_name"].str.contains("Hana")]

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
14,02tNuntKQsoou5T4O8meyh,Hana,https://api.spotify.com/v1/tracks/02tNuntKQsoo...,62,spotify:track:02tNuntKQsoou5T4O8meyh,0v6vQ9tMopUTccn0wUvzQD,246.69986,-8.203,123.031,4,...,0.118,0.716,0.769,Hana,single,50,2023-10-12,1,spotify:album:0v6vQ9tMopUTccn0wUvzQD,https://api.spotify.com/v1/albums/0v6vQ9tMopUT...
40,08ldhcNuoqot1mavblvKfc,Hana - Ballad,https://api.spotify.com/v1/tracks/08ldhcNuoqot...,41,spotify:track:08ldhcNuoqot1mavblvKfc,3IGw7uRv2y5lTMlIe8PLeT,247.55457,-10.588,122.993,4,...,0.887,0.324,0.684,Hana,single,34,2023-11-02,4,spotify:album:3IGw7uRv2y5lTMlIe8PLeT,https://api.spotify.com/v1/albums/3IGw7uRv2y5l...


In [309]:
# the Hana album should contain 4 tracks. However, only 1 track was retrieved, indicating that the data is incomplete. 
# for now, we will keep only the one with single total track for simplicity
track_album_data = track_album_data[track_album_data["track_name"]!="Hana - Ballad"]

In [310]:
# the track まつり is the same song as 'Matsuri' and they have similar analysis and features
track_album_data[track_album_data["track_name"].isin(["Matsuri", "まつり"])]

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
12,7AMGgAPFczs3wJgMqu6Eqi,Matsuri,https://api.spotify.com/v1/tracks/7AMGgAPFczs3...,68,spotify:track:7AMGgAPFczs3wJgMqu6Eqi,7Ip9X7pnkhJ4cwDoBnvneD,225.92,-7.335,97.025,4,...,0.00892,0.731,0.663,LOVE ALL SERVE ALL,album,61,2022-03-23,11,spotify:album:7Ip9X7pnkhJ4cwDoBnvneD,https://api.spotify.com/v1/albums/7Ip9X7pnkhJ4...
42,14QNjDBXGCwEzJZUKkiTem,まつり,https://api.spotify.com/v1/tracks/14QNjDBXGCwE...,51,spotify:track:14QNjDBXGCwEzJZUKkiTem,3Pwex5PyfxbrNTAwl6FGo5,225.92404,-7.307,97.021,4,...,0.00758,0.737,0.661,まつり,single,39,2022-03-20,1,spotify:album:3Pwex5PyfxbrNTAwl6FGo5,https://api.spotify.com/v1/albums/3Pwex5Pyfxbr...


In [311]:
# we drop まつり to avoid duplication and keep track named 'Matsuri' with high popularity and in the album
track_album_data = track_album_data[track_album_data["track_name"] != "まつり"]

In [312]:
# the track へでもねーよ is the same song as 'Hedemo Ne-Yo - LASA edit' but different version
# we keep both as their analysis and features are different
track_album_data[track_album_data["track_name"].isin(["Hedemo Ne-Yo - LASA edit", "へでもねーよ"])]

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
19,4Lrz9Fg8L1yncuCjjKjD4D,Hedemo Ne-Yo - LASA edit,https://api.spotify.com/v1/tracks/4Lrz9Fg8L1yn...,53,spotify:track:4Lrz9Fg8L1yncuCjjKjD4D,7Ip9X7pnkhJ4cwDoBnvneD,188.94667,-5.899,175.966,4,...,0.0615,0.861,0.506,LOVE ALL SERVE ALL,album,61,2022-03-23,11,spotify:album:7Ip9X7pnkhJ4cwDoBnvneD,https://api.spotify.com/v1/albums/7Ip9X7pnkhJ4...
28,1LjZnfncDtrn5YtbU4lQMf,へでもねーよ,https://api.spotify.com/v1/tracks/1LjZnfncDtrn...,50,spotify:track:1LjZnfncDtrn5YtbU4lQMf,7grlPzIHgZINOD2GhYBROR,190.75452,-6.206,175.978,4,...,0.0124,0.818,0.468,へでもねーよ,single,38,2020-10-30,1,spotify:album:7grlPzIHgZINOD2GhYBROR,https://api.spotify.com/v1/albums/7grlPzIHgZIN...


In [313]:
# change track name and album name to English
track_album_data.loc[track_album_data["track_name"] == "へでもねーよ", "track_name"] = "Hedemo Ne-Yo"
track_album_data.loc[track_album_data["album_name"] == "へでもねーよ", "album_name"] = "Hedemo Ne-Yo"

In [314]:
# there's no duplcation in track name anymore
track_album_data.duplicated(subset="track_name").sum()

0

In [319]:
track_album_data.head()

Unnamed: 0,track_id,track_name,href,popularity,uri,album_id,duration,loudness,tempo,time_signature,...,acousticness,energy,danceability,album_name,album_type,album_popularity,release_date,total_tracks,album_uri,album_href
0,4hsEFcCA7AL5hlKG4PRp2Z,Michi Teyu Ku (Overflowing),https://api.spotify.com/v1/tracks/4hsEFcCA7AL5...,69,spotify:track:4hsEFcCA7AL5hlKG4PRp2Z,06qRSgIo9l4Gf7ACV6JZQq,310.9971,-7.91,143.99,4,...,0.473,0.591,0.565,Michi Teyu Ku (Overflowing),single,57,2024-03-14,1,spotify:album:06qRSgIo9l4Gf7ACV6JZQq,https://api.spotify.com/v1/albums/06qRSgIo9l4G...
1,0o9zmvc5f3EFApU52PPIyW,Shinunoga E-Wa,https://api.spotify.com/v1/tracks/0o9zmvc5f3EF...,78,spotify:track:0o9zmvc5f3EFApU52PPIyW,1OojCidx0eoPKch2M0Kz31,185.57333,-6.124,158.078,4,...,0.166,0.76,0.6,HELP EVER HURT NEVER,album,67,2020-05-20,11,spotify:album:1OojCidx0eoPKch2M0Kz31,https://api.spotify.com/v1/albums/1OojCidx0eoP...
12,7AMGgAPFczs3wJgMqu6Eqi,Matsuri,https://api.spotify.com/v1/tracks/7AMGgAPFczs3...,68,spotify:track:7AMGgAPFczs3wJgMqu6Eqi,7Ip9X7pnkhJ4cwDoBnvneD,225.92,-7.335,97.025,4,...,0.00892,0.731,0.663,LOVE ALL SERVE ALL,album,61,2022-03-23,11,spotify:album:7Ip9X7pnkhJ4cwDoBnvneD,https://api.spotify.com/v1/albums/7Ip9X7pnkhJ4...
13,51oc6MEsXTpnPn6GOw5VuP,Kirari (Asia Edition),https://api.spotify.com/v1/tracks/51oc6MEsXTpn...,69,spotify:track:51oc6MEsXTpnPn6GOw5VuP,2OXwORzPU4tm1Skiv6l9KT,231.88295,-6.833,116.982,4,...,0.00803,0.764,0.746,Kirari Remixes (Asia Edition),album,58,2022-01-14,9,spotify:album:2OXwORzPU4tm1Skiv6l9KT,https://api.spotify.com/v1/albums/2OXwORzPU4tm...
14,02tNuntKQsoou5T4O8meyh,Hana,https://api.spotify.com/v1/tracks/02tNuntKQsoo...,62,spotify:track:02tNuntKQsoou5T4O8meyh,0v6vQ9tMopUTccn0wUvzQD,246.69986,-8.203,123.031,4,...,0.118,0.716,0.769,Hana,single,50,2023-10-12,1,spotify:album:0v6vQ9tMopUTccn0wUvzQD,https://api.spotify.com/v1/albums/0v6vQ9tMopUT...


In [320]:
track_album_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 0 to 41
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          29 non-null     object 
 1   track_name        29 non-null     object 
 2   href              29 non-null     object 
 3   popularity        29 non-null     int64  
 4   uri               29 non-null     object 
 5   album_id          29 non-null     object 
 6   duration          29 non-null     float64
 7   loudness          29 non-null     float64
 8   tempo             29 non-null     float64
 9   time_signature    29 non-null     int64  
 10  key               29 non-null     int64  
 11  mode              29 non-null     int64  
 12  acousticness      29 non-null     float64
 13  energy            29 non-null     float64
 14  danceability      29 non-null     float64
 15  album_name        29 non-null     object 
 16  album_type        29 non-null     object 
 17  albu

### Map data for visualization

In [321]:
track_data_visualize = track_album_data.copy()

In [322]:
# following the definition provided by Spotify 
Mode = {
    0 : "Minor",
    1 : "Major"
}

Key = {
    -1: "No Key",
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B"
}

In [323]:
# map mode and key to defined dictionary
track_data_visualize["mode"] = track_data_visualize["mode"].map(Mode)
track_data_visualize["key"] = track_data_visualize["key"].map(Key)

In [324]:
# map time_signature (i.e. 3 -> 3/4)
track_data_visualize["time_signature"] = track_data_visualize["time_signature"].apply(lambda x: int(x)).astype(str) + "/4"

In [325]:
track_data_visualize[["mode","key","time_signature"]].head()

Unnamed: 0,mode,key,time_signature
0,Major,A,4/4
1,Minor,F#/Gb,4/4
12,Major,C#/Db,4/4
13,Major,D,4/4
14,Minor,F,4/4


In [326]:
# download cleaned data as .csv file
track_data_visualize.to_csv(f"track_data_cleaned.csv")

In [327]:
import csv

with open("track_data_cleaned.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    data = [row for row in reader]
import json
import os

with open("track_data_cleaned.json", "w") as jsonfile:
    json.dump(data, jsonfile)

## Get artist's data

In [60]:
# call function get_artist() to extract artist data
artist = get_artist(token, artist_id)
artist

{'external_urls': {'spotify': 'https://open.spotify.com/artist/6bDWAcdtVR3WHz2xtiIPUi'},
 'followers': {'href': None, 'total': 3256829},
 'genres': ['japanese teen pop'],
 'href': 'https://api.spotify.com/v1/artists/6bDWAcdtVR3WHz2xtiIPUi',
 'id': '6bDWAcdtVR3WHz2xtiIPUi',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab6761610000e5ebc5a3e6e9027505f5cba5fdbc',
   'width': 640},
  {'height': 320,
   'url': 'https://i.scdn.co/image/ab67616100005174c5a3e6e9027505f5cba5fdbc',
   'width': 320},
  {'height': 160,
   'url': 'https://i.scdn.co/image/ab6761610000f178c5a3e6e9027505f5cba5fdbc',
   'width': 160}],
 'name': 'Fujii Kaze',
 'popularity': 71,
 'type': 'artist',
 'uri': 'spotify:artist:6bDWAcdtVR3WHz2xtiIPUi'}

In [61]:
artist_name = artist["name"]
followers = artist["followers"]["total"]
genres = artist["genres"][0]
artist_uri = artist["uri"]
artist_href = artist["href"]

In [62]:
artist_data = pd.DataFrame(data = [[artist_name, followers, genres, artist_uri, artist_href]], columns=["artist_name", "followers", " genres", "artist_uri","artist_href"])

In [63]:
artist_data

Unnamed: 0,artist_name,followers,genres,artist_uri,artist_href
0,Fujii Kaze,3256829,japanese teen pop,spotify:artist:6bDWAcdtVR3WHz2xtiIPUi,https://api.spotify.com/v1/artists/6bDWAcdtVR3...


In [64]:
# download artist data as .csv file
artist_data.to_csv(f"{	artist_name}.csv")