# Data Pre-Processing

## Imports

In [None]:
import pandas as pd
import numpy as np

## Dataset Import

In [None]:
# The ID-space tables (mappings to the streaming services' own ids) are
# deliberately not loaded because nothing below uses them:
#   download/track_id_space.csv
#   download/album_id_space.csv
#   download/artist_id_space.csv


def _read_raw(path):
    """Load one raw, semicolon-separated CSV export into a DataFrame."""
    return pd.read_csv(path, sep=';')


# Tracks, their link tables to albums/artists, and track-to-track similarity
df_track = _read_raw('download/track.csv')
df_album_track = _read_raw('download/album_track.csv')
df_artist_track = _read_raw('download/artist_track.csv')
df_track_sim = _read_raw('download/track_similarity.csv')

# Albums, the artist/album link table, and artists
df_album = _read_raw('download/album.csv')
df_artist_album = _read_raw('download/artist_album.csv')
df_artist = _read_raw('download/artist.csv')

# Chart ratings, one table per service
df_lastfm_rating = _read_raw('download/lastfm_rating.csv')
df_billboard_rating = _read_raw('download/billboard_rating.csv')
df_spotify_rating = _read_raw('download/spotify_rating.csv')

## Track

In [None]:
# Shapes before the join, for comparison with the result printed below
for frame in (df_track, df_album_track):
    print(frame.shape)

# Left-join the album link table onto the tracks, then discard 'position'
# (the track's position within the album), which is not needed downstream.
track_with_album = df_track.merge(df_album_track, how='left', on='track_id')
df_track = track_with_album.drop(columns=['position'])

print(df_track.shape)

df_track.head()

In [None]:
# Shapes of the two frames about to be joined, for comparison with the result
print(df_track.shape)
print(df_artist_track.shape)  # the frame actually merged below (was df_album)

# Left-join the artist link table so every track row gains an artist_id column
# NOTE(review): presumably a track linked to several artists yields several rows - confirm
df_track = pd.merge(df_track, df_artist_track, on='track_id', how='left')

# Call the track's title column "name" instead of "title"
df_track = df_track.rename(columns={'title': 'name'})

print(df_track.shape)

df_track.head()

In [None]:
# Persist the enriched track table as a comma-separated file without the index
df_track.to_csv('data/track.csv', sep=',', index=False)

## Album

In [None]:
# Shapes before the join, for comparison with the result printed below
for frame in (df_album, df_artist_album):
    print(frame.shape)

# Attach the artist link to each album. This uses pandas' default inner join,
# so only albums that have a row in df_artist_album are kept.
df_album = df_album.merge(df_artist_album, on='album_id')

print(df_album.shape)

df_album.head()

In [None]:
# Persist the album table as a comma-separated file without the index
df_album.to_csv('data/album.csv', sep=',', index=False)

## Artist

In [None]:
print(df_artist.shape)

# The image URL column is not used by the rest of the pipeline
df_artist = df_artist.drop(columns='image_url')

df_artist.head()

In [None]:
def get_all_tracks_by_artist(artist_id, only_id=False):
    """Return the rows of ``df_track`` whose ``artist_id`` equals *artist_id*.

    When *only_id* is true, only the ``track_id`` column of those rows is
    returned (as a Series) instead of the full rows.
    """
    artist_tracks = df_track[df_track['artist_id'] == artist_id]
    if only_id:
        return artist_tracks['track_id']
    return artist_tracks


def get_all_albums_by_artist(artist_id, only_id=False):
    """Return the rows of ``df_album`` whose ``artist_id`` equals *artist_id*.

    When *only_id* is true, only the ``album_id`` column of those rows is
    returned (as a Series) instead of the full rows.
    """
    artist_albums = df_album[df_album['artist_id'] == artist_id]
    if only_id:
        return artist_albums['album_id']
    return artist_albums

# Add the new columns "all_tracks" and "all_albums" to the artist as a list of ids.
# The ids are grouped once per table and then looked up per artist, instead of
# re-scanning the whole track/album table for every single artist.
def _ids_by_artist(frame, id_col):
    """Map each artist_id present in *frame* to the list of its *id_col* values.

    Row order within each artist is preserved; rows whose artist_id is missing
    are ignored by the groupby (they could never match an artist anyway).
    """
    return frame.groupby('artist_id')[id_col].agg(list)


def _lists_for_artists(artist_ids, ids_by_artist):
    """Return one id list per entry of *artist_ids*; unknown artists get []."""
    looked_up = artist_ids.map(ids_by_artist)
    # Artists absent from the mapping come back as NaN; normalise to an empty list
    return looked_up.apply(lambda ids: ids if isinstance(ids, list) else [])


df_artist['all_tracks'] = _lists_for_artists(df_artist['artist_id'], _ids_by_artist(df_track, 'track_id'))
df_artist['all_albums'] = _lists_for_artists(df_artist['artist_id'], _ids_by_artist(df_album, 'album_id'))

df_artist.head()

In [None]:
# Persist the artist table as a comma-separated file without the index
df_artist.to_csv('data/artist.csv', sep=',', index=False)

## Similarity

In [None]:
# 'source' carries no information; the 'id' column is dropped along with it
df_track_sim = df_track_sim.drop(['source', 'id'], axis=1)

# Name the two sides of each similarity pair symmetrically:
# track_id -> track_id_1, similar_track_id -> track_id_2
pair_column_names = {'track_id': 'track_id_1', 'similar_track_id': 'track_id_2'}
df_track_sim = df_track_sim.rename(columns=pair_column_names)

df_track_sim.head()

In [None]:
# Persist the similarity pairs as a comma-separated file without the index
df_track_sim.to_csv('data/track_similarity.csv', sep=',', index=False)

## Rating

In [None]:
print(df_lastfm_rating.shape)

# Drop the 'id' column
df_lastfm_rating = df_lastfm_rating.drop(columns='id')

# Strip the ' 00:00:00' time suffix from the date strings, then parse them
lastfm_date_text = df_lastfm_rating['date_lastfm'].str.replace(' 00:00:00', '')
df_lastfm_rating['date_lastfm'] = pd.to_datetime(lastfm_date_text)

df_lastfm_rating.head()

In [None]:
print(df_billboard_rating.shape)

# Drop the 'id' column
df_billboard_rating = df_billboard_rating.drop(columns='id')

# Strip the ' 00:00:00' time suffix from the date strings, then parse them
billboard_date_text = df_billboard_rating['date_billboard'].str.replace(' 00:00:00', '')
df_billboard_rating['date_billboard'] = pd.to_datetime(billboard_date_text)

df_billboard_rating.head()

In [None]:
print(df_spotify_rating.shape)

# Drop the 'id' column
df_spotify_rating = df_spotify_rating.drop(columns='id')

# Strip the ' 00:00:00' time suffix from the date strings, then parse them
spotify_date_text = df_spotify_rating['date_spotify'].str.replace(' 00:00:00', '')
df_spotify_rating['date_spotify'] = pd.to_datetime(spotify_date_text)

df_spotify_rating.head()

In [None]:
# Spot check: every lastfm rating row recorded for track_id 428
lastfm_rows_428 = df_lastfm_rating.loc[df_lastfm_rating['track_id'] == 428]
print(lastfm_rows_428.shape)
lastfm_rows_428

In [None]:
# Spot check: every billboard rating row recorded for track_id 428
billboard_rows_428 = df_billboard_rating.loc[df_billboard_rating['track_id'] == 428]
print(billboard_rows_428.shape)
billboard_rows_428

In [None]:
# Spot check: every spotify rating row recorded for track_id 428
spotify_rows_428 = df_spotify_rating.loc[df_spotify_rating['track_id'] == 428]
print(spotify_rows_428.shape)
spotify_rows_428

## Epoch Feature

In [None]:
# Convert each rating date to whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second gives the same result
# whatever resolution the datetime column is stored in, whereas casting to
# int64 and dividing by 10**9 is only correct for nanosecond-resolution columns.
# NOTE: a missing date (NaT) yields NaN here instead of a meaningless integer.
_unix_epoch = pd.Timestamp('1970-01-01')
_one_second = pd.Timedelta(seconds=1)

df_lastfm_rating['time_epoch'] = (df_lastfm_rating['date_lastfm'] - _unix_epoch) // _one_second
df_billboard_rating['time_epoch'] = (df_billboard_rating['date_billboard'] - _unix_epoch) // _one_second
df_spotify_rating['time_epoch'] = (df_spotify_rating['date_spotify'] - _unix_epoch) // _one_second

In [None]:
# Earliest and latest rating date across the three services
rating_date_columns = (
    df_lastfm_rating['date_lastfm'],
    df_billboard_rating['date_billboard'],
    df_spotify_rating['date_spotify'],
)
min_date = min(column.min() for column in rating_date_columns)
max_date = max(column.max() for column in rating_date_columns)
print(min_date, max_date)

In [None]:
# Smallest and largest time_epoch value across the three services
rating_epoch_columns = (
    df_lastfm_rating['time_epoch'],
    df_billboard_rating['time_epoch'],
    df_spotify_rating['time_epoch'],
)
min_epoch = min(column.min() for column in rating_epoch_columns)
max_epoch = max(column.max() for column in rating_epoch_columns)

print(min_epoch, max_epoch)

In [None]:
# Replace the per-second epochs by a weekly index
week = 7 * 24 * 60 * 60  # seconds in one week


def _to_week_index(epoch_seconds):
    """Turn epoch seconds into a week number, returned as float.

    The value is floored to a multiple of ``week``, shifted back by 259200 s
    (3 * 24 * 60 * 60, i.e. three days) and by 1072656000 s, then divided by
    ``week``. The two shifts add up to exactly 1774 weeks, so the result is
    ``floor(epoch_seconds / week) - 1774``; it is a float because of the
    true division.
    """
    floored_to_week = (epoch_seconds // week) * week
    return ((floored_to_week - 259200) - 1072656000) / week


for ratings in (df_lastfm_rating, df_billboard_rating, df_spotify_rating):
    ratings['time_epoch'] = _to_week_index(ratings['time_epoch'])

# Show the first two weekly values of two services as a sanity check
print(df_lastfm_rating['time_epoch'].head(2))
print(df_billboard_rating['time_epoch'].head(2))

In [None]:
# Distinct track ids and distinct weekly epochs seen in any of the three rating tables
rating_tables = [df_lastfm_rating, df_billboard_rating, df_spotify_rating]
unique_track_ids = pd.concat([table['track_id'] for table in rating_tables]).unique()
unique_time_epochs = pd.concat([table['time_epoch'] for table in rating_tables]).unique()

# Every (track_id, time_epoch) pair, sorted by track_id and then by time_epoch
multi_index = pd.MultiIndex.from_product(
    [unique_track_ids, unique_time_epochs],
    names=['track_id', 'time_epoch'],
).sort_values()

# Turn the index levels into ordinary columns
df_combinations = pd.DataFrame(index=multi_index).reset_index()

print(df_combinations.shape)
df_combinations.head()

In [None]:
# Left-join each service's weekly values onto the full (track_id, time_epoch) grid
join_keys = ['track_id', 'time_epoch']
weekly_sources = [
    (df_lastfm_rating, ['position_lastfm', 'no_of_listeners_lastfm']),
    (df_spotify_rating, ['position_spotify', 'no_of_listeners_spotify']),
    (df_billboard_rating, ['position_billboard']),
]

df_combined = df_combinations
for ratings, value_columns in weekly_sources:
    df_combined = df_combined.merge(ratings[join_keys + value_columns], on=join_keys, how='left')
print(df_combined.shape)

# Keep only the grid rows for which at least one service reports a position
position_columns = ['position_lastfm', 'position_spotify', 'position_billboard']
df_combined = df_combined.dropna(subset=position_columns, how='all')

print(df_combined.shape)
df_combined.head()


In [None]:
# Persist the weekly rating table as a comma-separated file without the index
df_combined.to_csv('data/weekly_rating.csv', sep=',', index=False)

In [None]:
# Daily calendar covering the fixed range 2004-01-01 .. 2015-02-01 (inclusive)
start_date = pd.to_datetime("2004-01-01")
end_date = pd.to_datetime("2015-02-01")
all_dates = pd.date_range(start=start_date, end=end_date)

print('Number of unique dates:', len(all_dates))
print('First date:', min(all_dates))
print('Last date:', max(all_dates))

# One row per (track id, date) pair is built below
ids = df_track['track_id'].unique()  # distinct track ids
dates = all_dates  # distinct dates
print('Number of unique track ids:', len(ids))

# Each track id is repeated once per date while the calendar is tiled once per
# track id, so the two columns together enumerate every combination.
date_df = pd.DataFrame({
    'track_id': np.repeat(ids, len(dates)),
    'date': np.tile(dates, len(ids)),
})
date_df['date'] = pd.to_datetime(date_df['date'])
print(date_df.shape)
date_df.head()


In [None]:
def align_service_data(service_df, date_df, date_col):
    """Left-join one service's rating rows onto the (date, track_id) grid.

    Args:
        service_df: Rating table with a ``track_id`` column and a date column
            named *date_col*. It is not modified.
        date_df: Grid with ``date`` and ``track_id`` columns; all of its rows
            are kept.
        date_col: Name of the date column in *service_df*.

    Returns:
        *date_df* with the remaining columns of *service_df* attached where
        both ``date`` and ``track_id`` match. Non-key columns present on both
        sides receive pandas' default ``_x``/``_y`` suffixes.
    """
    # Parse the dates on a copy so the caller's DataFrame is left untouched
    parsed = service_df.assign(**{date_col: pd.to_datetime(service_df[date_col])})
    aligned = (
        parsed.dropna(subset=[date_col])  # rows without a usable date cannot be joined
        .rename(columns={date_col: 'date'})
        .sort_values(by='date')
        .reset_index(drop=True)
    )
    return pd.merge(date_df, aligned, on=['date', 'track_id'], how='left')

# Attach each service's daily values to the (track_id, date) grid, one service at a time
daily_sources = [
    (df_lastfm_rating, 'date_lastfm'),
    (df_spotify_rating, 'date_spotify'),
    (df_billboard_rating, 'date_billboard'),
]
for ratings, date_column in daily_sources:
    date_df = align_service_data(ratings, date_df, date_column)
print(date_df.shape)

# Discard grid rows for which none of the three services reports a position
position_columns = ['position_lastfm', 'position_spotify', 'position_billboard']
date_df = date_df.dropna(subset=position_columns, how='all')
print(date_df.shape)

date_df.head()

In [None]:
# Persist the daily rating table as a comma-separated file without the index
date_df.to_csv('data/rating.csv', sep=',', index=False)