# Data Pipeline


In [1]:
# import libraries
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os

## Loading data and merging

In [2]:
# Fetch the songs table (track_id, song_id, artist, title) straight from Google Drive.
songs_url = "https://drive.google.com/uc?id=1nwgfTeTY4MWJ__vKMjvLYUjIy-h63aYv"
songs = pd.read_csv(songs_url)
# Preview the first rows.
songs.head()

Unnamed: 0,track_id,song_id,artist,title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [3]:
# song_id is not unique in the raw table (the same song shows up under different
# title/artist versions); keep only the first row for each song_id.
# Reassigning instead of using inplace=True returns a new frame and keeps the
# call chainable.
songs = songs.drop_duplicates(subset=["song_id"])

In [4]:
# Fetch the user/song interaction table (user_id, song_id, freq) from Google Drive.
interactions_url = "https://drive.google.com/uc?id=1RfSGHmHLnZhsZZXVGskf9dVqw2Mu0eHn"
interactions = pd.read_csv(interactions_url)
# Preview the first rows.
interactions.head()


Unnamed: 0,user_id,song_id,freq
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


In [5]:
# Attach artist/title to every interaction by joining on song_id
# (default inner join); track_id is dropped before the join.
song_metadata = songs.drop(columns=["track_id"])
df = pd.merge(interactions, song_metadata, on="song_id")
df.head()


Unnamed: 0,user_id,song_id,freq,artist,title
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1,Dwight Yoakam,You're The One
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1,Cartola,Tive Sim
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1,Miguel Calo,El Cuatrero


In [6]:
# Report how many distinct users and songs the merged data contains.
n_users = df["user_id"].nunique()
n_songs = df["song_id"].nunique()
print(f"Number of unique users: {n_users}")
print(f"Number of unique songs: {n_songs}")


Number of unique users: 110000
Number of unique songs: 163206


In [7]:
# Make sure the output directory exists. exist_ok=True turns this into a no-op
# when the directory is already there and avoids the check-then-create race of
# testing os.path.exists() first.
os.makedirs("./data", exist_ok=True)


In [8]:
# Persist the merged interactions + song metadata (the row index is not written).
merged_path = "./data/merged_data.csv"
df.to_csv(merged_path, index=False)

## Filtering data

In [9]:
# Filtering thresholds, named so they can be tuned in one place.
TOP_N_SONGS = 2000  # how many of the most listened-to songs to keep
MIN_SONGS_PER_USER = 10  # minimum number of distinct kept songs per user

# Rank songs by the number of distinct users who interacted with them and keep
# only the TOP_N_SONGS highest-ranked ones.
song_user_counts = df.groupby("song_id")["user_id"].nunique()
top_songs = song_user_counts.nlargest(TOP_N_SONGS).index
df = df[df["song_id"].isin(top_songs)]

# Then keep users with >= MIN_SONGS_PER_USER distinct songs among the kept songs;
# that leaves more interactions per user and avoids the cold start problem.
songs_per_user = df.groupby("user_id")["song_id"].transform("nunique")
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
df = df[songs_per_user >= MIN_SONGS_PER_USER].copy()
df.head()

Unnamed: 0,user_id,song_id,freq,artist,title
234,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOMWTWK12AB01860CD,2,Vampire Weekend,White Sky
235,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOBKRVG12A8C133269,1,Vampire Weekend,I Stand Corrected (Album)
236,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOXFXDH12A8C13326E,1,Vampire Weekend,The Kids Dont Stand A Chance (Album)
237,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOAYETG12A67ADA751,1,MIKA,Grace Kelly
238,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOUFNSM12A58A77715,1,Vampire Weekend,Walcott (Album)


In [10]:
# Report how many distinct users and songs remain after filtering.
n_users = df["user_id"].nunique()
n_songs = df["song_id"].nunique()
print(f"Number of unique users: {n_users}")
print(f"Number of unique songs: {n_songs}")

Number of unique users: 11212
Number of unique songs: 2000


In [11]:
# Persist the filtered interactions (the row index is not written).
filtered_path = "./data/filtered_data.csv"
df.to_csv(filtered_path, index=False)

## Augmenting data with negative interactions

To reduce bias as much as possible, we will augment the data with negative interactions (user–song pairs the user has not interacted with).
We will sample negatives in proportion to item popularity, using popularity weights. This is not perfect, but it is a simple way to do it.


In [12]:
# Mark every observed (user, song) row as a positive interaction (1).
# assign() returns a new frame with the extra column, so this never writes into
# a filtered slice of an earlier frame (the situation that triggers pandas'
# SettingWithCopyWarning).
df = df.assign(interaction=1)
df

Unnamed: 0,user_id,song_id,freq,artist,title,interaction
234,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOMWTWK12AB01860CD,2,Vampire Weekend,White Sky,1
235,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOBKRVG12A8C133269,1,Vampire Weekend,I Stand Corrected (Album),1
236,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOXFXDH12A8C13326E,1,Vampire Weekend,The Kids Dont Stand A Chance (Album),1
237,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOAYETG12A67ADA751,1,MIKA,Grace Kelly,1
238,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOUFNSM12A58A77715,1,Vampire Weekend,Walcott (Album),1
...,...,...,...,...,...,...
1450856,07f2f354a2a4390624e0d04e4a555ef67e98f673,SODPPBT12A8C141D90,9,Kings Of Leon,Closer,1
1450864,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOEMRIK12A8C133722,4,Radiohead,Jigsaw Falling Into Place,1
1450868,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOUSOOB12A8C13371F,6,Radiohead,House Of Cards,1
1450869,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOWCKVR12A8C142411,4,Kings Of Leon,Use Somebody,1


In [13]:
# Popularity of a song = number of rows it appears in; dividing by the total
# number of rows turns the counts into weights that sum to 1.
song_popularity = df["song_id"].value_counts()
total_interactions = song_popularity.sum()
popularity_weights = song_popularity.div(total_interactions)
popularity_weights

song_id
SOFRQTD12A81C233C0    0.011206
SOAXGDH12A8C13F8A1    0.009705
SOAUWYT12A81C206F1    0.009539
SONYKOW12AB01849C9    0.009242
SOBONKR12A58A7A7E0    0.009076
                        ...   
SOKQYVG12A58A75445    0.000046
SOKTYZO12A8C13E5D9    0.000040
SOVSGXX12A58A7F991    0.000040
SOLXTPE12A8C13BCE2    0.000033
SOPCCWV12A670205F3    0.000020
Name: count, Length: 2000, dtype: float64

In [14]:
# ratio of negative to positive samples
negative_ratio = 3

# Seeded generator so the sampled negatives are identical on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Build (user_id, song_id, 0) tuples: for each user, songs they have NOT
# interacted with, drawn without replacement with probability proportional to
# song popularity.
negative_samples = []

# Group once instead of boolean-filtering the whole frame for every user.
# sort=False keeps users in order of first appearance, like df.user_id.unique().
songs_by_user = df.groupby("user_id", sort=False)["song_id"]

for user, user_songs in tqdm(songs_by_user, total=songs_by_user.ngroups):
    # distinct songs this user interacted with
    interacted_songs = user_songs.unique()

    # Candidate negatives are taken in the fixed order of popularity_weights
    # rather than from a Python set of strings, whose iteration order changes
    # between interpreter runs and would defeat the seed.
    is_candidate = ~popularity_weights.index.isin(interacted_songs)
    non_interacted_weights = popularity_weights[is_candidate]
    non_interacted_songs = non_interacted_weights.index.to_numpy()

    # Sampling without replacement cannot return more songs than there are
    # candidates, so cap the request instead of letting choice() raise.
    n_negatives = min(
        len(interacted_songs) * negative_ratio, len(non_interacted_songs)
    )
    if n_negatives == 0:
        continue

    # Renormalize so the candidate weights sum to 1 again.
    probabilities = non_interacted_weights.to_numpy()
    probabilities = probabilities / probabilities.sum()

    # sample negative songs based on popularity
    sampled_songs = rng.choice(
        non_interacted_songs,
        size=n_negatives,
        replace=False,
        p=probabilities,
    )
    negative_samples.extend((user, song, 0) for song in sampled_songs)

  0%|          | 0/11212 [00:00<?, ?it/s]

In [15]:
# Turn the list of (user_id, song_id, interaction) tuples into a DataFrame.
negative_columns = ["user_id", "song_id", "interaction"]
negative_samples_df = pd.DataFrame(data=negative_samples, columns=negative_columns)
negative_samples_df


Unnamed: 0,user_id,song_id,interaction
0,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOUSMXX12AB0185C24,0
1,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOMNGMO12A6702187E,0
2,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOSLQQJ12AB017BDCC,0
3,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOPAYPV12AB017DB0C,0
4,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SONYKOW12AB01849C9,0
...,...,...,...
453487,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOXELPB12A8AE46000,0
453488,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOQWYAQ12A6D4FB9A3,0
453489,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOKTYZO12A8C13E5D9,0
453490,07f2f354a2a4390624e0d04e4a555ef67e98f673,SOXEZLY12A8C137AB0,0


In [16]:
# Add artist and title to the negative samples (no year column is merged here).
# The metadata is one row per distinct (song_id, artist, title) found in df.
song_metadata = df[["song_id", "artist", "title"]].drop_duplicates()

# Start from the three base columns so that re-running this cell does not
# produce artist_x/artist_y style duplicate columns.
negative_samples_df = negative_samples_df[["user_id", "song_id", "interaction"]].merge(
    song_metadata,
    on="song_id",
    how="left",
    # Fail loudly if a song_id maps to several metadata rows, which would
    # otherwise silently multiply the negative samples.
    validate="many_to_one",
)
negative_samples_df.head()

Unnamed: 0,user_id,song_id,interaction,artist,title
0,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOUSMXX12AB0185C24,0,Usher featuring will.i.am,OMG
1,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOMNGMO12A6702187E,0,Gorillaz,DARE
2,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOSLQQJ12AB017BDCC,0,Passion Pit,Sleepyhead
3,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOPAYPV12AB017DB0C,0,Florence + The Machine,You've Got The Love
4,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SONYKOW12AB01849C9,0,OneRepublic,Secrets


In [17]:
# Stack the positive rows and the sampled negatives into one frame with a fresh
# 0..n-1 index. The negatives frame has no freq column, so freq is NaN there.
df = pd.concat([df, negative_samples_df], ignore_index=True)
df.head()

Unnamed: 0,user_id,song_id,freq,artist,title,interaction
0,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOMWTWK12AB01860CD,2.0,Vampire Weekend,White Sky,1
1,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOBKRVG12A8C133269,1.0,Vampire Weekend,I Stand Corrected (Album),1
2,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOXFXDH12A8C13326E,1.0,Vampire Weekend,The Kids Dont Stand A Chance (Album),1
3,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOAYETG12A67ADA751,1.0,MIKA,Grace Kelly,1
4,05b9035fe02ebdf5535ae9ec07de1e935ffa43df,SOUFNSM12A58A77715,1.0,Vampire Weekend,Walcott (Album),1


In [18]:
# Persist the positives + negatives dataset (the row index is not written).
augmented_path = "./data/augmented_data.csv"
df.to_csv(augmented_path, index=False)
