# Preprocessing

Mounting to drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing required packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from google.colab import files
import datetime
import networkx as nx
import matplotlib.pyplot as plt
import random

In [None]:
# Reading original data
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Project/lastfm-dataset-1K.tar/lastfm-dataset-1K/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',sep="\t", on_bad_lines='skip')
# Renaming the columns
data.columns =['userid', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name']
# Dropping rows where track id or track name is null
data['track_name'].isnull().values.any()
data=data.dropna(subset=['track_name'])

data['track_id'].isnull().values.any()
data=data.dropna(subset=['track_id'])
# resetting the index
data.reset_index(drop=True)

In [None]:
# Convert object type to datetime object
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp'] = pd.to_datetime(data['timestamp'].dt.strftime("%Y-%m-%d %H:%M:%S.%f"))
print(data.dtypes)


Unnamed: 0              int64
userid                 object
timestamp      datetime64[ns]
artist_id              object
artist_name            object
track_id               object
track_name             object
dtype: object


In [None]:
data.shape

(16936134, 7)

Creating Sessions

In [None]:

# Sort the data by user and timestamp to ensure sequential order
data=data.groupby(['userid']).apply(lambda x: x.sort_values(['timestamp'], ascending=True)).reset_index(drop=True)

# Function to create unique session IDs based on time difference
def create_session_ids(data, threshold_time):
    session_ids = []
    prev_user = None
    prev_timestamp = None
    session_id_counter = 0

    for index, row in data.iterrows():
        user_id = row['userid']
        timestamp = pd.to_datetime(row['timestamp'])

        if prev_user is None or user_id != prev_user or (timestamp - prev_timestamp).total_seconds() > threshold_time:
            # Start a new session for a different user or if the time difference exceeds the threshold
            session_id_counter += 1

        session_ids.append(session_id_counter)

        prev_user = user_id
        prev_timestamp = timestamp

    return session_ids

# Set the threshold time in seconds (Here, 360 seconds = 6 minutes)
threshold_time_seconds = 360





In [None]:
# Create unique session IDs based on the time difference
data['session_id'] = create_session_ids(data, threshold_time_seconds)

In [None]:
data.head(-100)

Unnamed: 0,userid,timestamp,artist_id,artist_name,track_id,track_name,session_id
0,user_1,2006-08-13 13:59:20,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,c4633ab1-e715-477f-8685-afa5f2058e42,The Launching Of Big Face,1
1,user_1,2006-08-13 14:03:29,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,bc2765af-208c-44c5-b3b0-cf597a646660,Zn Zero,1
2,user_1,2006-08-13 14:10:43,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,aa9c5a80-5cbe-42aa-a966-eb3cfa37d832,The Return Of Super Barrio - End Credits,2
3,user_1,2006-08-13 14:17:40,67fb65b5-6589-47f0-9371-8a40eb268dfb,Tommy Guerrero,d9b1c1da-7e47-4f97-a135-77260f2f559d,Mission Flats,3
4,user_1,2006-08-13 14:19:06,1cfbc7d1-299c-46e6-ba4c-1facb84ba435,Artful Dodger,120bb01c-03e4-465f-94a0-dce5e9fac711,What You Gonna Do?,3
...,...,...,...,...,...,...,...
16936029,user_999,2009-05-02 09:21:16,b7ffd2af-418f-4be2-bdd1-22f8b48613da,Nine Inch Nails,f68d3318-e0b8-4612-b95b-9710e6ddd322,The Day The World Went Away,3671979
16936030,user_999,2009-05-02 09:26:34,ff4308de-5b17-4869-ad77-ea41d8bf3b18,Henry Homesweet,68fa3259-dbc2-4cb4-9fe1-326bc248f856,Pocket Monster,3671979
16936031,user_999,2009-05-02 09:28:13,b7ffd2af-418f-4be2-bdd1-22f8b48613da,Nine Inch Nails,3fffc4ec-6cf0-4e4c-b1e5-5c988f07bf3e,24 Ghosts Iii,3671979
16936032,user_999,2009-05-02 09:31:17,c27f8fae-e697-4959-a791-babfd0a4ecbf,"Raz, Dwa, Trzy",6a9ab649-5de2-4adf-a798-4fdd30805e08,Czarna Inez,3671979


In [None]:
data.to_csv('/content/drive/MyDrive/Colab Notebooks/Project/clean_data.csv',index=False)

In [None]:
data.shape

(16936134, 7)

In [3]:
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Project/clean_data.csv', on_bad_lines='skip',index_col=False)

In [None]:
def create_tagged_document_user(data):
    user_tagged_data = []
    words=[]
    user_id = "user_%d" % (1)
    for idx, row in data.iterrows():
      if row['userid'] == user_id:
        words.append(row['track_name'])
      else:
        user_tagged_data.append(TaggedDocument(words=words, tags=[str(user_id)]))
        words=[]
        words.append(row['track_name'])
        user_id=row['userid']
    return user_tagged_data

user_tagged_data = create_tagged_document_user(data)



In [None]:
#Save the tagged document for future use
with open('/content/drive/MyDrive/Colab Notebooks/Project/user_tagged_documents.pkl', 'wb') as f:
    pickle.dump(user_tagged_data, f)

In [4]:
with open('/content/drive/MyDrive/Colab Notebooks/Project/user_tagged_documents.pkl', 'rb') as f:
    user_tagged_data = pickle.load(f)

In [5]:
train, test = train_test_split(user_tagged_data, test_size=0.6, random_state=42)

In [6]:
def get_user_data(test_data,index):
    next_song=test_data[index][0][-1]
    user=test_data[index][1]
    next_artist=data[data['track_name']==next_song]['artist_name'].tolist()[0]
    session=test_data[index][0][:-1]
    return session,next_song,next_artist,user

Random recommendations

In [7]:
# Function to generate random recommendations
def random_recommendations(session,userid, num_recommendations=10):
  df=pd.DataFrame(session, columns=['values'])
  unique_tracks = df['values'].unique()
  try:
    random_tracks = random.sample(list(unique_tracks), num_recommendations)
    random_artists=[data[data['track_name']==node]['artist_name'].tolist()[0] for node in random_tracks]
    return random_tracks,random_artists
  except ValueError:
    random_tracks = random.choices(list(unique_tracks), k = 10)
    random_artists=[data[data['track_name']==node]['artist_name'].tolist()[0] for node in random_tracks]
    return random_tracks,random_artists
    pass

In [None]:
def hit_rate_random(test_data):
    total_sessions = len(test_data)
    correct_recommendations = 0
    actual_total_sessions=0
    for i in range(0,total_sessions):
      session,actual_next_song,actual_next_artist,user=get_user_data(test_data,i)
      popular_tracks,popular_artists = random_recommendations(session,user)
      print("recommended songs",popular_tracks)
      print("actual song",actual_next_song)
      print("recommended artists",popular_artists)
      print("actual artist",actual_next_artist)
      if actual_next_song in popular_tracks:
        correct_recommendations += 1
      elif actual_next_artist in popular_artists:
        correct_recommendations += 0.5


    hit_rate = correct_recommendations / total_sessions
    return hit_rate

Most popular recommendations

In [8]:
# Function to generate popularity-based recommendations
def popularity_recommendations(session,user_id,num_recommendations=10):
  df=pd.DataFrame(session, columns=['values'])
  item_popularity=df['values'].value_counts()
  popular_tracks = item_popularity.index[:num_recommendations].tolist()
  popular_artists=[data[data['track_name']==node]['artist_name'].tolist()[0] for node in popular_tracks]
  return popular_tracks,popular_artists



In [9]:
def hit_rate_baseline(test_data):
    total_sessions = len(test_data)
    correct_recommendations = 0
    actual_total_sessions=0
    for i in range(0,total_sessions):
      session,actual_next_song,actual_next_artist,user=get_user_data(test_data,i)
      popular_tracks,popular_artists = popularity_recommendations(session,user)
      print("recommended songs",popular_tracks)
      print("actual song",actual_next_song)
      print("recommended artists",popular_artists)
      print("actual artist",actual_next_artist)
      if actual_next_song in popular_tracks:
        correct_recommendations += 1
      elif actual_next_artist in popular_artists:
        correct_recommendations += 0.5


    hit_rate = correct_recommendations / total_sessions
    #print(actual_total_sessions)
    return hit_rate

In [12]:
def MRR_MP(test_data):
    total_sessions = len(test_data)
    sum_reciprocal_rank = 0
    actual_total_sessions=0
    for i in range(0,total_sessions):
      session,actual_next_song,actual_next_artist,user=get_user_data(test_data,i)
      if (session,actual_next_song,actual_next_artist)==(0,0,0):
        continue
      else:
        actual_total_sessions=actual_total_sessions+1
        recommended_songs,recommended_artist = popularity_recommendations(session,user)
        print("recommended songs",recommended_songs)
        print("actual song",actual_next_song)
        print("recommended artists",recommended_artist)
        print("actual artist",actual_next_artist)
        if actual_next_song in recommended_songs:
          rank = recommended_songs.index(actual_next_song)+1 #because we start from 0 index
          print(rank)
          sum_reciprocal_rank += 1.0/rank
          print(sum_reciprocal_rank)


    mrr = sum_reciprocal_rank / actual_total_sessions
    print(actual_total_sessions)
    return mrr

In [13]:
MRR_MP(test[:150])

recommended songs ['What Shall We Die For', 'Up Is Down', 'May It Be', 'Hoist The Colours', 'The Ride Of The Rohirrim', 'The Take Over, The Breaks Over', 'Jack Sparrow', 'Singapore', 'Drink Up Me Hearties', 'Famous Last Words']
actual song You Give Me Something
recommended artists ['Hans Zimmer', 'Hans Zimmer', 'Enya', 'Hans Zimmer', 'Howard Shore', 'Fall Out Boy', 'Hans Zimmer', 'Tom Waits', 'Hans Zimmer', 'Glory Of This']
actual artist Jamiroquai
recommended songs ['Wheelpusher', 'Black Is The Colour', 'Scarborough Fair', "What'S A Girl To Do", 'Let Go', 'Back In Style', 'All I Need', 'We Can Have It', "When Your Mind'S Made Up", 'Stay Tuned']
actual song Lend Me Your Face
recommended artists ['Anni Rossi', 'The Czars', 'Wes Montgomery', 'Bat For Lashes', 'Adema', 'The Cliks', 'Radiohead', 'The Dears', 'Glen Hansard & Markéta Irglová', 'Ambulance Ltd']
actual artist Fight Like Apes
recommended songs ['Eleanor Put Your Boots On', 'Life On Mars?', 'Cherry Chapstick', 'Outsiders', 'This

0.009722222222222222

In [None]:
session,next_song,next_artist,user=get_user_data(test,0)
random_recommendations(session,user)

You Give Me Something
['Flashback (Interlude)', 'Stay With Me', "I'Ve Got All This Ringing In My Ears And None On My Fingers", 'Poetry', 'I Love You', 'Frozen', 'Saving Buckbeak', 'Mountain', 'Animal I Have Become', 'All Star'] ['Danity Kane', 'Spiritualized', 'Fall Out Boy', 'The Rh Factor', 'Unity', 'Celldweller', 'John Williams', 'Remember Remember', 'Three Days Grace', 'Smash Mouth']


In [None]:
hit_rate_baseline(test[:20])

You Give Me Something
recommended songs ['What Shall We Die For', 'Up Is Down', 'May It Be', 'Hoist The Colours', 'The Ride Of The Rohirrim', 'The Take Over, The Breaks Over', 'Jack Sparrow', 'Singapore', 'Drink Up Me Hearties', 'Famous Last Words']
actual song You Give Me Something
recommended artists ['Hans Zimmer', 'Hans Zimmer', 'Enya', 'Hans Zimmer', 'Howard Shore', 'Fall Out Boy', 'Hans Zimmer', 'Tom Waits', 'Hans Zimmer', 'Glory Of This']
actual artist Jamiroquai
Lend Me Your Face
recommended songs ['Wheelpusher', 'Black Is The Colour', 'Scarborough Fair', "What'S A Girl To Do", 'Let Go', 'Back In Style', 'All I Need', 'We Can Have It', "When Your Mind'S Made Up", 'Stay Tuned']
actual song Lend Me Your Face
recommended artists ['Anni Rossi', 'The Czars', 'Wes Montgomery', 'Bat For Lashes', 'Adema', 'The Cliks', 'Radiohead', 'The Dears', 'Glen Hansard & Markéta Irglová', 'Ambulance Ltd']
actual artist Fight Like Apes
I Remember (Original Mix)
recommended songs ['Eleanor Put Your 

0.1

In [None]:
hit_rate_random(test[:20])

You Give Me Something
recommended songs ['The White Rider', 'I Got Money Now', 'Multiple Jacks', 'The Take Over, The Breaks Over', 'Hagrid The Professor', 'Until The End', 'Summer Love', 'Get Out Alive', 'Wild Dances', 'Sexy Ladies']
actual song You Give Me Something
recommended artists ['Howard Shore', 'P!Nk', 'Hans Zimmer', 'Fall Out Boy', 'John Williams', 'Entwine', 'Justin Timberlake', 'Three Days Grace', 'Руслана Лижичко', 'Justin Timberlake']
actual artist Jamiroquai
Lend Me Your Face
recommended songs ['The Mirror', 'Horse Tears', 'One Hit', "Maybe You'Re Gone", 'Saltbreakers', 'Ladies Love Chest Rockwell', 'Do You Know Why?', 'Weekend Wars', 'Melody', 'Saint Tropez Is Not Far']
actual song Lend Me Your Face
recommended artists ['Philip Glass', 'Goldfrapp', 'The Knife', 'Sondre Lerche', 'Laura Veirs', 'Lovage', 'Carmen Mcrae', 'Mgmt', 'Pax217', 'Plastilina Mosh']
actual artist Fight Like Apes
I Remember (Original Mix)
recommended songs ['Spiralling', 'Beautiful That Way', 'Touch

0.0

Testing baseline models

In [None]:
hit_rate_random(test[:150])

recommended songs ['The Worst Pies In London', 'The Howling', 'The Kraken', 'The Forbidden Pool', 'Scars', "Harry'S Wondrous World", 'May It Be', 'Phantom', 'Concerning Hobbits', 'Forgiven']
actual song You Give Me Something
recommended artists ['Stephen Sondheim', 'Within Temptation', 'Squirrel Nut Zippers', 'Howard Shore', 'Papa Roach', 'John Williams', 'Enya', 'Justice', 'Howard Shore', 'Disturbed']
actual artist Jamiroquai
recommended songs ['See You On The Moon!', 'Shankill Butchers', 'Know How', 'Love Me Sweet', 'Where Gravity Is Dead', 'Phony Revolutions', 'This Loneliness', 'Hunted By A Freak', 'Random Firl', 'Tubefed']
actual song Lend Me Your Face
recommended artists ['Great Lake Swimmers', 'The Decemberists', 'Kings Of Convenience', 'Kid Loco', 'Laura Veirs', 'Crooked Fingers', 'El Perro Del Mar', 'Mogwai', 'Late Of The Pier', 'Halou']
actual artist Fight Like Apes
recommended songs ['Not An Addict', 'Pi*R^2', 'Blvd. Des Souvenirs', 'Tired Hippo', 'Bad', 'In My Heart', 'Mayb

0.013333333333333334

In [None]:
hit_rate_baseline(test[:150])

recommended songs ['What Shall We Die For', 'Up Is Down', 'May It Be', 'Hoist The Colours', 'The Ride Of The Rohirrim', 'The Take Over, The Breaks Over', 'Jack Sparrow', 'Singapore', 'Drink Up Me Hearties', 'Famous Last Words']
actual song You Give Me Something
recommended artists ['Hans Zimmer', 'Hans Zimmer', 'Enya', 'Hans Zimmer', 'Howard Shore', 'Fall Out Boy', 'Hans Zimmer', 'Tom Waits', 'Hans Zimmer', 'Glory Of This']
actual artist Jamiroquai
recommended songs ['Wheelpusher', 'Black Is The Colour', 'Scarborough Fair', "What'S A Girl To Do", 'Let Go', 'Back In Style', 'All I Need', 'We Can Have It', "When Your Mind'S Made Up", 'Stay Tuned']
actual song Lend Me Your Face
recommended artists ['Anni Rossi', 'The Czars', 'Wes Montgomery', 'Bat For Lashes', 'Adema', 'The Cliks', 'Radiohead', 'The Dears', 'Glen Hansard & Markéta Irglová', 'Ambulance Ltd']
actual artist Fight Like Apes
recommended songs ['Eleanor Put Your Boots On', 'Life On Mars?', 'Cherry Chapstick', 'Outsiders', 'This

0.03333333333333333