In [1]:
import numpy as np
import pandas as pd
import gzip
from functools import reduce
from operator import iconcat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Read the dataset from the gzipped file with the gzip library, one record per line
def parse(path):
  """Lazily yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: Location of the gzip file to read.

  Yields:
    The result of evaluating each line as a Python expression.
  """
  # The context manager closes the handle once the generator is exhausted or
  # discarded; the previous bare gzip.open() never closed it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() executes whatever expression the line contains. Only
      # use this on files from a trusted source. If the records are plain
      # literals, ast.literal_eval is a safer choice -- TODO confirm and switch.
      yield eval(l)

def getDF(path):
  """Collect every record produced by parse(path) into a DataFrame.

  Each record becomes one row, labelled with its 0-based position in the
  sequence that parse() yields.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the complete metadata file into a DataFrame (one row per record)
df_orig = getDF(path='meta_Digital_Music.json.gz')

In [3]:
# Keep only the relevant columns. The explicit copy makes df an independent
# frame, so later writes to it are unambiguous and never act on a selection
# of df_orig (which would trigger pandas' chained-assignment warnings).
df = df_orig[["title", "categories", "description"]].copy()

In [4]:
# Build the genre list of a song so it can stand in for a missing description
def get_genres(categories):
    """Return the distinct genres of a song as a comma-separated string.

    The categories "CDs & Vinyl" and "Digital Music" are left out because
    they are not very descriptive.

    Args:
        categories: An iterable of category groups, each an iterable of
            category names (e.g. ``[["Digital Music", "Pop"], ["Rock"]]``).
            A missing value (``None`` or a float such as NaN) is accepted.

    Returns:
        The remaining category names joined with ", ", each listed once in
        order of first appearance; "" when nothing remains or the input is
        missing.
    """
    categories_to_exclude = ("CDs & Vinyl", "Digital Music")

    # Rows without a categories entry show up as NaN/None; there is nothing
    # to iterate over, so report "no genres" instead of raising TypeError.
    if categories is None or isinstance(categories, float):
        return ""

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would order names arbitrarily).
    distinct_genres = dict.fromkeys(
        category
        for group in categories
        for category in group
        if category not in categories_to_exclude
    )
    return ", ".join(distinct_genres)

In [5]:
# Use the genre list as the description wherever the description is missing or empty.
# Assigning to the `row` yielded by iterrows() is not guaranteed to change the
# DataFrame (pandas documents that the row may be a copy), so the values are
# written through .loc on the frame itself.
df = df.copy()  # df was selected from df_orig; own the data before writing to it
description_missing = df["description"].isna() | (df["description"] == "")
df.loc[description_missing, "description"] = (
    df.loc[description_missing, "categories"].apply(get_genres)
)

In [6]:
# Drop the categories column and every row whose title is empty or missing
df = (
    df[["title", "description"]]
    .loc[lambda frame: frame["title"].ne('') & frame["title"].notna()]
)

In [7]:
# Fit a TfidfVectorizer (English stop words removed) on the song descriptions
# and keep the resulting TF-IDF matrix
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["description"])

In [8]:
# Display the dimensions of the TF-IDF matrix as (rows, columns); presumably
# rows are descriptions and columns are vocabulary terms -- confirm against
# the scikit-learn fit_transform documentation
tfidf_matrix.shape

(7321, 32848)

In [9]:
# Similarity matrix of the TF-IDF matrix: linear_kernel computes the pairwise
# dot products of its rows. With the second argument omitted, the matrix is
# compared with itself.
cosine_sim = linear_kernel(tfidf_matrix)
# Author's note: unfortunately this requires almost 600GB of RAM, so the result can't be used