In [1]:
import pandas as pd

In [2]:
# Read each Spotify CSV export into its own pandas DataFrame.
# The paths are consumed in order, so the files are read in the same sequence
# as the names on the left-hand side.
data_by_artist, data_by_genre, data_by_year, data_with_genres, data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/data/data_by_artist.csv",
        "data/data/data_by_genres.csv",
        "data/data/data_by_year.csv",
        "data/data/data_w_genres.csv",
        "data/data/data.csv",
    )
)

In [3]:
# Print the first five rows of every pandas table, in load order.
for frame in (data_by_artist, data_by_genre, data_by_year, data_with_genres, data):
    print(frame.head())

   mode  count  acousticness  \
0     1      9      0.590111   
1     1     26      0.862538   
2     1      7      0.856571   
3     1     27      0.884926   
4     1      7      0.510714   

                                             artists  danceability  \
0                   "Cats" 1981 Original London Cast      0.467222   
1                          "Cats" 1983 Broadway Cast      0.441731   
2        "Fiddler On The Roof” Motion Picture Chorus      0.348286   
3     "Fiddler On The Roof” Motion Picture Orchestra      0.425074   
4  "Joseph And The Amazing Technicolor Dreamcoat"...      0.467143   

     duration_ms    energy  instrumentalness  liveness   loudness  \
0  250318.555556  0.394003          0.011400  0.290833 -14.448000   
1  287280.000000  0.406808          0.081158  0.315215 -10.690000   
2  328920.000000  0.286571          0.024593  0.325786 -15.230714   
3  262890.962963  0.245770          0.073587  0.275481 -15.639370   
4  270436.142857  0.488286          0.009

In [5]:
# Preview the first rows of the pandas DataFrame read from data.csv.
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler

def create_spark_session(app_name="MusicRecommender"):
    """Build the Spark session used by this notebook.

    The session is configured to run on the local master with all cores
    (``local[*]``) and with ``spark.sql.shuffle.partitions`` set to ``"4"``.

    Args:
        app_name: Application name passed to the session builder.

    Returns:
        The session object produced by ``getOrCreate()``.
    """
    builder = SparkSession.builder
    builder = builder.appName(app_name)
    builder = builder.master("local[*]")
    builder = builder.config("spark.sql.shuffle.partitions", "4")
    return builder.getOrCreate()

def load_dataframes(spark, base_path):
    """Read the five Spotify CSV exports under ``base_path`` with Spark.

    The CSV files quote fields that contain commas and write an embedded
    double quote as ``""`` (for example the artist
    ``"Cats" 1981 Original London Cast``). Spark's CSV reader uses a backslash
    as its default escape character, so the quote character is also passed as
    the escape character here. Without it such rows are split in the wrong
    places, values land in the wrong columns, and schema inference can fall
    back to ``string`` for numeric columns.

    Args:
        spark: Object exposing ``read.csv`` (normally a ``SparkSession``).
        base_path: Directory that contains the CSV files.

    Returns:
        Tuple ``(data, data_by_artist, data_by_genre, data_by_year,
        data_with_genres)`` in that order.
    """
    csv_options = {
        "header": True,
        "inferSchema": True,
        # Treat "" inside a quoted field as a literal quote.
        "quote": '"',
        "escape": '"',
    }
    file_names = (
        "data.csv",
        "data_by_artist.csv",
        "data_by_genres.csv",
        "data_by_year.csv",
        "data_w_genres.csv",
    )
    return tuple(
        spark.read.csv(f"{base_path}/{file_name}", **csv_options)
        for file_name in file_names
    )

def preprocess_for_collab(data):
    """Build the (user, item, rating) table used for collaborative filtering.

    ``artists`` is indexed into ``user_id``, ``name`` into ``item_id``, and
    ``popularity`` (cast to float) is used as the rating.

    Args:
        data: Spark DataFrame with at least the columns ``artists``, ``name``
            and ``popularity``.

    Returns:
        Tuple ``(rating_df, user_indexer_model, item_indexer_model,
        user_id_mapping, item_id_mapping)`` where ``rating_df`` holds
        ``user_id``, ``item_id`` and ``popularity`` with null rows removed,
        and the two mapping frames hold the distinct label/index pairs.
    """
    # StringIndexer's default handleInvalid="error" raises when it meets a
    # NULL label at transform time, so rows without an artist or a track name
    # are removed before the indexers are fitted.
    data = data.dropna(subset=["artists", "name"])

    data = data.withColumn("popularity", col("popularity").cast("float"))

    indexer_user = StringIndexer(inputCol="artists", outputCol="user_id")
    indexer_item = StringIndexer(inputCol="name", outputCol="item_id")

    # Fit and transform to get mappings
    user_indexer_model = indexer_user.fit(data)
    item_indexer_model = indexer_item.fit(data)

    data = user_indexer_model.transform(data)
    data = item_indexer_model.transform(data)

    # Save mappings
    user_id_mapping = data.select("artists", "user_id").distinct()
    item_id_mapping = data.select("name", "item_id").distinct()

    # The float cast yields NULL for unparsable popularity values; drop them.
    rating_df = data.select("user_id", "item_id", "popularity").dropna()

    return rating_df, user_indexer_model, item_indexer_model, user_id_mapping, item_id_mapping


def preprocess_artist_features(data_by_artist):
    """Select the artist name plus the audio/popularity columns, dropping null rows.

    Args:
        data_by_artist: DataFrame exposing ``select`` and ``dropna`` with an
            ``artists`` column and the feature columns listed below.

    Returns:
        The selected columns with every row containing a null removed.
    """
    columns = [
        "artists",
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "tempo",
        "valence",
        "popularity",
    ]
    selected = data_by_artist.select(*columns)
    return selected.dropna()

def preprocess_genre_features(data_by_genre):
    """Select the genre label plus the audio/popularity columns, dropping null rows.

    Args:
        data_by_genre: DataFrame exposing ``select`` and ``dropna`` with a
            ``genres`` column and the feature columns listed below.

    Returns:
        The selected columns with every row containing a null removed.
    """
    columns = [
        "genres",
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "tempo",
        "valence",
        "popularity",
    ]
    selected = data_by_genre.select(*columns)
    return selected.dropna()

def preprocess_year_features(data_by_year):
    """Select the year plus the audio/popularity columns, dropping null rows.

    Args:
        data_by_year: DataFrame exposing ``select`` and ``dropna`` with a
            ``year`` column and the feature columns listed below.

    Returns:
        The selected columns with every row containing a null removed.
    """
    columns = [
        "year",
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "tempo",
        "valence",
        "popularity",
    ]
    selected = data_by_year.select(*columns)
    return selected.dropna()

def preprocess_with_genres(data_with_genres):
    """Select artist, genres and the audio/popularity columns, dropping null rows.

    Args:
        data_with_genres: DataFrame exposing ``select`` and ``dropna`` with
            ``artists`` and ``genres`` columns and the feature columns listed
            below.

    Returns:
        The selected columns with every row containing a null removed.
    """
    columns = [
        "artists",
        "genres",
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "tempo",
        "valence",
        "popularity",
    ]
    selected = data_with_genres.select(*columns)
    return selected.dropna()

def preprocess_for_deep_learning(df, id_col="artists"):
    """Assemble the audio features into one min-max-scaled vector per row.

    Args:
        df: Spark DataFrame containing ``id_col`` and the ten audio feature
            columns listed in ``features``.
        id_col: Name of the identifier column carried through to the output.

    Returns:
        DataFrame with two columns: ``id_col`` and ``features`` (the scaled
        feature vector).
    """
    features = [
        "acousticness", "danceability", "duration_ms", "energy",
        "instrumentalness", "liveness", "loudness", "speechiness",
        "tempo", "valence"
    ]
    # VectorAssembler rejects string-typed inputs and, with its default
    # handleInvalid="error", raises on NULLs. Force every feature to double
    # (unparsable values become NULL) and drop incomplete rows up front, as
    # the other preprocess_* helpers do with dropna().
    numeric_columns = [col(name).cast("double").alias(name) for name in features]
    df = df.select(id_col, *numeric_columns).dropna(subset=features)

    assembler = VectorAssembler(inputCols=features, outputCol="features_raw")
    df = assembler.transform(df)
    scaler = MinMaxScaler(inputCol="features_raw", outputCol="features")
    df = scaler.fit(df).transform(df)
    return df.select(id_col, "features")

def load_all(base_path="data/spotify_dataset"):
    """Create a Spark session, load every CSV and run all preprocessing steps.

    Args:
        base_path: Directory passed to ``load_dataframes``.

    Returns:
        Dict with the keys ``collab``, ``artist_content``, ``genre_content``,
        ``year_content``, ``with_genres``, ``deep_artist``, ``deep_genre`` and
        ``raw``. ``collab`` holds whatever ``preprocess_for_collab`` returns
        (its full return value, not only the ratings frame); ``raw`` is the
        unprocessed frame loaded from ``data.csv``.
    """
    session = create_spark_session()
    tracks, by_artist, by_genre, by_year, with_genres = load_dataframes(session, base_path)

    # Filled in the same order as the preprocessing steps are executed.
    bundle = {}
    bundle["collab"] = preprocess_for_collab(tracks)
    bundle["artist_content"] = preprocess_artist_features(by_artist)
    bundle["genre_content"] = preprocess_genre_features(by_genre)
    bundle["year_content"] = preprocess_year_features(by_year)
    bundle["with_genres"] = preprocess_with_genres(with_genres)
    bundle["deep_artist"] = preprocess_for_deep_learning(by_artist, id_col="artists")
    bundle["deep_genre"] = preprocess_for_deep_learning(by_genre, id_col="genres")
    bundle["raw"] = tracks
    return bundle

In [47]:
# NOTE(review): this rebinds `data`, which other cells use for the pandas
# DataFrame read from data.csv; cells that call pandas methods on `data`
# must not run after this one.
data = load_all()

# Display collaborative filtering dataset.
# preprocess_for_collab returns the tuple
# (rating_df, user_indexer_model, item_indexer_model, user_id_mapping, item_id_mapping),
# so the ratings DataFrame is element 0; calling .show() on the tuple itself fails.
data["collab"][0].show()

# Display content-based filtering using artist-level data
data["artist_content"].show()

# Display content-based filtering using genre-level data
data["genre_content"].show()

# Display content-based filtering using year-level data
data["year_content"].show()

# Display data with genres included
data["with_genres"].show()

# Display deep learning data using artist-level features
data["deep_artist"].show()

# Display deep learning data using genre-level features
data["deep_genre"].show()


25/04/20 14:12:15 WARN DAGScheduler: Broadcasting large task binary with size 8.4 MiB


+-------+--------+----------+
|user_id| item_id|popularity|
+-------+--------+----------+
|12942.0|  2805.0|         4|
| 5151.0| 39213.0|         5|
| 1510.0| 54157.0|         5|
|20155.0|   109.0|         3|
| 8529.0|  6382.0|         2|
| 1510.0| 54167.0|         6|
|11382.0|115086.0|         4|
| 6826.0| 13529.0|         2|
|    4.0| 72258.0|         0|
| 3584.0| 65319.0|         0|
|  727.0| 42415.0|         0|
|    4.0| 14756.0|         0|
|20776.0| 72661.0|         0|
| 2239.0| 17873.0|         0|
|33594.0| 93552.0|         0|
| 4829.0|  6869.0|         4|
| 8529.0| 39752.0|         1|
| 2973.0| 15390.0|         0|
| 4861.0|124139.0|         0|
| 5108.0| 23755.0|         0|
+-------+--------+----------+
only showing top 20 rows

+--------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+
|             

In [46]:
# List the keys of `data` (the recorded output shows the dict returned by load_all()).
data.keys()

dict_keys(['collab', 'artist_content', 'genre_content', 'year_content', 'with_genres', 'deep_artist', 'deep_genre', 'raw'])

In [13]:
# Preview the first rows of `data`. This expects the pandas DataFrame read
# from data.csv, not the dict that load_all() returns.
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [14]:
# Preview the first rows of the per-artist table.
data_by_artist.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""Fiddler On The Roof” Motion Picture Chorus",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""Fiddler On The Roof” Motion Picture Orchestra",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [15]:
# Preview the first rows of the per-genre table.
data_by_genre.head()

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [16]:
# Preview the first rows of the per-year table.
data_by_year.head()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


In [17]:
# Preview the first rows of the artist table that also carries a genres column.
data_with_genres.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5,1,7


In [19]:
# Column labels of the artist-with-genres table.
data_with_genres.columns

Index(['genres', 'artists', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key', 'mode', 'count'],
      dtype='object')

In [20]:
# Column labels of `data`; expects the pandas DataFrame read from data.csv.
data.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [21]:
# Column labels of the per-genre table.
data_by_genre.columns

Index(['mode', 'genres', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key'],
      dtype='object')

In [22]:
# Per-column count of non-missing values in `data`; expects the pandas
# DataFrame read from data.csv.
data.count()

valence             170653
year                170653
acousticness        170653
artists             170653
danceability        170653
duration_ms         170653
energy              170653
explicit            170653
id                  170653
instrumentalness    170653
key                 170653
liveness            170653
loudness            170653
mode                170653
name                170653
popularity          170653
release_date        170653
speechiness         170653
tempo               170653
dtype: int64

In [23]:
# Per-column count of non-missing values in the artist-with-genres table.
data_with_genres.count()

genres              28680
artists             28680
acousticness        28680
danceability        28680
duration_ms         28680
energy              28680
instrumentalness    28680
liveness            28680
loudness            28680
speechiness         28680
tempo               28680
valence             28680
popularity          28680
key                 28680
mode                28680
count               28680
dtype: int64

In [24]:
# Per-column count of non-missing values in the per-genre table.
data_by_genre.count()

mode                2973
genres              2973
acousticness        2973
danceability        2973
duration_ms         2973
energy              2973
instrumentalness    2973
liveness            2973
loudness            2973
speechiness         2973
tempo               2973
valence             2973
popularity          2973
key                 2973
dtype: int64

In [25]:
# Per-column count of non-missing values in the per-artist table.
data_by_artist.count()

mode                28680
count               28680
acousticness        28680
artists             28680
danceability        28680
duration_ms         28680
energy              28680
instrumentalness    28680
liveness            28680
loudness            28680
speechiness         28680
tempo               28680
valence             28680
popularity          28680
key                 28680
dtype: int64

In [34]:
# Print the column index of every table, one labelled entry per table.
for label, frame in (
    ("columns in data_by_artist:", data_by_artist),
    ("columns in data_by_genre:", data_by_genre),
    ("columns in data_by_year:", data_by_year),
    ("columns in data_with_genres:", data_with_genres),
    ("columns in data:", data),
):
    print(label, frame.columns)

columns in data_by_artist: Index(['mode', 'count', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'popularity', 'key'],
      dtype='object')
columns in data_by_genre: Index(['mode', 'genres', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key'],
      dtype='object')
columns in data_by_year: Index(['mode', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key'],
      dtype='object')
columns in data_with_genres: Index(['genres', 'artists', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key', 'mode', 'count'],
      dtype

[('user_id', 'double'), ('item_id', 'double'), ('popularity', 'string')]

In [None]:
# Print the column index of every table, pairing each label with its frame.
column_labels = (
    "columns in data_by_artist:",
    "columns in data_by_genre:",
    "columns in data_by_year:",
    "columns in data_with_genres:",
    "columns in data:",
)
tables = (data_by_artist, data_by_genre, data_by_year, data_with_genres, data)
for column_label, table in zip(column_labels, tables):
    print(column_label, table.columns)

columns in data_by_artist: Index(['mode', 'count', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'popularity', 'key'],
      dtype='object')
columns in data_by_genre: Index(['mode', 'genres', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key'],
      dtype='object')
columns in data_by_year: Index(['mode', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key'],
      dtype='object')
columns in data_with_genres: Index(['genres', 'artists', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key', 'mode', 'count'],
      dtype