<a href="https://colab.research.google.com/github/p-ai-org/p-music/blob/main/album_metadata_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [132]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

ds = pd.read_csv('merged_features.csv')

# New Section

In [154]:
ds.shape
ds = ds.drop('Unnamed: 0', axis = 1)
ds.columns

Index(['Ranking', 'Album', 'Artist', 'Release Date', 'Genres', 'Descriptors',
       'Average Rating', 'Number of Ratings', 'Number of Reviews',
       'Release Month', 'Release Day', 'Release Year', 'Format', 'Label',
       'Genre', 'Metacritic Critic Score', 'Metacritic Reviews',
       'Metacritic User Score', 'Metacritic User Reviews', 'AOTY Critic Score',
       'AOTY Critic Reviews', 'AOTY User Score', 'AOTY User Reviews'],
      dtype='object')

In [155]:
#use an average of AOTY and metacritic user score (the same goes with critic)
#if one of them is missing, use the other
#if both are missing, drop that row
col_list = ['AOTY Critic Score', 'Metacritic User Score', 'AOTY User Score', 'Metacritic Critic Score', 'Metacritic User Reviews', 'AOTY User Reviews', 'AOTY Critic Reviews', 'Metacritic Reviews']
ds[col_list] = ds[col_list].fillna(0)
ds[:5]

Unnamed: 0,Ranking,Album,Artist,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews,Release Month,...,Label,Genre,Metacritic Critic Score,Metacritic Reviews,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews
0,1.0,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock",melancholic,4.23,70382,1531,June,...,Parlophone,Alternative Rock,0.0,0.0,0.0,0.0,91,12,93,3204
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock",melancholic,4.29,48662,983,September,...,Harvest,Progressive Rock,0.0,0.0,0.0,0.0,100,4,91,1607
2,4.0,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic",cold,4.21,58590,734,October,...,Capitol / EMI,Experimental Rock,80.0,24.0,8.9,1129.0,85,13,92,2862
3,5.0,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap",political,4.27,44206,379,March,...,Aftermath / Interscope,Hip Hop,96.0,44.0,8.8,3616.0,95,42,93,4530
4,6.0,Loveless,My Bloody Valentine,4 November 1991,"Shoegaze, Noise Pop",noisy,4.24,49887,1223,November,...,Creation,Shoegaze,0.0,0.0,0.0,0.0,94,9,91,1634


In [156]:
critic_score = ['AOTY Critic Score', 'Metacritic Critic Score'] #if we expand dataset then these lists will have more items
user_score = ['AOTY User Score', 'Metacritic User Score']
critic_reviews = ['AOTY Critic Reviews', 'Metacritic Reviews']
user_reviews = ['AOTY User Reviews', 'Metacritic User Reviews']
merge_list = [critic_score, user_score, critic_reviews, user_reviews]

In [171]:
#tried curried function but chain indexing became a problem
#uncurried version below
def scoreMergeUncurried(col_list):
  merged_score = []
  for x in ds.index:
    if ds.loc[x, col_list[0]]==0 and ds.loc[x, col_list[1]]==0 : #both values filled with 0
     merged_score.append(sum(merged_score)/len(merged_score)) #switch to substitution
    elif ds.loc[x, col_list[0]]==0:
      merged_score.append(ds.loc[x, col_list[1]])
    elif ds.loc[x, col_list[1]]==0:
      merged_score.append(ds.loc[x, col_list[0]])
    else: #neither missing -> use mean
      merged_score.append((ds.loc[x, col_list[0]]+ds.loc[x, col_list[0]])/2)
  return merged_score
  
  #df['new_col] = scoreMergeUncurried(col_list)

In [173]:
ds['merged_critic_score'] = scoreMergeUncurried(critic_score)
ds['merged_user_score'] = scoreMergeUncurried(user_score)
ds['merged_critic_reviews'] = scoreMergeUncurried(critic_reviews)
ds['merged_user_reviews'] = scoreMergeUncurried(user_reviews)

#drop originals at once here
ds = ds.drop(col_list, axis=1)

In [180]:
#numberify(?) release year. fill in mean if missing (these aren't highly correlated with the output anyway)

ds['Release Year'] = ds['Release Date'].str.replace("-", "").apply(lambda x: x[-2:])
ds['Release Year'] = [x if x.isnumeric() else np.nan for x in ds['Release Year']]
ds['Release Year'] =  [x+2000 if x<23 and x != np.nan else x+1900 for x in ds['Release Year'].astype('int64')]
ds['Release Year'].isna().any()

#convert number of ratings into number data type
ds['Number of Ratings'] = ds['Number of Ratings'].str.replace(',', '').astype('int64')
#ds['Number of Ratings'].isna().any()
#
#drop release date and month and day (date: redundant with year, month and day have almost no correlation with output)
ds = ds.drop(['Release Date', 'Release Month', 'Release Day'], axis = 1)



In [161]:
#for each row in ds["Genre"]
#if value is missing
#replace with first value in ds["Genres"]

missing_genre = ds.loc[ds["Genre"].isna()]
for x in missing_genre.index:
  ds.loc[x, 'Genre'] = ds.loc[x, 'Genres'][0]

for x in ds.index:
  if "," in ds.loc[x, 'Genre']:
    ds.loc[x, 'Genre'] = ds.loc[x, 'Genre'].split(',')[0]


In [162]:
def catClean(col):
  col = col.fillna("None")
  new_col = col.apply(lambda row: row.split(', ')[0])
  return new_col

In [163]:
ds["Descriptors"] = catClean(ds["Descriptors"])
ds["Label"] = catClean(ds["Label"])

Num_features:  ['Ranking', 'Average Rating', 'Number of Ratings', 'Number of Reviews', 'Release Year', 'merged_critic_score', 'merged_user_score', 'merged_critic_reviews', 'merged_user_reviews']
cat_features:  ['Album', 'Artist', 'Genres', 'Descriptors', 'Format', 'Label', 'Genre']


In [191]:
#data processing: copied from kaggle learn deep learning course
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = ds.copy()
y = X.pop('Average Rating')

num_features = ds.select_dtypes(include=np.number).columns.tolist()
#print("Num_features: ",num_features)

cat_features = ds.select_dtypes(include=object).columns.tolist()
#print("cat_features: ",cat_features)

num_transformer = make_pipeline(
    SimpleImputer(strategy="constant"), # there are a few missing values
    StandardScaler(),
)
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="None"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features),
)

# stratify - make sure classes are evenlly represented across splits
X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, stratify=y, train_size=0.85)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

input_shape = [X_train.shape[1]]

ValueError: ignored

In [None]:
#model

model = keras.Sequential([
    layers.BatchNormalization(input_shape = input_shape),
    layers.Dense(256, activation = "relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation = "relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation = "sigmoid")
]
)


In [None]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ['binary_accuracy'])

In [164]:
#one-hot encoding for entries with multiple items
#abandoned this idea because it was more complicated than I expected and didn't seem to be worth extra processing pwoerr
#go for the simpler solution if I have two different solution
# 1 if item is in each row entry, 0 if not

#create list of top 20 items by frequency


#if entry has multiple words, split it into list
def wordSplit(row):
  if "," in row:
    row = row.split(', ')

#input unprocessed col
def makeList(col):
  item_counts =col.value_counts()
  #print(item_counts[:5])
  top_list = item_counts.index[:20].tolist()
  return top_list



#input processed col here
def makeDf(col):
  #dict will go into new df
  item_dict = {}
  for item in top_list:
    item_dict[item] = []
  for item in top_list: #for each item in top_list, append 1 to entry if item is in the row
    item_dict[item] = [1 if (item in set(row)) else 0 for row in set(col)]
    print(item, "+" , item_dict[item])
  item_df = pd.DataFrame.from_dict(item_dict)
  return item_df