<a href="https://colab.research.google.com/github/p-ai-org/p-music/blob/main/album_metadata_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

ds = pd.read_csv('merged_features.csv')


# New Section

In [4]:
ds.shape
ds = ds.drop('Unnamed: 0', axis = 1)
ds.columns


Index(['Ranking', 'Album', 'Artist', 'Release Date', 'Genres', 'Descriptors',
       'Average Rating', 'Number of Ratings', 'Number of Reviews',
       'Release Month', 'Release Day', 'Release Year', 'Format', 'Label',
       'Genre', 'Metacritic Critic Score', 'Metacritic Reviews',
       'Metacritic User Score', 'Metacritic User Reviews', 'AOTY Critic Score',
       'AOTY Critic Reviews', 'AOTY User Score', 'AOTY User Reviews'],
      dtype='object')

In [6]:
#use an average of AOTY and metacritic user score (the same goes with critic)
#if one of them is missing, use the other
#if both are missing, drop that row
col_list = ['AOTY Critic Score', 'Metacritic User Score', 'AOTY User Score', 'Metacritic Critic Score', 'Metacritic User Reviews', 'AOTY User Reviews', 'AOTY Critic Reviews', 'Metacritic Reviews']
ds[col_list] = ds[col_list].fillna(0)
ds[:5]

Unnamed: 0,Ranking,Album,Artist,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews,Release Month,...,Label,Genre,Metacritic Critic Score,Metacritic Reviews,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews
0,1.0,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,70382,1531,June,...,"Parlophone, Capitol",Alternative Rock,0.0,0.0,0.0,0.0,91,12,93,3204
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,48662,983,September,...,Harvest,Progressive Rock,0.0,0.0,0.0,0.0,100,4,91,1607
2,4.0,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,58590,734,October,...,Capitol / EMI,Experimental Rock,80.0,24.0,8.9,1129.0,85,13,92,2862
3,5.0,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,44206,379,March,...,Aftermath / Interscope,Hip Hop,96.0,44.0,8.8,3616.0,95,42,93,4530
4,6.0,Loveless,My Bloody Valentine,4 November 1991,"Shoegaze, Noise Pop","noisy, ethereal, atmospheric, romantic, dense,...",4.24,49887,1223,November,...,Creation,Shoegaze,0.0,0.0,0.0,0.0,94,9,91,1634


In [7]:
critic_score = ['AOTY Critic Score', 'Metacritic Critic Score'] #if we expand dataset then these lists will have more items
user_score = ['AOTY User Score', 'Metacritic User Score']
critic_reviews = ['AOTY Critic Reviews', 'Metacritic Reviews']
user_reviews = ['AOTY User Reviews', 'Metacritic User Reviews']
merge_list = [critic_score, user_score, critic_reviews, user_reviews]

In [9]:
#tried curried function but chain indexing became a problem
def scoreMergeUncurried(col_list):
  merged_score = []
  for x in ds.index:
    if ds.loc[x, col_list[0]]==0 and ds.loc[x, col_list[1]]==0 : #both values filled with 0
     merged_score.append(sum(merged_score)/len(merged_score)) #switch to substitution
    elif ds.loc[x, col_list[0]]==0:
      merged_score.append(ds.loc[x, col_list[1]])
    elif ds.loc[x, col_list[1]]==0:
      merged_score.append(ds.loc[x, col_list[0]])
    else: #neither missing -> use mean
      merged_score.append((ds.loc[x, col_list[0]]+ds.loc[x, col_list[0]])/2)
  ds.drop(col_list, axis = 1)
  return merged_score
  
  #df['new_col] = scoreMergeUncurried(col_list)

In [10]:
ds['merged_critic_score'] = scoreMergeUncurried(critic_score)
ds['merged_user_score'] = scoreMergeUncurried(user_score)
ds['merged_critic_reviews'] = scoreMergeUncurried(critic_reviews)
ds['merged_user_reviews'] = scoreMergeUncurried(user_reviews)

In [11]:
#numberify(?) release year. fill in mean if missing (these aren't highly correlated with the output anyway)

ds['Release Year'] = ds['Release Date'].str.replace("-", "").apply(lambda x: x[-2:])
ds['Release Year'] = [x if x.isnumeric() else np.nan for x in ds['Release Year']]
ds['Release Year'] =  [x+2000 if x<23 and x != np.nan else x+1900 for x in ds['Release Year'].astype('int64')]
ds['Release Year']
ds.columns
#drop release date and month
ds.drop(['Release Date', 'Release Month'], axis = 1)

Unnamed: 0,Ranking,Album,Artist,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews,Release Day,Release Year,...,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,merged_critic_score,merged_user_score,merged_critic_reviews,merged_user_reviews
0,1.0,OK Computer,Radiohead,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,70382,1531,16,1997,...,0.0,0.0,91,12,93,3204,91.0,93.0,12.0,3204.0
1,2.0,Wish You Were Here,Pink Floyd,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,48662,983,12,1975,...,0.0,0.0,100,4,91,1607,100.0,91.0,4.0,1607.0
2,4.0,Kid A,Radiohead,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,58590,734,2,2000,...,8.9,1129.0,85,13,92,2862,85.0,92.0,13.0,2862.0
3,5.0,To Pimp a Butterfly,Kendrick Lamar,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,44206,379,16,2015,...,8.8,3616.0,95,42,93,4530,95.0,93.0,42.0,4530.0
4,6.0,Loveless,My Bloody Valentine,"Shoegaze, Noise Pop","noisy, ethereal, atmospheric, romantic, dense,...",4.24,49887,1223,4,1991,...,0.0,0.0,94,9,91,1634,94.0,91.0,9.0,1634.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2303,4989.0,Abandon All Life,Nails,"Grindcore, Powerviolence","aggressive, heavy, chaotic, angry, misanthropi...",3.64,3864,36,19,2013,...,7.7,7.0,77,11,78,104,77.0,78.0,11.0,104.0
2304,4990.0,Little Dominiques Nosebleed,The Koreatown Oddity,"West Coast Hip Hop, Conscious Hip Hop, Jazz Rap","introspective, concept album, male vocals, sam...",3.66,1965,12,19,2020,...,0.0,0.0,83,4,77,32,83.0,77.0,4.0,32.0
2305,4992.0,I Am,"Earth, Wind & Fire","Disco, Funk","happy, summer, party, energetic, uplifting, lu...",3.68,1108,22,9,1979,...,0.0,0.0,60,1,73,21,60.0,73.0,1.0,21.0
2306,4994.0,Bring the Family,John Hiatt,"Singer/Songwriter, Roots Rock, Americana","love, lethargic, lonely, existential, melodic,...",3.68,727,37,29,1987,...,0.0,0.0,80,2,76,9,80.0,76.0,2.0,9.0


In [12]:
#create list of top 20 most frequent items (adjust number later). the rest goes to "other"
  #example: genre_dict = itemCounts(ds['Genres'])
ds["Genres"].value_counts()[:20]

Progressive Rock                                  42
Thrash Metal                                      35
Hard Bop                                          34
Progressive Metal                                 32
Indie Rock                                        29
Death Metal                                       28
Hard Rock                                         27
Heavy Metal                                       25
Post-Punk                                         21
Alternative Rock                                  21
Punk Rock                                         20
Post-Bop                                          20
Art Rock                                          19
Pop Rock                                          17
East Coast Hip Hop, Boom Bap                      14
East Coast Hip Hop, Boom Bap, Hardcore Hip Hop    14
Melodic Death Metal                               13
Singer/Songwriter, Folk Rock                      13
Blues Rock                                    

In [83]:
#one-hot encoding for entries with multiple items
# 1 if item is in each row entry, 0 if not

#create list of top 20 items by frequency
#input unprocessed col
def makeList(col):
  item_counts =col.value_counts()
  #print(item_counts[:5])
  top_list = item_counts.index[:20].tolist()
  return top_list


 #clean each entry of column
def wordSplit(row):
  new_row = row.split(', ')
  return new_row

#input processed col here
def makeDf(col):
  #dict will go into new df
  item_dict = {}
  for item in top_list:
    item_dict[item] = []
  for item in top_list: #for each item in top_list, append 1 to entry if item is in the row
    item_dict[item] = [1 if (item in set(row)) else 0 for row in set(col)]
    print(item, "+" , item_dict[item])
  item_df = pd.DataFrame.from_dict(item_dict)
  return item_df

In [123]:
#don't encode Genres column. use Genres[0] if Genre is missing

#for each row in ds["Genre"]
#if value is missing
#replace with first value in ds["Genres"]

missing_genre = ds.loc[ds["Genre"].isna()]
for x in missing_genre.index:
  ds.loc[x, 'Genre'] = ds.loc[x, 'Genres'][0]

for x in ds.index:
  if "," in ds.loc[x, 'Genre']:
    ds.loc[x, 'Genre'] = ds.loc[x, 'Genre'].split(',')[0]


Hip Hop              159
Alternative Rock     100
Singer-Songwriter     89
Progressive Rock      89
Indie Rock            75
                    ... 
Dubstep                1
Psychedelic Folk       1
Indietronica           1
Glitch Pop             1
Smooth Soul            1
Name: Genre, Length: 218, dtype: int64

In [47]:
ds[['Genre', 'Genres']]
ds['Genres'].isna().sum() #0.
ds['Genre'].isna().sum() #231 -> drop this column
#also possible to use value in Genre and only use first value from 'Genres' if Genre value is missing

# 'float' object has no attribute 'split': comes from Nan 
#fill nan first
ds['Descriptors']=ds['Descriptors'].fillna("None")

In [None]:
#concatenate genre_df to ds and drop 'Genres'
#combined_ds = pd.concat([ds, genre_df, desc_df], axis = 1)
#combined_ds = combined_ds.drop(['Genres', 'Genre'], axis=1)

In [100]:
ds.loc[ds["Genre"].isna()]

Unnamed: 0,Ranking,Album,Artist,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews,Release Month,...,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,merged_critic_score,merged_user_score,merged_critic_reviews,merged_user_reviews
407,609.0,Don Cherry,Don Cherry,1977,Spiritual Jazz,"psychedelic, hypnotic, spiritual, avant-garde,...",3.91,2618,30,,...,0.0,0.0,90,1,79,9,90.0,79.0,1.0,9.0
415,626.0,Ask the Ages,Sonny Sharrock,6 August 1991,Avant-Garde Jazz,"instrumental, energetic, improvisation, techni...",3.90,2456,44,,...,0.0,0.0,100,1,82,5,100.0,82.0,1.0,5.0
418,629.0,Search for the New Land,Lee Morgan,July 1966,Post-Bop,"instrumental, improvisation, warm, acoustic, p...",3.92,1630,28,July,...,0.0,0.0,100,1,81,5,100.0,81.0,1.0,5.0
480,740.0,Cartola,Cartola,April 1976,Samba-choro,"melancholic, poetic, romantic, male vocals, bi...",3.91,1715,24,April,...,0.0,0.0,90,1,79,32,90.0,79.0,1.0,32.0
481,1054.0,Cartola,Cartola,1974,Samba-choro,"melodic, romantic, poetic, melancholic, bitter...",3.88,870,9,April,...,0.0,0.0,90,1,79,32,90.0,79.0,1.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2281,4941.0,Shine On Brightly,Procol Harum,December 1968,"Progressive Rock, Art Rock","male vocals, progressive, epic, psychedelic, p...",3.67,1482,58,September,...,0.0,0.0,80,1,79,10,80.0,79.0,1.0,10.0
2289,4958.0,Sunrise on the Sufferbus,Masters of Reality,9 February 1993,"Hard Rock, Blues Rock",male vocals,3.69,479,19,January,...,0.0,0.0,80,2,79,6,80.0,79.0,2.0,6.0
2301,4983.0,Demons Dance Alone,The Residents,May 2002,"Art Pop, Experimental Rock","male vocals, female vocals, introspective, mel...",3.67,651,15,May,...,0.0,0.0,80,2,71,7,80.0,71.0,2.0,7.0
2302,4985.0,The True Meaning,Cormega,25 June 2002,"East Coast Hip Hop, Hardcore Hip Hop","introspective, rhythmic, sampling, urban, boas...",3.70,388,9,June,...,0.0,0.0,80,1,78,8,80.0,78.0,1.0,8.0


In [None]:
#data processing: copied from kaggle notebook

X = hotel.copy()
y = X.pop('is_canceled')

X['arrival_date_month'] = \
    X['arrival_date_month'].map(
        {'January':1, 'February': 2, 'March':3,
         'April':4, 'May':5, 'June':6, 'July':7,
         'August':8, 'September':9, 'October':10,
         'November':11, 'December':12}
    )

features_num = [
    "lead_time", "arrival_date_week_number",
    "arrival_date_day_of_month", "stays_in_weekend_nights",
    "stays_in_week_nights", "adults", "children", "babies",
    "is_repeated_guest", "previous_cancellations",
    "previous_bookings_not_canceled", "required_car_parking_spaces",
    "total_of_special_requests", "adr",
]
features_cat = [
    "hotel", "arrival_date_month", "meal",
    "market_segment", "distribution_channel",
    "reserved_room_type", "deposit_type", "customer_type",
]

transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"), # there are a few missing values
    StandardScaler(),
)
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# stratify - make sure classes are evenlly represented across splits
X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, stratify=y, train_size=0.75)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

input_shape = [X_train.shape[1]]

In [None]:
#model

model = keras.Sequential([
    layers.BatchNormalization(input_shape = input_shape),
    layers.Dense(256, activation = "relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation = "relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation = "sigmoid")
]
)


In [None]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ['binary_accuracy'])