In [61]:
import numpy as np
import csv
import musicbrainzngs as mb
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
import math

In [2]:
# Input CSV files (relative paths, resolved against the working directory).
train_file = "train.csv"
test_file = "test.csv"
users_file = "profiles.csv"
artists_file = "artists.csv"
# Artist table extended with play statistics and a trailing genre column.
artist_genre_file = "temp_extended_artists.csv"

# File the predictions are written to.
soln_file = "user_median.csv"

In [3]:
# The MusicBrainz client needs a user agent registered before the API is used.
mb.set_useragent(
    app='cs181practical',
    version='1.0',
)

In [4]:
# Memo table filled by genre_dist: genre -> (median, std dev) of total plays.
genre_distributions = dict()

In [58]:
# get (median, standard deviation) of artists' total plays for one genre
def genre_dist(genre):
    """Return ``(median, std_dev)`` of total plays over all artists of a genre.

    Results are memoised in the module-level ``genre_distributions`` dict
    (keyed by the ``genre`` value passed in), so the artist CSV is scanned at
    most once per distinct argument.

    Parameters
    ----------
    genre : str or int
        Genre label as found in column 12 of ``temp_extended_artists.csv``.
        Integer labels are accepted and compared by their string form, because
        ``csv.reader`` yields every field as text.

    Returns
    -------
    tuple
        ``(median, std_dev)`` of column 11 (total plays) over the matching
        rows, or ``(nan, nan)`` when no row carries this genre.

    Raises
    ------
    ValueError
        If a matching row's total-plays field is not an integer literal.
    """
    if genre in genre_distributions:
        return genre_distributions[genre]

    # Normalise so that an int label such as 3 matches the CSV text "3".
    genre_key = str(genre)
    plays = []
    with open("temp_extended_artists.csv", 'r', newline='') as extended_artists:
        tables = csv.reader(extended_artists, delimiter=',', quotechar='"')
        next(tables, None)  # skip the header row
        for row in tables:
            if row[12] == genre_key:
                plays.append(int(row[11]))

    if plays:
        dist = (np.median(plays), np.std(plays))
    else:
        # np.median / np.std on an empty list also give nan, but emit
        # RuntimeWarnings; return the same value explicitly instead.
        dist = (np.nan, np.nan)

    genre_distributions[genre] = dist
    return dist

In [6]:
# Memo table filled by artist_dist: artist id -> score within its genre.
artist_scores = dict()

In [54]:
def artist_dist(artist_id):
    """Return the artist's total-play z-score relative to its genre.

    The score is ``(total_plays - genre_median) / genre_std_dev``, with the
    genre statistics taken from ``genre_dist``. Every result, including the
    "not found" result, is memoised in the module-level ``artist_scores``
    dict so the CSV is scanned at most once per artist.

    Parameters
    ----------
    artist_id : str
        Artist identifier as found in column 1 of
        ``temp_extended_artists.csv``.

    Returns
    -------
    float or int
        The z-score; ``0`` when the genre's standard deviation is zero; and
        ``nan`` when the artist does not appear in the file (previously this
        case fell through and returned ``None``). Detect it with
        ``math.isnan``.
    """
    if artist_id in artist_scores:
        return artist_scores[artist_id]

    # nan marks "no score available" and is what gets cached for a miss.
    score = float('nan')
    with open("temp_extended_artists.csv", 'r', newline='') as extended_artists:
        tables = csv.reader(extended_artists, delimiter=',', quotechar='"')
        next(tables, None)  # skip the header row
        for row in tables:
            if row[1] != artist_id:
                continue
            genre_median, genre_dev = genre_dist(row[12])
            total_plays = int(row[11])
            if genre_dev == 0:
                # no spread within the genre: avoid dividing by zero
                score = 0
            else:
                score = (total_plays - genre_median) / genre_dev
            break  # the first matching row decides the score

    artist_scores[artist_id] = score
    return score

In [40]:
# Build a lookup table: artist id -> integer genre label.
# Columns of the artist file:
#   id,artist,group,person,US,begin,median,mean,min_plays,max_plays,std_dev_of_plays,total_plays,genre

artist_genre_map = {}
with open(artist_genre_file, 'r') as artist_genre_fh:
    artist_genre_csv = csv.reader(artist_genre_fh, delimiter=',', quotechar='"')
    next(artist_genre_csv, None)  # skip the header row
    for row in artist_genre_csv:
        artist, genre = row[1], int(row[-1])
        # the first genre seen for an artist is the one that is kept
        artist_genre_map.setdefault(artist, genre)

In [41]:
# Load the training rows and group play counts as
#   train_data[user][genre]["list"] -> [plays, ...]

train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)  # skip the header row
    for row in train_csv:
        user, artist, plays = row[0], row[1], row[2]
        genre = artist_genre_map[artist]

        # create the nested containers on first sight, then record the count
        genre_entry = train_data.setdefault(user, {}).setdefault(genre, {})
        genre_entry.setdefault("list", []).append(int(plays))

In [42]:
# Summarise every (user, genre) play list with its median and standard deviation.

for user, genres in train_data.items():
    for genre, stats in genres.items():
        counts = np.array(stats["list"])
        stats["med"] = np.median(counts)
        stats["std_dev"] = np.std(counts)

In [43]:
# Per-user median and std dev over all of that user's play counts (used when a
# user-genre pair hasn't been encountered), plus the median of every play count.

plays_array  = []
user_medians = {}
user_stds = {}
for user, genres in train_data.items():
    # flatten the user's per-genre lists, keeping genre order
    user_plays = [count for stats in genres.values() for count in stats["list"]]
    plays_array.extend(user_plays)

    user_medians[user] = np.median(np.array(user_plays))
    user_stds[user] = np.std(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [65]:
# given a z score, user, and artist, output an appropriate prediction

def predict_from_z(z, user, artist):
    """Turn an artist z-score into a play-count prediction for ``user``.

    Fallback order:

    1. ``user`` has no training data -> ``global_median``.
    2. ``z`` is ``nan`` (or ``None``) -> the user's median for the artist's
       genre if known, otherwise the user's overall median.
    3. the user has plays in the artist's genre ->
       ``int(std_dev * z + med)`` using that user-genre's statistics.
    4. otherwise -> ``int(user_stds[user] * z + user_medians[user])``.

    Parameters
    ----------
    z : float or None
        Artist score as produced by ``artist_dist``.
    user : str
        User identifier (key of ``train_data``).
    artist : str
        Artist identifier (key of ``artist_genre_map``).

    Returns
    -------
    number
        The predicted play count. Branches 3 and 4 truncate to ``int``;
        branches 1 and 2 return the stored median unchanged.
    """
    user_genres = train_data.get(user)
    if user_genres is None:
        # User never seen in training: there are no per-user statistics to scale z.
        return global_median

    # An artist missing from artist_genre_map yields None here; that is not a
    # genre key, so the lookup misses and the user-level statistics are used.
    genre_stats = user_genres.get(artist_genre_map.get(artist))

    if z is None or math.isnan(z):
        if genre_stats is not None:
            return genre_stats["med"]
        return user_medians[user]

    # known user-genre association
    if genre_stats is not None:
        return int((genre_stats["std_dev"] * z) + genre_stats["med"])

    return int((user_stds[user] * z) + user_medians[user])

In [45]:
def predict(user, artist):
    """Predict ``user``'s play count for ``artist`` from the artist's score."""
    return predict_from_z(artist_dist(artist), user, artist)

In [67]:
# Write one prediction per test row to the solution file as (Id, plays).
soln_file = "distributions_1.csv"
with open(test_file, 'r', newline='') as test_fh, \
        open(soln_file, 'w', newline='') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)  # skip the header row

    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"')
    soln_csv.writerow(['Id', 'plays'])

    for row in test_csv:
        # row_id rather than id, so the builtin is not shadowed for later cells
        row_id, user, artist = row[0], row[1], row[2]

        pred = predict(user, artist)

        # negative predictions are clamped to zero
        if pred < 0:
            pred = 0
        soln_csv.writerow([row_id, pred])