In [68]:
import numpy as np
import csv
import musicbrainzngs as mb
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

In [2]:
train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'user_median.csv'
users_file = 'profiles.csv'
artists_file = 'artists.csv'

In [3]:
# required to use mb API
mb.set_useragent(app='cs181practical', version='1.0')

In [4]:
# adds indicator features for given target values
def add_indicators(result, feats, targets):
    info = []
    for (feat, target) in zip(feats, targets):
        if feat in result:
            if result[feat] == target:
                info.append(1)
            else:
                info.append(0)
        else:
            info.append(-1)
    return info

In [5]:
# gets artist info from musicbrainz API
def artist_info(artist_id):
    result = mb.get_artist_by_id(artist_id)
    
    # indicators for group type and US-based
    info = add_indicators(result["artist"], ["type", "type", "country"], ["Group", "Person", "US"])
    
    # when artist started
    try:
        info = info + [int(result.life-span.begin[:4])]
    except:
        # default is missing begin year
        info = info + [-1]
    
    return info

In [15]:
# gets info for all artists from musicbrainz API

artist_data = {}
artist_ids = [""] * 2000
with open(artists_file, 'r', encoding='utf8') as artists_fh:
    artists_csv = csv.reader(artists_fh, delimiter=',')
    next(artists_csv, None)
    artist_counter = 0
    for row in artists_csv:
        artist = row[0]
        name = row[1]
        if not artist in artist_data:
            artist_ids[artist_counter] = artist
            artist_data[artist] = {}
            artist_data[artist]["id"] = artist_counter
            artist_counter = artist_counter + 1
            
            try:
                artist_data[artist]["feats"] = artist_info(artist)
            except:
                artist_data[artist]["feats"] = [-1,-1,-1,-1]

In [102]:
# print artist info (one-time)

with open('artist_data.csv', 'w') as artist_data_file:
    artist_csv = csv.writer(artist_data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    artist_csv.writerow(['id', 'artist','group','person','US','begin'])
    for artist in artist_data:
        row_to_write = [artist_data[artist]["id"], artist] + artist_data[artist]["feats"]
        artist_csv.writerow(row_to_write)

In [None]:
# load artist info from file
artist_data = {}


In [17]:
# collect data on plays per user
user_play_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in itertools.islice(train_csv, 1, 1000):
        user   = row[0]
        artist = row[1]
        plays  = row[2]
        
        artist_id = artist_data[artist]["id"]
    
        if not user in user_play_data:
            user_play_data[user] = {}
        
        user_play_data[user][artist_id] = int(plays)

In [18]:
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in user_play_data.items():
    user_plays = []
    for artist_id, plays in user_data.items():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [93]:
def artists_to_nums():
    artist_count = 0
    artists = {}
    with open("artists.csv", 'r', encoding='utf8') as artists_fh:
        artists_csv = csv.reader(artists_fh, delimiter = ',', quotechar = '"')
        next(artists_csv, None)
        for row in artists_csv:
            artists[row[0]] = artist_count
            artist_count += 1
    return artists, artist_count

In [98]:
def favorite_artists(n):
    artists, artist_count = artists_to_nums()
    users = {}
    with open("train.csv", 'r', encoding='utf8') as train_fh:
        train_csv = csv.reader(train_fh, delimiter = ',', quotechar = '"')
        next(train_csv, None)
        for row in train_csv:
            user = row[0]
            artist = row[1]
            plays = int(row[2])

            if user not in users:
                users[user] = []

            users[user].append((plays, artist))
            users[user].sort(reverse = True)
            
    with open('user_favorites.csv', 'w') as fav_file:
        fav_csv = csv.writer(fav_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        fav_csv.writerow(['user', 'sex','age','country'] + [i for i in range(artist_count)] + ['med'])
        with open("profiles.csv", 'r') as prof_fh:
            prof_csv = csv.reader(prof_fh, delimiter = ',', quotechar = '"')
            next(prof_csv, None)
            for row in prof_csv:
                favorites = [0 for i in range(artist_count)]
                user = row[0]
                for j in range(n):
                    if len(users[user]) > j:
                        artist_name = users[user][j][1]
                        artist_num = artists[artist_name]
                        favorites[artist_num] = 1
                row_to_write = [user_data[user]["sex"], user_data[user]["age"], user_data[user]["country"]] + favorites + [user_data[user]["med_count"]]
                fav_csv.writerow(row_to_write)

In [99]:
favorite_artists(10)

In [19]:
# returns top 3 listened-to artists for a user
def get_top_3(user):
    vals = list(user_play_data[user].values())
    keys = list(user_play_data[user].keys())
    sorted_vals = vals.sort()
    top_3 = []
    for i in range(3):
        try:
            top_3.append(keys[vals.index(sorted_vals[-i])])
        except:
            top_3.append(-1)
    return top_3

In [23]:
# extract user features from profiles.csv

user_data = {}
    
# stores mapping of country strings to int ids
countries = {}
country_counter = 0
# default for missing country names
countries[""] = -1
    
with open(users_file, 'r') as users_fh:
    users_csv = csv.reader(users_fh, delimiter=',', quotechar='"')
    next(users_csv, None)
    for row in users_csv:
        user = row[0]
        sex = row[1]
        age = row[2]
        country = row[3]
            
        # update country ids dict
        if not country in countries:
            countries[country] = country_counter
            country_counter += 1
            
        if not user in user_data:
            user_data[user] = {}
                
            # default for missing sex
            user_data[user]["sex"] = 0
                
            if sex == "f":
                user_data[user]["sex"] = 1
            if sex == "m":
                user_data[user]["sex"] = 2
                
            try:
                user_data[user]["age"] = int(age)
            except:
                # default for missing age
                user_data[user]["age"] = -1
                
            user_data[user]["country"] = countries[country]
            
            try:
                user_data[user]["med_count"] = user_medians[user]
            except:
                user_data[user]["med_count"] = global_median
                
            if user in user_play_data:
                vals = list(user_play_data[user].values())
                keys = list(user_play_data[user].keys())
                sorted_vals = vals.sort()
                top_3 = []
                for i in range(3):
                    try:
                        top_3.append(keys[vals.index(sorted_vals[-i])])
                    except:
                        top_3.append(-1)

                user_data[user]["top1"] = top_3[0]
                user_data[user]["top2"] = top_3[1]
                user_data[user]["top3"] = top_3[2]
            else:
                user_data[user]["top1"] = -1
                user_data[user]["top2"] = -1
                user_data[user]["top3"] = -1
            
            # TODO: proprocess user data with clusters, add feature for assigned cluster

In [60]:
# preprocess user data via clustering

# convert user_data dict to array
users = []
user_counter = 0
for user in user_data:
    # id corresponds to position in X array that we will send to KMeans
    user_data[user]["id"] = user_counter
    
    users.append([user_data[user]["age"], user_data[user]["country"],
                 user_data[user]["top1"], user_data[user]["top2"], user_data[user]["top3"]])
X_users = np.array(users)

In [74]:
kmeans = KMeans(n_clusters=10).fit(X_users)
user_clusters = kmeans.labels_

In [75]:
user_clusters[:50]

array([0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 8, 0, 0, 0, 0, 8, 3, 0, 0, 3, 3, 0, 8,
       0, 5, 5, 0, 5, 0, 5, 5, 0, 0, 0, 5, 5, 0, 5, 3, 5, 5, 0, 0, 5, 3, 5,
       3, 8, 5, 8])

In [30]:
train_data = {}

In [83]:
# Load the training data.
X_train = []
Y_train = []
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in itertools.islice(train_csv, 0, 8000):
        user   = row[0]
        artist = row[1]
        plays  = row[2]
        
        user_id = user_data[user]["id"]
        
        # concat artist features and user features into one row
        x_row = artist_data[artist]["feats"] + X_users[user_id].tolist() + [user_clusters[user_id], user_data[user]["med_count"], user_data[user]["sex"]]
        
        # add row of features to X
        X_train.append(x_row)
        
        # add play count to Y
        Y_train.append(int(plays))
    
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = int(plays)

X_train = np.array(X_train)
Y_train = np.array(Y_train)

In [84]:
len(Y_train)

8000

In [85]:
# hold some data out for validation
X_training, X_validating, Y_training, Y_validating = train_test_split(
    X_train, Y_train, test_size=0.33, random_state=42)

In [86]:
print("learning...")
#model = RandomForestRegressor(n_estimators=50)
model = MLPRegressor()
model.fit(X_training, Y_training)
print("done learning")
print("learning score: ", cross_val_score(model, X_training, Y_training, cv=5))
print("mean absolute error: ", mean_absolute_error(Y_training, model.predict(X_training)))

learning...
done learning
learning score:  [ 0.09052084  0.29871062  0.18784208  0.12236137  0.12487711]
mean absolute error:  203.254664199


In [87]:
print("predicting...")
preds = model.predict(X_validating)
print("mean absolute error: ", mean_absolute_error(Y_validating, preds))


predicting...
mean absolute error:  210.089786014


In [None]:
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, user_medians[user]])
            else:
                print "User", id, "not in training data."
                soln_csv.writerow([id, global_median])