In [130]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.externals import joblib

In [131]:
def video_subgenres(video_id, data=None):
    """Return the set of lower-cased subgenres stored for one video.

    The row(s) whose ``Xid`` equals ``video_id`` are looked up and the
    ``Subgenres_list`` cell of the first match is parsed. The cell is
    expected to hold a stringified list such as
    ``"['Classic Rock', 'Pop General']"``.

    Parameters
    ----------
    video_id
        Value to match against the ``Xid`` column.
    data : pandas.DataFrame, optional
        Frame with ``Xid`` and ``Subgenres_list`` columns. When omitted, the
        module-level ``video_data`` frame is used.

    Returns
    -------
    set of str
        Subgenre names, lower-cased with surrounding single quotes removed.
        Empty when the video is unknown, when its cell is missing (NaN), or
        when the stored list is empty.
    """
    frame = video_data if data is None else data
    cells = frame[frame['Xid'] == video_id].Subgenres_list.values
    if cells.size == 0:
        return set()

    raw = cells[0]
    # A blank CSV cell is read back as NaN (a float), which has no .strip().
    if pd.isnull(raw):
        return set()

    names = (item.strip("'").lower() for item in raw.strip("[]").split(", "))
    # "[]" splits into [''], so drop empty names instead of returning {''}.
    return set(name for name in names if name)


def user_subgenres(liked_videos):
    """Collect every subgenre that appears on any of the given videos.

    Each id in ``liked_videos`` is passed to ``video_subgenres`` and the
    resulting sets are merged. An empty ``liked_videos`` yields an empty set.
    """
    per_video = [video_subgenres(liked_id) for liked_id in liked_videos]
    return set().union(*per_video)
  
    
def user_vector(slist, sliked):
    """Build a 0/1 indicator vector over ``slist``.

    Position ``i`` of the returned float array is 1 when ``slist[i]`` is
    contained in ``sliked`` and 0 otherwise, so the result has the same
    length and order as ``slist``.
    """
    flags = [1.0 if name in sliked else 0.0 for name in slist]
    return np.array(flags, dtype=float)


def normalize_vector(user_vector):
    """Scale a vector by the number of its non-zero entries.

    For a 0/1 indicator vector this makes the entries sum to 1.

    Parameters
    ----------
    user_vector : sequence of numbers (list or 1-D numpy array)

    Returns
    -------
    list
        One entry per input element, each divided by the non-zero count.
        A vector with no non-zero entries is returned unchanged (as a list),
        because dividing by a zero count would produce NaNs or raise
        ``ZeroDivisionError``.
    """
    n = np.count_nonzero(user_vector)
    if n == 0:
        return list(user_vector)
    # Divide by a float so integer input is not truncated by Python 2's
    # integer division.
    scale = float(n)
    return [d / scale for d in user_vector]

def write_output(cluster):
    """Export the playlist for ``cluster`` to ``output.json``.

    The playlist is read from ``playlists/<cluster>_playlist.tsv``, or from
    the ``zero_likes`` playlist when ``cluster`` is -1. The file is parsed as
    a header-less, tab-separated table with the columns artist, title and
    url, and is written out as a JSON array of records.
    """
    label = "zero_likes" if cluster == -1 else str(cluster)
    playlist = pd.read_csv(
        "playlists/" + label + "_playlist.tsv",
        sep='\t',
        names=['artist', 'title', 'url']
    )
    playlist.to_json('output.json', orient='records')
    
def read_user_likes(filename):
    """Return the ``item_id`` values the user rated 1 in a quiz JSON file.

    Parameters
    ----------
    filename : str
        Path of a JSON file that pandas can read into a table with
        ``item_id`` and ``rating`` columns.

    Returns
    -------
    list
        The liked item ids as plain Python values, in file order. Empty when
        the file holds no records or no record has ``rating == 1``.
    """
    ratings = pd.read_json(filename)
    # A file with no records yields a frame without any columns; indexing
    # ratings['rating'] would then raise KeyError instead of reporting
    # "no likes" to the caller.
    if ratings.empty:
        return []
    liked = ratings.loc[ratings['rating'] == 1, 'item_id']
    # tolist() converts numpy scalars to native Python values.
    return liked.tolist()

In [134]:
# Subgenres used as feature dimensions: position i in this list is index i of
# the vector built by user_vector(), so the order of the entries is significant.
# NOTE(review): presumably this order must match the one used when the model
# in kmeans.pkl was trained -- confirm before reordering or adding entries.
Subgenre_list = [
    'alternative rock', 
    'alternative/indie general', 
    'blues general', 
    'classic rock', 
    'country general', 
    'dance pop', 
    'dancehall', 
    'edm', 
    'funk', 
    'hard rock/metal general', 
    'house', 
    'indie pop', 
    'jazz general', 
    'latin general', 
    'pop general', 
    'punk', 
    'r&b/soul general', 
    'rap/hip-hop general', 
    'rock general', 
    'trap'
]

# Per-video metadata; video_subgenres() reads its 'Xid' and 'Subgenres_list'
# columns.
video_data = pd.read_csv("video_data_reduced.csv")
# Saved model whose predict() is called on the normalized user vector.
# NOTE(review): joblib.load unpickles the file, so kmeans.pkl must come from a
# trusted source.
model = joblib.load('kmeans.pkl') 

print 'Subgenre list:'
print Subgenre_list  

Subgenre list:
['alternative rock', 'alternative/indie general', 'blues general', 'classic rock', 'country general', 'dance pop', 'dancehall', 'edm', 'funk', 'hard rock/metal general', 'house', 'indie pop', 'jazz general', 'latin general', 'pop general', 'punk', 'r&b/soul general', 'rap/hip-hop general', 'rock general', 'trap']


In [139]:
# Item ids the user rated 1 in the quiz file.
user_liked_videos = read_user_likes("quiz.json")


# No likes at all: export the fallback "zero_likes" playlist, then abort the
# cell so that none of the clustering steps below run.
if(user_liked_videos == []):
    write_output(-1)
    raise ValueError("No liked videos")
     

print 'User likes:', user_liked_videos
# Number of video_data rows whose Xid is one of the liked ids.
print 'User likes intersection:', video_data[video_data['Xid'].isin(user_liked_videos)].shape[0]

# Union of the subgenres of all liked videos found in video_data.
user_liked_subgenres = user_subgenres(user_liked_videos)

print 'User liked subgenres:'
print user_liked_subgenres


print 'User vector:'
# 0/1 indicator over Subgenre_list, in list order.
# NOTE(review): the vector is all zeros when no liked video carries a subgenre
# from Subgenre_list; confirm that normalization and prediction behave
# sensibly in that case.
vector = user_vector(Subgenre_list, user_liked_subgenres)
print vector

print 'Normalized vector:'
# reshape(1, -1) turns the vector into a single-row 2-D array, the shape
# passed to model.predict below.
normalized_vector = np.array(normalize_vector(vector)).reshape(1, -1)


print normalized_vector

# predict() returns one label per input row; there is exactly one row here.
cluster = model.predict(normalized_vector)[0]
print 'Prediction:'
print cluster

print 'Writing output...'
# Export the playlist of the predicted cluster to output.json.
write_output(cluster)
print 'DONE'
# Same message in Russian ("done").
print 'СДЕЛАНО'

User likes: [558629, 862827, 865279, 870140, 843745, 842781, 862565, 862098]
User likes intersection: 1
User liked subgenres:
set(['classic rock'])
User vector:
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
Normalized vector:
[[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
Prediction:
1
Writing output...
DONE
СДЕЛАНО
