### Retrieving data from postgres

In [20]:
import psycopg2
import numpy as np

import os

print('connecting to postgres...')
# Connection settings are read from the standard PG* environment variables so
# that credentials do not have to live in the notebook. The fallbacks are the
# original local-development values, so the cell still runs unchanged when no
# variables are set.
connection = psycopg2.connect(user=os.environ.get("PGUSER", "max"),
                              password=os.environ.get("PGPASSWORD", "password"),
                              host=os.environ.get("PGHOST", "127.0.0.1"),
                              port=os.environ.get("PGPORT", "5432"),
                              database=os.environ.get("PGDATABASE", "final"))
print()
print(connection)
# cursor is shared by the query cells below
cursor = connection.cursor()

connecting to postgres...

<connection object at 0x7f4088f27d58; dsn: 'user=max password=xxx dbname=final host=127.0.0.1 port=5432', closed: 0>


In [21]:
# NEW QUERY

# Select the tracks that appear in more than 8 rows with a non-null
# lastfm_user (the HAVING threshold below is 8, not 3).
# NOTE(review): the column is spelled `arist_track` in this query; presumably
# that matches the table schema -- confirm before "fixing" the spelling.

q = """
SELECT arist_track FROM lastfm
    GROUP BY arist_track
    HAVING COUNT(lastfm_user) > 8 
;
"""

cursor.execute(q)
# fetchall() returns one tuple per selected track
lastfm_tracks = cursor.fetchall()

In [None]:
# Unwrap the 1-tuples returned by fetchall() into a set of track names.
# This cell rebinds its own input, so entries that are already plain strings
# are kept as they are; without the check, re-running the cell would reduce
# every track name to its first character.
lastfm_tracks = set(track[0] if isinstance(track, tuple) else track
                    for track in lastfm_tracks)

In [None]:
len(lastfm_tracks)

203763

In [None]:
# Pull every row of lastfm; the rows are filtered in Python afterwards.
# NOTE(review): later cells index row[1] as the track and label the columns
# (username, artist_track, count), so SELECT * is assumed to return the
# columns in that order -- confirm against the table schema.
q = """
SELECT * FROM lastfm
;
"""

cursor.execute(q)

In [None]:
# Stream the result set in chunks of 10000 rows and keep only the rows whose
# second column (the track) is one of the selected lastfm_tracks.
lastfm = []
while True:
    batch = cursor.fetchmany(10000)
    if not batch:
        break
    lastfm.extend(row for row in batch if row[1] in lastfm_tracks)

In [None]:
len(lastfm)

In [None]:
len(set(lastfm))

In [None]:
import pandas as pd

columns = [
    'username',
    'artist_track',
    'count'
    ]

df = pd.DataFrame(lastfm, columns=columns)

# generate unique ids for track (numbered in order of first appearance)
unique_artist_tracks = df['artist_track'].unique()

# ix_artist_track: artist_track -> track_ix
ix_artist_track = {track: ix for ix, track in enumerate(unique_artist_tracks)}
# artist_track_ix: track_ix -> artist_track (reverse lookup)
artist_track_ix = {ix: track for track, ix in ix_artist_track.items()}

df['track_ix'] = df['artist_track'].map(ix_artist_track)

# generate unique ids for each username
unique_usernames = df['username'].unique()
# ix_username: username -> username_ix
ix_username = {name: ix for ix, name in enumerate(unique_usernames)}
df['username_ix'] = df['username'].map(ix_username)



In [None]:
df.head()

In [None]:
len(ix_artist_track)

__Note:__

To simplify things for now, and since each of these tracks is a so-called 'top track', I will only use the fact that a track was listened to at all instead of weighting it by how often it was listened to. This makes every value in the last column a 1, which is fine because the problem is formulated sparsely: only the (user, track) pairs that actually occur are stored.

In [None]:
# (row, column, values)

# Column order of the triples used to build the sparse matrix
rcv = ['username_ix', 'track_ix', 'count']
X = df[rcv].astype(int).values
# Collapse play counts to a listened / not-listened flag: non-zero -> 1
X[:, 2] = (X[:, 2] != 0).astype(int)

In [None]:
X[:10]

Let's see what Truncated SVD can do.

In [None]:
from scipy.sparse import csr_matrix
# Build the matrix from (data, (row, col)) triples with tracks on the rows and
# users on the columns.
# csr_matrix((data, (row, col))
row = X[:, 1]  # track_ixs
col = X[:, 0]  # username_ixs
data = X[:, 2] # 1 for every non-zero play count (see the bool cast above)

sparse_mat = csr_matrix((data, (row, col)))

In [None]:
sparse_mat.shape

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

In [None]:
# Fixed seed so the randomized solver produces the same 10-dimensional
# embedding (and therefore the same recommendations) on every run.
tsvd = TruncatedSVD(n_components=10, random_state=42)

tsvd.fit(sparse_mat)

In [None]:
mat_tsvd = tsvd.transform(sparse_mat)

In [None]:
mat_tsvd.shape

### Making Recommendations

I'm going to use cosine similarity to put similar songs and similar users together.


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def top_n_tracks(input_song, n):
    """
    Return the n tracks most similar to input_song.

    input_song is a 1-D latent vector (a row of mat_tsvd)
    n is the number of neighbours to return

    Returns a list of (cosine similarity, track_ix) tuples, most similar
    first.
    """
    # -1 lets numpy infer the width, so this keeps working when the number of
    # SVD components is changed from 10
    input_song = input_song.reshape(1, -1)

    num_songs = len(mat_tsvd)
    similarity_list = sorted(zip(cosine_similarity(input_song, mat_tsvd)[0], range(num_songs)),
                             reverse=True)
    # Skip the first hit: the most similar row is assumed to be the input
    # itself (true when input_song is a row of mat_tsvd and no other row
    # ties with it).
    top_n = similarity_list[1:n+1]
    return top_n

sample_song = mat_tsvd[98451]
top_n_tracks(sample_song, 5)



[(0.992267693335847, 15754),
 (0.9916363243034971, 13012),
 (0.9900208696782189, 50441),
 (0.9896575802415085, 155946),
 (0.989139122455639, 120523)]

In [57]:
cosine_similarity(mat_tsvd[5].reshape(1, 10), mat_tsvd)[0]

array([0.28344554, 0.5222011 , 0.19199564, ..., 0.12615911, 0.26012158,
       0.22486764])

In [39]:
sample_song = mat_tsvd[4]
top_n_tracks(sample_song, 5)

[(0.9885333648109279, 25),
 (0.9845788384997816, 43),
 (0.9845583574335708, 133640),
 (0.9821669835813039, 24),
 (0.9820738661931745, 7256)]

In [53]:
sample_n = 55555
sample_song = mat_tsvd[sample_n]
print(artist_track_ix[sample_n])
print()
# convert ix to artist_track
sim_tracks = [artist_track_ix[song[1]] for song in top_n_tracks(sample_song, 30)]
sim_tracks

Stereolab Chemical Chords



['Super Furry Animals The Very Best Of Neil Diamond',
 'Super Furry Animals Atomik Lust',
 'Mulatu Astatke Yegelle Tezeta',
 'Dungen Du E för Fin för Mig',
 'Stereolab Silver Sands',
 'The Breeders German Studies',
 'Stereolab Daisy Click Clack',
 'Stereolab The Ecstatic Static',
 "Cass McCombs What Isn't Nature",
 'Megapuss Theme from Hollywood',
 'David Byrne and Brian Eno Strange Overtones',
 'Stereolab Three Women',
 'Stereolab Neon Beanbag',
 'King Khan & The Shrines 69 Faces of Love',
 'Pink Mountaintops New Drug Queens',
 'Electrelane Bells',
 'Clinic Tomorrow',
 'King Khan & The Shrines I See Lights',
 'Jeremy Jay Slow Dance',
 'The Breeders No Way',
 'France Gall Baby Pop',
 'Atlas Sound Holiday',
 'The Clientele 6am Morningside',
 'Clinic Free Not Free',
 'Autolux The Science Of Imaginary Solutions',
 'Black Lips Slime & Oxygen',
 'Thurston Moore Fri/End',
 'Super Furry Animals Fire In My Heart',
 'Black Lips Juvenile',
 'はっぴいえんど Kaze Wo Atsumete']

In [46]:
artist_track_ix[4]

'Benoît Pioulard Weird Door'

### Defining a function to find most similar songs

In [23]:
# match lower to lower_key
# use lower_key to match on keys
# --------------

def find_similar_songs(artist_track, n=30):
    """
    Look up a track by name (case-insensitively) and return the names of
    the tracks most similar to it.

    artist_track is string
    n is how many similar tracks to return (default 30)

    Returns a list of artist_track strings, or the string
    'Not in the thing' when the track is not in ix_artist_track.
    """
    assert isinstance(artist_track, str)

    track_lower = artist_track.lower()

    # One pass over the catalogue instead of a membership scan followed by a
    # second lookup scan. If several names differ only by case, the last one
    # seen wins.
    song_ix = None
    for song, ix in ix_artist_track.items():
        if track_lower == song.lower():
            song_ix = ix

    if song_ix is None:
        return 'Not in the thing'

    song_vec = mat_tsvd[song_ix]

    # convert ix to artist_track
    sim_tracks = [artist_track_ix[sim_song[1]] for sim_song in top_n_tracks(song_vec, n)]
    return sim_tracks

In [146]:
find_similar_songs('st. vincent laughing with a mouth of blood')

['St. Vincent The Party',
 'St. Vincent The Strangers',
 'St. Vincent Save Me From What I Want',
 'St. Vincent The Bed',
 'St. Vincent Black Rainbow',
 'St. Vincent The Neighbors',
 'St. Vincent Actor Out of Work',
 'St. Vincent Human Racing',
 'St. Vincent Just the Same But Brand New',
 'St. Vincent Marrow',
 'St. Vincent The Sequel',
 'St. Vincent What Me Worry?',
 'St. Vincent Your Lips Are Red',
 'St. Vincent Now, Now',
 'St. Vincent The Apocalypse Song',
 'St. Vincent All My Stars Aligned',
 'St. Vincent Jesus Saves, I Spend',
 'St. Vincent We Put a Pearl in the Ground',
 'St. Vincent Landmines',
 'Joanna Newsom Good Intentions Paving Company',
 'Jay Z & Kanye West Otis (Feat. Otis Redding)',
 'Owen Pallett Lewis Takes Action',
 'St. Vincent Marry Me',
 "Joanna Newsom '81",
 'Owen Pallett Scandal At The Parkade',
 'Owen Pallett Lewis Takes Off His Shirt',
 'Dirty Projectors I Dreamed I Saw St. Augustine',
 'Owen Pallett Midnight Directives',
 'Dirty Projectors Stillness Is the Mov

### Finding similar users

In [41]:
# find similarity between user and songs
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs
# Factor the same track-by-user matrix with scipy's sparse SVD, keeping k=10
# components. The matrix is cast to float first -- presumably because svds
# does not accept the integer matrix; confirm.
csc_mat = csc_matrix(sparse_mat.astype(float))
# u has one row per matrix row (tracks) and t has one column per matrix
# column (users); s holds the k singular values.
u, s, t = svds(csc_mat, k=10)


In [152]:
tracks = u
users = t.T


def get_recommendations_by_user(user, n=1):
    """
    Recommend tracks to a user based on the single most similar user.

    user is index (a username_ix)
    n is how many similar users to look up; only the closest one is used
    to build the recommendations

    Returns a list of artist_track strings that the closest user has
    listened to and `user` has not. Returns an empty list when no similar
    user is found (e.g. n < 1).
    """

    def top_users(input_user, n):
        # -1 lets numpy infer the width instead of hard-coding 10 components
        input_user = input_user.reshape(1, -1)

        num_users = len(users)
        similarity_list = sorted(zip(cosine_similarity(input_user, users)[0], range(num_users)),
                                 reverse=True)
        top_n = similarity_list[1:n+1] # the most similar will always be itself
        return top_n

    user_info = users[user]

    # keep only the user indices of the nearest neighbours
    sim_users = [sim_user[1] for sim_user in top_users(user_info, n)]
    if not sim_users:
        return []

    # let's implement just recommending the songs for the most similar user
    closest_user = sim_users[0]

    closest_user_tracks = set(df[df.username_ix == closest_user].track_ix.values)
    user_tracks = set(df[df.username_ix == user].track_ix.values)
    # tracks the neighbour has that this user does not
    recommendations = closest_user_tracks - user_tracks
    # convert ix to artist_track
    recs = [artist_track_ix[rec] for rec in recommendations]

    return recs

sorted(get_recommendations_by_user(90))[100:130]


['Eric Hutchinson Back To Where I Was',
 'Eric Hutchinson Food Chain',
 "Eric Hutchinson It Hasn't Been Long Enough",
 "Eric Hutchinson OK, It's Alright With Me",
 'Eric Hutchinson Oh!',
 'Eric Hutchinson Outside Villanova',
 "Eric Hutchinson Rock 'N' Roll",
 "Eric Hutchinson You Don't Have To Believe Me",
 'Evan and Jaron Crazy For This Girl',
 'Fleetwood Mac Go Your Own Way',
 'Flo Rida Right Round',
 'Flo Rida Wild Ones - feat. Sia',
 'George Michael Faith',
 'George Michael One More Try',
 'Get Set Go I Hate Everyone',
 'Get Set Go Wait',
 'Graham Colton Band First Week',
 'Graham Colton Best Days',
 'Guster Amsterdam',
 'Guster Barrel of a Gun',
 'Guster Careful',
 'Guster Fa Fa',
 'Guster Keep It Together',
 'Guster One Man Wrecking Machine',
 'Guster Perfect',
 'Guster Satellite',
 'Guster Two Points for Honesty',
 'Hellogoodbye Here In Your Arms',
 'Hilary Duff Why Not',
 'Hot Hot Heat Goodnight Goodnight']

In [1]:
# compare the songs that are recommended with the songs the user already listens to
def eval_user(ix):
    """
    Print the recommendations for a user next to the songs they already
    listen to, for eyeballing the quality of the recommendations.

    ix is the user's username_ix
    """
    # The function is named get_recommendations_by_user; the previous
    # spelling here (get_reccomendations_by_user) raised NameError.
    sim_songs = set(get_recommendations_by_user(ix, 100))

    print('RECOMMENDATIONS')
    print('=================')
    for song in sorted(sim_songs):
        print(song)
    print()
    print('User\'s Top Songs')
    print('==================')
    print(df[df.username_ix == ix].artist_track.sort_values().values)
    
#     # find songs in common
#     mask = ((df.username_ix == ix) & (df.artist_track.apply(lambda x: x in sim_songs)))
    
#     print()
#     print('Recommendations already listened to')
#     print('===================================')
#     print(df[mask].artist_track)
#     print('\n\n')

In [111]:
eval_user(567)

RECOMMENDATIONS
6th Borough Project Do It to the Max
Adrian Johnston Always summer
An April March Chick Habit
Anoraak Nightdrive With You (Fear of Tigers remix)
Appaloosa The Day (We Fell In Love)
Astrud Gilberto Photograph
Au Palais Tender Mercy
Azure Blue The Catcher in the Rye
Baxter Dury Cocaine Man
Boom Clap Bachelors Løb Stop Stå (feat. Coco O.)
Boy Friend Lazy Hunter
Boy Friend The False Cross
Burning Hearts Into the Wilderness
Bye Bye Bicycle Haby Bay
Bye Bye Bicycle Navigation
Ceo Everything Is Gonna Be Alright
Chromatics I'm On Fire
CocoRosie Lemonade - Edit
Cold Mailman Pull Yourself Together And Fall In Love With Me
College Feat. Electric Youth A Real Hero
Death in Vegas Hands Around My Throat - D.I.V.
Desire Dans Mes Reves
Desire Don't Call
Electric Youth WeAreTheYouth
Erasure Blue Savannah
Esben and the Witch Skeleton Swoon
FM Belfast I Don't Want To Go To Sleep Either
FM Belfast Lotus
FM Belfast Pump
FM Belfast VHS
Fan Death Cannibal
Gepe Victoria Roma
Groove Armada Sham

In [83]:
cosine_similarity()

Unnamed: 0,username,artist_track,count,track_ix,username_ix
0,PornGroove,Tame Impala Desire Be Desire Go,34,0,0
1,PornGroove,Tame Impala It's Not Meant To Be,34,1,0
2,PornGroove,Tame Impala Why Won't They Talk To Me?,34,2,0
3,PornGroove,Air Eat My Beat,33,3,0
4,PornGroove,Benoît Pioulard Weird Door,33,4,0


In [143]:
cosine_similarity(t.T[0].reshape(1, -1), t.T[39].reshape(1, -1))

array([[0.73556484]])