# Provenance: GitHub snapshot, commit 8af198f (Jan 9, 2017), 129 lines (99 sloc), 4.3 KB
"""
MATRIX FACTORIZATION & DIMENSIONALITY REDUCTION
Case study: Recommending Products
Models:
Collaborative filtering
Matrix factorization
PCA
Algorithms:
Coordinate descent
Eigen decomposition
SVD
Concepts:
Matrix completion, eigenvalues, random projections, cold-start problem, diversity, scaling up
"""
import os
from math import sqrt
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
def load_music_data(file_name):
    """Load the song-ratings CSV at `file_name` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Path to a local CSV file.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If `file_name` does not exist. (The original silently returned
        None, which surfaced later as a confusing AttributeError.)
    """
    if not os.path.exists(file_name):
        raise FileNotFoundError(f"music data file not found: {file_name}")
    print(f"-- {file_name} found locally")
    return pd.read_csv(file_name)
def values_to_map_index(values):
    """Map each value in `values` to its position (0-based, first-seen order).

    Duplicate values keep the index of their last occurrence, matching the
    original loop's overwrite behavior.
    """
    return {val: idx for idx, val in enumerate(values)}
def print_most_popular_songs(song):
    """Print each vocabulary word and its summed count across the rows of `song`.

    Parameters
    ----------
    song : 2-D array-like
        Document-term count matrix; column order must match the vectorizer
        vocabulary.

    NOTE(review): this relies on a module-level ``vectorizer`` (presumably a
    fitted CountVectorizer) that is NOT defined anywhere in this file, so
    calling it as-is raises NameError -- wire up the vectorizer before use.
    """
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print("Words in vocabulary:", vocab)
    # Sum up the counts of each vocabulary word over all rows
    dist = np.sum(song, axis=0)
    # For each, print the number of times the word appears in the training set
    print("Words frequency...")
    for tag, count in zip(vocab, dist):
        print(count, tag)
def predict(ratings, similarity, type='user'):
    """Memory-based collaborative-filtering prediction.

    Parameters
    ----------
    ratings : ndarray, shape (n_users, n_items)
        User-item rating matrix (0 = unrated).
    similarity : ndarray
        Pairwise weight matrix: (n_users, n_users) for type='user',
        (n_items, n_items) for type='item'.
    type : str
        'user' for user-based CF (mean-centered), 'item' for item-based CF.

    Returns
    -------
    ndarray
        Predicted ratings, same shape as `ratings`.

    Raises
    ------
    ValueError
        If `type` is neither 'user' nor 'item'. (The original fell through
        and raised UnboundLocalError on `pred`.)
    """
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis keeps mean_user_rating broadcastable against ratings
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array(
            [np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the observed (nonzero) entries of `ground_truth`."""
    observed_mask = ground_truth.nonzero()
    observed = ground_truth[observed_mask].flatten()
    predicted = prediction[observed_mask].flatten()
    return sqrt(mean_squared_error(predicted, observed))
if __name__ == "__main__":
# Load music data
song_data = load_music_data("song_data.csv")
# Reduce complexity by getting first n elements
n = 10000
song_data = song_data.head(n)
user_idx = values_to_map_index(song_data.user_id.unique())
song_idx = values_to_map_index(song_data.song_id.unique())
print "-- Explore data"
print song_data.head()
print "-- Showing the most popular songs in the dataset"
unique, counts = np.unique(song_data["song"], return_counts=True)
popular_songs = dict(zip(unique, counts))
df_popular_songs = pd.DataFrame(popular_songs.items(), columns=["Song", "Count"])
df_popular_songs = df_popular_songs.sort_values(by=["Count"], ascending=False)
print df_popular_songs.head()
n_users = song_data.user_id.unique().shape[0]
n_items = song_data.song_id.unique().shape[0]
print "Number of users = " + str(n_users) + " | Number of songs = " + str(n_items)
train_data, test_data = train_test_split(song_data, test_size=0.25)
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
train_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
test_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3]
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
sparsity = round(1.0 - len(song_data) / float(n_users * n_items), 3)
print 'The sparsity level is ' + str(sparsity * 100) + '%'
# get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix))