This example shows how a recommendation engine based on matrix factorization can be implemented in MemSQL.

Prerequisites
To run code in this notebook you'll need:

A MemSQL instance. You can get a free trial cloud cluster at https://portal.memsql.com/

In [1]:
import csv
import base64
import struct
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from memsql.common import database

from lib import memsql_csv

# Fetching dataset 

In [2]:
# Fetch the MovieLens 100k dataset; ratings of at least 4 (out of 5) count as positive.
data = fetch_movielens(min_rating=4.0)
print("Successfully loaded dataset")
train_shape = data['train'].shape
print("Data shape: ", train_shape)

Successfully loaded dataset
Data shape:  (943, 1682)


# Matrix factorisation

In [3]:
# Instantiate and train the model.
# Training is stochastic, so the seed is pinned to make "Restart & Run All" reproduce
# the same embeddings (and therefore the same recommendation scores) on every run.
# Note: outputs recorded from an earlier, unseeded run will differ numerically.
RANDOM_SEED = 42
model = LightFM(loss='warp', random_state=RANDOM_SEED)
model.fit(data['train'], epochs=30)

# Sanity check: there must be exactly one bias per embedding row, because the export
# cells pair them up by index.
assert len(model.user_biases) == model.user_embeddings.shape[0]
assert len(model.item_biases) == model.item_embeddings.shape[0]
print("Factorisation successful")

Factorisation successful


In [4]:
# Report the dimensions of the learned factors and the number of bias terms.
for label, value in (
    ("User matrix shape: ", model.user_embeddings.shape),
    ("User biases count: ", len(model.user_biases)),
    ("Movie matrix shape: ", model.item_embeddings.shape),
    ("Movie biases count: ", len(model.item_biases)),
):
    print(label, value)

User matrix shape:  (943, 10)
User biases count:  943
Movie matrix shape:  (1682, 10)
Movie biases count:  1682


# Connecting to MemSQL
Specify the host, port, user and password of your MemSQL instance in the cell below in order to connect.

In [5]:
# Connection settings; replace the host placeholder with your own instance before running.
memsql_host = "<enter your memsql host>"
memsql_port = "3306"
memsql_user = "root"
memsql_password = ""

connection_settings = {
    'host': memsql_host,
    'port': memsql_port,
    'user': memsql_user,
    'password': memsql_password,
}
memsql_conn = database.connect(**connection_settings)

# Make sure the working database exists, then select it for the rest of the notebook.
for statement in ('CREATE DATABASE IF NOT EXISTS testrec', 'USE testrec'):
    memsql_conn.query(statement)

# Uploading data to MemSQL

In [6]:
# Field delimiter and line terminator (presumably for the CSV export files; same
# values as the ones the export cells pass to csv.DictWriter).
delim = ','
lineend = '\n'

In [7]:
# Export one CSV row per user: id, hex-encoded embedding, an empty placeholder for the
# BLOB column (filled in later inside MemSQL via UNHEX), and the user's bias.
#
# newline='' is what the csv module requires for writer targets; without it the '\n'
# terminator is translated to '\r\n' on Windows. The encoding is pinned so the file
# does not depend on the platform default (the movies export already does this).
with open('ex5_users.csv', mode='w', encoding='utf-8', newline='') as users_file:
    # NOTE(review): delim/lineend come from the cell above and currently equal ',' and
    # '\n'; the format expected by memsql_csv is not visible here, so keep them in sync.
    user_writer = csv.DictWriter(users_file, ['id', 'feature_vector_hex', 'feature_vector', 'bias'],
                                 delimiter=delim, lineterminator=lineend)
    user_writer.writeheader()
    for i, (embedding, bias) in enumerate(zip(model.user_embeddings, model.user_biases)):
        # '<' forces little-endian, unpadded 4-byte floats regardless of the host CPU;
        # MemSQL's DOT_PRODUCT reads vectors packed this way (see vendor docs).
        features_hex = struct.pack(f'<{len(embedding)}f', *embedding).hex()
        user_writer.writerow({'id': i,
                              'feature_vector_hex': features_hex,
                              'feature_vector': '',
                              'bias': bias
                             })

In [8]:
# Export one CSV row per movie: id, hex-encoded embedding, an empty placeholder for the
# BLOB column (filled in later inside MemSQL via UNHEX), and the movie's bias.
#
# newline='' is what the csv module requires for writer targets; without it the '\n'
# terminator is translated to '\r\n' on Windows.
with open('ex5_movies.csv', mode='w', encoding='utf-8', newline='') as movies_file:
    # NOTE(review): delim/lineend come from an earlier cell and currently equal ',' and
    # '\n'; the format expected by memsql_csv is not visible here, so keep them in sync.
    movie_writer = csv.DictWriter(movies_file, ['id', 'feature_vector_hex', 'feature_vector', 'bias'],
                                  delimiter=delim, lineterminator=lineend)
    movie_writer.writeheader()
    for i, (embedding, bias) in enumerate(zip(model.item_embeddings, model.item_biases)):
        # '<' forces little-endian, unpadded 4-byte floats regardless of the host CPU;
        # MemSQL's DOT_PRODUCT reads vectors packed this way (see vendor docs).
        features_hex = struct.pack(f'<{len(embedding)}f', *embedding).hex()
        movie_writer.writerow({'id': i,
                               'feature_vector_hex': features_hex,
                               'feature_vector': '',
                               'bias': bias
                              })

In [9]:
# Recreate the users table, load the exported CSV through the project helper, then
# decode the hex column into the binary feature_vector column.
users_ddl = 'CREATE TABLE users (id INT UNSIGNED, feature_vector_hex CHAR(200), feature_vector BLOB, bias DOUBLE)'
users_decode_sql = 'UPDATE users SET feature_vector=UNHEX(feature_vector_hex)'

memsql_conn.query('DROP TABLE IF EXISTS users')
memsql_conn.query(users_ddl)
memsql_csv.load_csv_to_existing_table('ex5_users.csv', 'users', memsql_conn)
# Kept as the cell's last expression so the query's return value is displayed.
memsql_conn.query(users_decode_sql)

943

In [10]:
# Recreate the movies table, load the exported CSV through the project helper, then
# decode the hex column into the binary feature_vector column.
movies_ddl = 'CREATE TABLE movies (id INT UNSIGNED, feature_vector_hex CHAR(200), feature_vector BLOB, bias DOUBLE)'
movies_decode_sql = 'UPDATE movies SET feature_vector=UNHEX(feature_vector_hex)'

memsql_conn.query('DROP TABLE IF EXISTS movies')
memsql_conn.query(movies_ddl)
memsql_csv.load_csv_to_existing_table('ex5_movies.csv', 'movies', memsql_conn)
# Kept as the cell's last expression so the query's return value is displayed.
memsql_conn.query(movies_decode_sql)

1682

# Cross validation: comparing MemSQL scores with LightFM predictions

In [11]:
# Find the first five recommendations for the user with id 0 in MemSQL.
# Score = dot product of the user and movie vectors plus both bias terms.
top_five_sql = (
    'SELECT movies.id, '
    'DOT_PRODUCT(users.feature_vector, movies.feature_vector) + users.bias + movies.bias as score '
    'from users JOIN movies '
    'WHERE users.id = 0 '
    'ORDER BY score DESC limit 5 '
)
memsql_conn.query(top_five_sql)

[Row({'id': 99, 'score': -1.5214905691525045}),
 Row({'id': 49, 'score': -1.6062639878777538}),
 Row({'id': 55, 'score': -1.9928937136588354}),
 Row({'id': 167, 'score': -2.032031638431872}),
 Row({'id': 126, 'score': -2.1413611450991814})]

In [12]:
# Find the first five recommendations for the user with id 0 using the Python module,
# for comparison with the ranking computed inside MemSQL.
all_item_ids = list(range(len(model.item_biases)))
pred = model.predict(user_ids=[0], item_ids=all_item_ids)

# Pair each score with its item index and order by descending score.
ranked = sorted(enumerate(pred), key=lambda it: -it[1])
print(*ranked[:5], sep='\n')

(99, -1.5214910507202148)
(49, -1.6062639951705933)
(55, -1.9928940534591675)
(167, -2.032031536102295)
(126, -2.1413607597351074)
