In [1]:
from os import path, environ
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the ratings dataset used throughout this notebook.
# The same value is exported as the DATA_DIR environment variable so the
# `!` shell cells further down can refer to it as $DATA_DIR.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
data_dir = environ["DATA_DIR"] = '/home/mershov/data/ml-latest-small'

In [3]:
# Load the raw, comma-separated ratings table from the dataset directory
# and preview its first rows.
data = pd.read_csv(path.join(data_dir, "ratings.csv"), sep=",")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Summarize the loaded frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
# Re-encode the raw user and movie IDs as consecutive integer category codes.
# Every code is shifted by +1 because mrec, which is used below to evaluate
# recommendation quality, works with indices that start at one.
# The original ID columns and the timestamp are dropped afterwards.
# Each new column gets its own .assign() call so the resulting column order
# (rating, user_id, movie_id) does not depend on keyword-argument ordering.
data = (
    data
    .assign(user_id=lambda frame: frame["userId"].astype("category").cat.codes + 1)
    .assign(movie_id=lambda frame: frame["movieId"].astype("category").cat.codes + 1)
    .drop(["userId", "movieId", "timestamp"], axis=1)
)
data.head()

Unnamed: 0,rating,user_id,movie_id
0,2.5,1,31
1,3.0,1,834
2,3.0,1,860
3,2.0,1,907
4,4.0,1,932


In [6]:
# Arrange the columns as (user, item, rating) — the order in which they are
# written to ratings.tsv later in the notebook.
data = data.loc[:, ['user_id', 'movie_id', 'rating']]
data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,834,3.0
2,1,860,3.0
3,1,907,2.0
4,1,932,4.0


In [47]:
# Persist the re-encoded ratings as a tab-separated file with neither a header
# row nor the DataFrame index; this file is the input of mrec_prepare.
data.to_csv(path.join(data_dir, "ratings.tsv"), sep='\t', header=False, index=False)

In [48]:
# Prepare the train/test data with mrec_prepare.
# Input is the ratings.tsv written above; a single split with a test size of
# 0.2 is written to $DATA_DIR/splits.
# $DATA_DIR is the environment variable exported when data_dir was defined.
# NOTE(review): presumably --rating_thresh 4 together with --binarize keeps
# only ratings >= 4 and turns them into 1s — confirm against the mrec docs.
!mrec_prepare \
    --dataset $DATA_DIR/ratings.tsv \
    --outdir $DATA_DIR/splits \
    --num_splits 1 \
    --rating_thresh 4 \
    --test_size 0.2 \
    --binarize

[2017-12-05 23:50:17,128] INFO: sorting input data...
[2017-12-05 23:50:17,364] INFO: creating split 0: /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0 /home/mershov/data/ml-latest-small/splits/ratings.tsv.test.0
[2017-12-05 23:50:18,367] INFO: cleaning up...
[2017-12-05 23:50:18,370] INFO: done


In [52]:
# Start an IPython parallel cluster in the background (--daemonize).
# The mrec commands below log that they run their tasks "in parallel across
# ipython engines", so this cell has to run before them.
# NOTE(review): -n4 presumably requests 4 engines; nothing in this notebook
# stops the cluster again — consider `ipcluster stop` when finished.
!ipcluster start \
    -n4 \
    --daemonize

In [54]:
# Train the knn model with mrec_train on every train split matching the glob
# and store the result in $DATA_DIR/knn_model (--overwrite is passed, so
# presumably an existing model there is replaced — confirm).
# NOTE(review): -n4 appears to match the 4 engines started by ipcluster.
!mrec_train \
    -n4 \
    --input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.* \
    --overwrite \
    --outdir $DATA_DIR/knn_model \
    --model knn

[2017-12-06 00:05:44,046] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 00:05:44,046] INFO: finding number of items...
[2017-12-06 00:05:45,466] INFO: 671 users and 9066 items
[2017-12-06 00:05:45,467] INFO: creating sims directory /home/mershov/data/ml-latest-small/knn_model/ratings.tsv.train.0-sims...
[2017-12-06 00:05:45,797] INFO: creating tasks...
[2017-12-06 00:05:45,797] INFO: running 4 tasks in parallel across ipython engines...
[2017-12-06 00:06:51,583] INFO: checking output files...
[2017-12-06 00:06:51,584] INFO: SUCCESS: all tasks completed
[2017-12-06 00:06:51,584] INFO: concatenating 4 partial output files...
[2017-12-06 00:06:51,708] INFO: removing partial output files...
[2017-12-06 00:06:52,445] INFO: loading 9066 items in CosineKNNRecommender model from /home/mershov/data/ml-latest-small/knn_model/ratings.tsv.train.0.sims.tsv
[2017-12-06 00:07:02,185] INFO: done


In [11]:
# Train the popularity model with mrec_train on the same train split(s),
# using the same options as the knn model above; the result is stored in
# $DATA_DIR/popularity_model.
!mrec_train \
    -n4 \
    --input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.* \
    --overwrite \
    --outdir $DATA_DIR/popularity_model \
    --model popularity

[2017-12-06 00:43:29,472] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 00:43:33,335] INFO: done


In [7]:
# Generate recommendations with the trained knn model and print the
# evaluation metrics (mrr, prec@k, ...) reported by mrec_predict.
# Recommendations are written to $DATA_DIR/recs_knn.
# NOTE(review): no test file is passed explicitly; presumably mrec_predict
# derives it from the train file name — confirm against the mrec docs.
# NOTE(review): the saved output of this cell mentions a ".../recs/..."
# directory and 723 users, which does not match --outdir below nor the 671
# users reported during training; the output looks stale — re-run the cell.
!mrec_predict \
    --input_format tsv \
    --test_input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.0 \
    --modeldir $DATA_DIR/knn_model \
    --outdir $DATA_DIR/recs_knn

[2017-12-06 00:28:06,537] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 00:28:06,537] INFO: creating recs directory /home/mershov/data/ml-latest-small/recs/ratings.tsv.train.0-recs...
[2017-12-06 00:28:06,596] INFO: checking for existing output recs...
[2017-12-06 00:28:06,597] INFO: creating tasks...
[2017-12-06 00:28:06,597] INFO: loading dataset to get size...
[2017-12-06 00:28:08,646] INFO: loading model to get size...
[2017-12-06 00:28:10,599] INFO: created 1 tasks, 723 users per task
[2017-12-06 00:28:10,599] INFO: running in parallel across ipython engines...
[2017-12-06 00:28:17,903] INFO: checking output files...
[2017-12-06 00:28:17,903] INFO: SUCCESS: all tasks completed
[2017-12-06 00:28:17,903] INFO: concatenating 1 partial output files...
[2017-12-06 00:28:17,950] INFO: removing partial output files...
[2017-12-06 00:28:17,951] INFO: done
CosineKNNRecommender(k=100)
mrr            0.4552 +/- 0.0000
prec@5         0.2347 +/- 

In [12]:
# Generate recommendations with the trained popularity model and print the
# evaluation metrics reported by mrec_predict.
# Recommendations are written to $DATA_DIR/recs_popularity.
!mrec_predict \
    --input_format tsv \
    --test_input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.0 \
    --modeldir $DATA_DIR/popularity_model \
    --outdir $DATA_DIR/recs_popularity

[2017-12-06 00:47:38,393] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 00:47:38,393] INFO: creating recs directory /home/mershov/data/ml-latest-small/recs_popularity/ratings.tsv.train.0-recs...
[2017-12-06 00:47:38,398] INFO: checking for existing output recs...
[2017-12-06 00:47:38,400] INFO: creating tasks...
[2017-12-06 00:47:38,400] INFO: loading dataset to get size...
[2017-12-06 00:47:39,880] INFO: loading model to get size...
[2017-12-06 00:47:39,884] INFO: created 1 tasks, 671 users per task
[2017-12-06 00:47:39,884] INFO: running in parallel across ipython engines...
[2017-12-06 00:48:10,620] INFO: checking output files...
[2017-12-06 00:48:10,620] INFO: SUCCESS: all tasks completed
[2017-12-06 00:48:10,620] INFO: concatenating 1 partial output files...
[2017-12-06 00:48:10,721] INFO: removing partial output files...
[2017-12-06 00:48:10,721] INFO: done
ItemPop
mrr            0.2910 +/- 0.0000
prec@5         0.1377 +/- 0.0000
pr

In [7]:
# Train the slim model with mrec_train on the same train split(s), using the
# same options as the other models; the result is stored in
# $DATA_DIR/slim_model.
!mrec_train \
    -n4 \
    --input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.* \
    --overwrite \
    --outdir $DATA_DIR/slim_model \
    --model slim

[2017-12-06 11:44:52,269] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 11:44:52,270] INFO: finding number of items...
[2017-12-06 11:44:53,915] INFO: 671 users and 9066 items
[2017-12-06 11:44:53,915] INFO: creating sims directory /home/mershov/data/ml-latest-small/slim_model/ratings.tsv.train.0-sims...
[2017-12-06 11:44:54,096] INFO: creating tasks...
[2017-12-06 11:44:54,097] INFO: running 4 tasks in parallel across ipython engines...
[2017-12-06 11:46:59,850] INFO: checking output files...
[2017-12-06 11:46:59,872] INFO: SUCCESS: all tasks completed
[2017-12-06 11:46:59,872] INFO: concatenating 4 partial output files...
[2017-12-06 11:47:02,079] INFO: removing partial output files...
[2017-12-06 11:47:02,165] INFO: loading 9066 items in SLIM model from /home/mershov/data/ml-latest-small/slim_model/ratings.tsv.train.0.sims.tsv
[2017-12-06 11:47:12,668] INFO: done


In [8]:
# Generate recommendations with the trained slim model and print the
# evaluation metrics reported by mrec_predict.
# Recommendations are written to $DATA_DIR/recs_slim.
!mrec_predict \
    --input_format tsv \
    --test_input_format tsv \
    --train $DATA_DIR/splits/ratings.tsv.train.0 \
    --modeldir $DATA_DIR/slim_model \
    --outdir $DATA_DIR/recs_slim

[2017-12-06 11:47:43,170] INFO: processing /home/mershov/data/ml-latest-small/splits/ratings.tsv.train.0...
[2017-12-06 11:47:43,171] INFO: creating recs directory /home/mershov/data/ml-latest-small/recs_slim/ratings.tsv.train.0-recs...
[2017-12-06 11:47:43,351] INFO: checking for existing output recs...
[2017-12-06 11:47:43,353] INFO: creating tasks...
[2017-12-06 11:47:43,353] INFO: loading dataset to get size...
[2017-12-06 11:47:44,901] INFO: loading model to get size...
[2017-12-06 11:47:46,838] INFO: created 5 tasks, 147 users per task
[2017-12-06 11:47:46,838] INFO: running in parallel across ipython engines...
[2017-12-06 11:48:02,618] INFO: checking output files...
[2017-12-06 11:48:02,619] INFO: SUCCESS: all tasks completed
[2017-12-06 11:48:02,619] INFO: concatenating 5 partial output files...
[2017-12-06 11:48:02,656] INFO: removing partial output files...
[2017-12-06 11:48:02,657] INFO: done
SLIM(SGDRegressor(alpha=0.0011, average=False, epsilon=0.1, eta0=0.01,
       fit_