<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Movie-recommendation" data-toc-modified-id="Movie-recommendation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Movie recommendation</a></span><ul class="toc-item"><li><span><a href="#Dataset" data-toc-modified-id="Dataset-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Dataset</a></span></li><li><span><a href="#Evaluation-Protocol" data-toc-modified-id="Evaluation-Protocol-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Evaluation Protocol</a></span></li><li><span><a href="#Models" data-toc-modified-id="Models-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#ALS" data-toc-modified-id="ALS-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span><a href="https://spark.apache.org/docs/latest/ml-collaborative-filtering.html#explicit-vs-implicit-feedback" target="_blank">ALS</a></a></span></li><li><span><a href="#Ваша-формулировка" data-toc-modified-id="Ваша-формулировка-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Ваша формулировка</a></span></li></ul></li><li><span><a href="#Evaluation-Results" data-toc-modified-id="Evaluation-Results-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Evaluation Results</a></span></li></ul></li></ul></div>

# Movie recommendation

Ваша задача - рекомендация фильмов для пользователей


In [1]:
%matplotlib inline
%config InlineBackend.figure_format ='retina'

import os
import sys
import glob
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import spatial

import pyspark
from pyspark.conf import SparkConf
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window


spark = SparkSession \
    .builder \
    .master('local[*]') \
    .appName("spark_sql_examples") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

## Dataset 

`MovieLens-25M`

In [2]:
DATA_PATH = '/workspace/data/ml-25m'

RATINGS_PATH = os.path.join(DATA_PATH, 'ratings.csv')
MOVIES_PATH = os.path.join(DATA_PATH, 'movies.csv')
TAGS_PATH = os.path.join(DATA_PATH, 'tags.csv')

In [3]:
SAMPLING_RATE = 0.01
SEED = 90

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import *


def read_df(path, sampling_rate=None):
    df = sqlContext.read.format("com.databricks.spark.csv") \
        .option("delimiter", ",") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load('file:///' + path)
    if sampling_rate:
        df = df.sample(False, sampling_rate, SEED)
    return df

In [5]:
ratings_df = read_df(RATINGS_PATH, SAMPLING_RATE)
ratings_df.count()

249572

## Evaluation Protocol

Так как мы хотим оценивать качество разных алгоритмов рекомендаций, то в первую очередь нам нужно определить
* Как разбить данные на `Train`/`Validation`/`Test`;
* Какие метрики использовать для оценки качества.

### Splits

In [6]:
TILES = 10

user_window = Window.orderBy('timestamp').partitionBy('userId')

tiled_ratings_df = ratings_df \
    .withColumn('tile', F.ntile(TILES).over(user_window)) \

tiled_ratings_df \
    .take(1)

[Row(userId=1238, movieId=4011, rating=5.0, timestamp=1495751585, tile=1)]

In [7]:
train_ratings_df = tiled_ratings_df \
    .filter(F.col('tile') <= 8) \
    .drop('tile') \
    .cache()

print(train_ratings_df.count())

train_ratings_df.head()

239953


Row(userId=1238, movieId=4011, rating=5.0, timestamp=1495751585)

In [8]:
dev_ratings_df = tiled_ratings_df \
    .filter(F.col('tile') == 9) \
    .drop('tile') \
    .cache()

print(dev_ratings_df.count())

dev_ratings_df.head()

5333


Row(userId=9852, movieId=146926, rating=2.5, timestamp=1475678637)

In [9]:
test_ratings_df = tiled_ratings_df \
    .filter(F.col('tile') == 10) \
    .drop('tile') \
    .cache()

print(test_ratings_df.count())

test_ratings_df.head()

4286


Row(userId=9852, movieId=182823, rating=2.0, timestamp=1518461843)

### Metrics

In [10]:
def evaluate_recommendations_on(model, recommendations_map_fn, df):
    labels = df \
        .groupby('userId') \
        .agg(F.collect_set('movieId').alias('labels'))
    
    # print(labels.count())
    
    ATS = [1, 5, 10]
    MAX_ATS = max(ATS)
    
    user_recs = model \
        .recommendForUserSubset(labels, MAX_ATS)

    # print(user_recs.count())
    # print(user_recs.take(10))

    recs_and_labels = labels \
        .join(user_recs, 'userId') \
        .select('recommendations', 'labels') \
        .rdd \
        .map(lambda row: (list(map(recommendations_map_fn, row[0])), row[1]))
    
    # print(recs_and_labels.take(10))
    
    ranking_metrics = RankingMetrics(recs_and_labels)
    
    metrics_values = {}
    
    for N in ATS:
        precision_n_handle = "Precision@" + str(N)
        metrics_values[precision_n_handle] = ranking_metrics.precisionAt(N)
        ndcg_n_handle = "NDCG@" + str(N)
        metrics_values[ndcg_n_handle] = ranking_metrics.ndcgAt(N)
    metrics_values["MAP"] = ranking_metrics.meanAveragePrecision
    return metrics_values

In [11]:
def get_ate(groups, control_name) -> pd.DataFrame:
    """Get Average Treatment Effect
    groups - dictionary where keys - names of models, values - dicts of pairs <metric_name>, <metric_value>
    control_name - name of baseline model
    
    return pd.DataFrame (rows corresponds to metrics, cols corresponds to models and ATE with respect to control)
    """
    
    metric_names = []
    for metric_name_values in groups.values():
        for metric_name, _ in metric_name_values.items():
            if metric_name not in metric_names:
                metric_names.append(metric_name)
    metric_names = list(sorted(metric_names))
    
    if control_name not in groups:
        raise ValueError("Control experiment is not in the group.")
    control_values = groups[control_name]
    if len(control_values) != len(metric_names):
        raise ValueError("Control experiment does not have all the metrics computed.")

    model_names = list(sorted(groups.keys()))
    metric_model_ates = []
    for metric_name in metric_names:
        control_value = control_values[metric_name]
        model_ates = []
        for model_name in model_names:
            if metric_name in groups[model_name]:
                ate = (groups[model_name][metric_name] - control_value) / control_value * 100
            else:
                ate = None
            model_ates.append(ate)
        metric_model_ates.append(model_ates)

    ates_df = pd.DataFrame(data=metric_model_ates, index=metric_names, columns=model_names)
    return ates_df

In [12]:
all_metrics = {}

## Models

Теперь мы можем перейти к формулировке задачи в терминах машинного обучения.

Одна из формулировок, к которой мы сведем нашу задачу - **Matrix Completetion**. Данную задачу будем решать с помощью `ALS`

### [ALS](https://spark.apache.org/docs/latest/ml-collaborative-filtering.html#explicit-vs-implicit-feedback)

In [13]:
def get_baseline_als_space():
    space = {
        'rank': 10,
        'maxIter': 10,
        'regParam': 0.1,
        'implicitPrefs': False,
        'alpha': 1.0,
        'nonnegative': False,

        'numUserBlocks': 10,
        'numItemBlocks': 10,
        'userCol': 'userId',
        'itemCol': 'movieId',
        'ratingCol': 'rating',
        'seed': SEED,
        'coldStartStrategy': 'nan',
    }
    return space

In [14]:
als = ALS(**get_baseline_als_space())
baseline_als_model = als.fit(train_ratings_df)

In [15]:
for eval_df in [train_ratings_df, dev_ratings_df, test_ratings_df]:
    metrics = evaluate_recommendations_on(
        model=baseline_als_model,
        recommendations_map_fn=lambda rec: rec.movieId,
        df=eval_df)
    print(metrics)
    
BASELINE_HANDLE = '1.als_baseline'
all_metrics[BASELINE_HANDLE] = metrics

{'Precision@10': 0.013826084055295442, 'NDCG@10': 0.05717978758071188, 'Precision@5': 0.0160281148169979, 'Precision@1': 0.01654859481521848, 'MAP': 0.037278182675931026, 'NDCG@5': 0.041387595596643546, 'NDCG@1': 0.016548594815218476}
{'Precision@10': 0.0001141291942478886, 'NDCG@10': 0.0005310130572983582, 'Precision@5': 0.00018260671079662182, 'Precision@1': 0.00022825838849577717, 'MAP': 0.0003823328007304271, 'NDCG@5': 0.0004590055528962292, 'NDCG@1': 0.00022825838849577733}
{'Precision@10': 8.700696055684451e-05, 'NDCG@10': 0.00047300746913325357, 'Precision@5': 5.800464037122966e-05, 'Precision@1': 0.00029002320185614864, 'MAP': 0.0003625290023201855, 'NDCG@5': 0.0002900232018561487, 'NDCG@1': 0.00029002320185614815}


Покажите для выбранных вами фильмов топ-20 наиболее похожих фильмов

In [16]:
movies_df = read_df(MOVIES_PATH)
movies_df.count()

62423

In [17]:
def get_cosine_similarity(features_0, features_1):
    similarity = 1 - spatial.distance.cosine(features_0, features_1)
    return float(similarity)


cosine_similarity_udf = F.udf(get_cosine_similarity, FloatType())


def find_similar_to(movie_id, model, N=20):
    movie_factors = model.itemFactors
    
    # print(movie_factors.take(10))
    
    selected_movie_factors = movie_factors \
        .filter(F.col('id') == movie_id) \
        .selectExpr('id as movieId', 'features as movieFeatures') \
        .cache()

    # print(selected_movie_factors.collect())
    
    out = selected_movie_factors \
        .crossJoin(movie_factors) \
        .withColumn('sim', cosine_similarity_udf('movieFeatures', 'features')) \
        .select('movieId', 'id', 'sim') \
        .sort(F.col('sim').desc()) \
        .limit(N)
    
    return out

In [18]:
similar_movies = find_similar_to(1, baseline_als_model)
similar_movies.collect()

[Row(movieId=1, id=1, sim=1.0),
 Row(movieId=1, id=5247, sim=0.9805355668067932),
 Row(movieId=1, id=8822, sim=0.9742634296417236),
 Row(movieId=1, id=994, sim=0.9709194302558899),
 Row(movieId=1, id=1458, sim=0.9698500633239746),
 Row(movieId=1, id=129, sim=0.96310955286026),
 Row(movieId=1, id=116837, sim=0.9630640149116516),
 Row(movieId=1, id=5147, sim=0.9616589546203613),
 Row(movieId=1, id=33463, sim=0.9595453143119812),
 Row(movieId=1, id=8225, sim=0.9594824910163879),
 Row(movieId=1, id=5102, sim=0.9592540264129639),
 Row(movieId=1, id=95856, sim=0.9592218399047852),
 Row(movieId=1, id=72701, sim=0.9573711156845093),
 Row(movieId=1, id=54686, sim=0.9562532305717468),
 Row(movieId=1, id=7093, sim=0.9562276601791382),
 Row(movieId=1, id=34314, sim=0.9560893177986145),
 Row(movieId=1, id=85305, sim=0.9558175802230835),
 Row(movieId=1, id=89837, sim=0.9543654918670654),
 Row(movieId=1, id=96606, sim=0.9528065919876099),
 Row(movieId=1, id=55955, sim=0.9518386125564575)]

### The movie with the highest similarity 1.0 is the original movie, i.e. Toy Story (1995)

In [19]:
similar_movies \
    .join(movies_df, similar_movies['id'] == movies_df['movieId']) \
    .select('title', 'genres') \
    .collect()

[Row(title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(title='Pie in the Sky (1996)', genres='Comedy|Romance'),
 Row(title='Big Night (1996)', genres='Comedy|Drama'),
 Row(title='Touch (1997)', genres='Drama|Fantasy|Romance'),
 Row(title='Rookie of the Year (1993)', genres='Comedy|Fantasy'),
 Row(title='Wild Strawberries (Smultronstället) (1957)', genres='Drama'),
 Row(title='Smokey and the Bandit (1977)', genres='Action|Comedy'),
 Row(title='Front Page, The (1974)', genres='Comedy'),
 Row(title='Night of the Living Dead (1990)', genres='Horror'),
 Row(title="Unidentified Flying Oddball (a.k.a. Spaceman and King Arthur, The) (a.k.a. Spaceman in King Arthur's Court, A) (1979)", genres='Adventure|Comedy|Fantasy|Sci-Fi'),
 Row(title='DuckTales: The Movie - Treasure of the Lost Lamp (1990)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(title='Funny Ha Ha (2002)', genres='Comedy|Drama'),
 Row(title='Last Legion, The (2007)', genres='Action|

### Ваша формулировка

На лекции было еще несколько ML формулировок задачи рекомендаций. Выберете одну из них и реализуйте метод.

## Based on Evaluation of Session-based Recommendation Algorithms: https://arxiv.org/pdf/1803.09587.pdf

Found from reference [11] https://dl.acm.org/doi/10.1145/3298689.3347041

### Session-based Recommendation

### Simple Association Rules (AR)

The equation (1) in the paper. The baseline version will treat each user as a single session.

The normalizer in the equation (1) could be disregarded, since it only depends on the s, not the i, and used to treat the scores as probabilities. So the score function takes the form of:

score_{AR}(i, s) = \sum_{p\in S_p} \sum_{x=1}^{|p|} \sum_{y=1}^{|p|} 1_{EQ}(s_|s|, p_|x|) 1_{EQ}(i, p_|y|)

In [20]:
cooccurrence_df = train_ratings_df \
    .selectExpr('userId', 'movieId as movieId2') \
    .join(train_ratings_df, 'userId') \
    .select('movieId', 'movieId2') \
    .groupBy('movieId', 'movieId2') \
    .agg(F.count('movieId').alias('cooccurrence')) \
    .cache()

cooccurrence_df.take(10)

[Row(movieId=42734, movieId2=115569, cooccurrence=1),
 Row(movieId=68749, movieId2=6371, cooccurrence=1),
 Row(movieId=1241, movieId2=42734, cooccurrence=1),
 Row(movieId=110603, movieId2=7369, cooccurrence=1),
 Row(movieId=2983, movieId2=2983, cooccurrence=6),
 Row(movieId=1235, movieId2=441, cooccurrence=1),
 Row(movieId=254, movieId2=1449, cooccurrence=1),
 Row(movieId=3900, movieId2=205, cooccurrence=1),
 Row(movieId=7934, movieId2=6537, cooccurrence=1),
 Row(movieId=6250, movieId2=6250, cooccurrence=21)]

In [21]:
cooccurrence_df \
    .filter(F.col('movieId') == 54503) \
    .filter(F.col('movieId2') == 344) \
    .head()

Row(movieId=54503, movieId2=344, cooccurrence=2)

In [22]:
cooccurrence_df \
    .filter(F.col('movieId2') == 54503) \
    .filter(F.col('movieId') == 344) \
    .head()

Row(movieId=344, movieId2=54503, cooccurrence=2)

In [23]:
user_max_timestamp_df = train_ratings_df \
    .groupBy('userId') \
    .agg(F.max('timestamp').alias('maxTimestamp')) \
    .cache()

user_max_timestamp_df.take(10)

[Row(userId=1238, maxTimestamp=1495751585),
 Row(userId=1342, maxTimestamp=1429643758),
 Row(userId=1829, maxTimestamp=909021026),
 Row(userId=2122, maxTimestamp=1224769746),
 Row(userId=2142, maxTimestamp=1078330920),
 Row(userId=2659, maxTimestamp=1454173941),
 Row(userId=2866, maxTimestamp=936021626),
 Row(userId=3175, maxTimestamp=945925044),
 Row(userId=3749, maxTimestamp=843684095),
 Row(userId=3794, maxTimestamp=1484585985)]

In [24]:
user_last_movie_id_df = train_ratings_df \
    .join(user_max_timestamp_df, (
        (train_ratings_df['userId'] == user_max_timestamp_df['userId']) &
        (train_ratings_df['timestamp'] == user_max_timestamp_df['maxTimestamp']))) \
    .select(train_ratings_df['userId'], train_ratings_df['movieId'].alias('lastMovieId')) \
    .cache()

user_last_movie_id_df.take(10)

[Row(userId=1238, lastMovieId=4011),
 Row(userId=1342, lastMovieId=33171),
 Row(userId=1829, lastMovieId=1731),
 Row(userId=2122, lastMovieId=4993),
 Row(userId=2142, lastMovieId=3081),
 Row(userId=2659, lastMovieId=4993),
 Row(userId=2866, lastMovieId=2701),
 Row(userId=3175, lastMovieId=349),
 Row(userId=3749, lastMovieId=185),
 Row(userId=3794, lastMovieId=1485)]

In [30]:
class AssosiationsRuleModel(object):

    def __init__(self, user_last_movie_id_df, cooccurrence_df):
        self.user_last_movie_id_df = user_last_movie_id_df
        self.cooccurrence_df = cooccurrence_df
    
    def recommendForUserSubset(self, users_df, N):
        def postprocess_recommendations(recommendations):
            # Discard the first recommendation since it is always the last movie in the user session from
            # the fact that it has the highest co-occurrence with itself.
            # We would probably want to discard all movies we have seen in the training ratings set for the user,
            # but I do not think it holds for general session-based recommendation problems.
            recommendations = recommendations[1:]
            # Limit the size up to N.
            recommendations = recommendations[:N]
            return recommendations

        postprocess_recommendations_udf = F.udf(postprocess_recommendations, ArrayType(IntegerType()))
        
        w = Window.partitionBy('userId').orderBy(F.col('cooccurrence').desc())
        
        subset_user_last_movie_id_df = self.user_last_movie_id_df \
            .join(users_df, 'userId') \
            .select('userId', 'lastMovieId')
        
        user_raw_recommendations_df = subset_user_last_movie_id_df \
            .join(self.cooccurrence_df, 
                  subset_user_last_movie_id_df['lastMovieId'] == self.cooccurrence_df['movieId']) \
            .withColumn('rawRecommendations', F.collect_list('movieId2').over(w)) \
            .groupBy('userId') \
            .agg(F.max('rawRecommendations').alias('rawRecommendations')) \
            .select('userId', 'rawRecommendations')
        
        user_recommendations_df = user_raw_recommendations_df \
            .withColumn('recommendations', postprocess_recommendations_udf('rawRecommendations')) \
            .select('userId', 'recommendations')

        return user_recommendations_df

In [31]:
assosiations_rule_model = AssosiationsRuleModel(user_last_movie_id_df, cooccurrence_df)

In [32]:
dev_labels = dev_ratings_df \
    .groupby('userId') \
    .agg(F.collect_set('movieId').alias('labels'))

assosiations_rule_model \
    .recommendForUserSubset(dev_labels, 10) \
    .take(10)

[Row(userId=9852, recommendations=[115569, 6156, 1094, 4040, 2236, 54999, 1287, 111, 4956, 74754]),
 Row(userId=15727, recommendations=[4069, 858, 919, 160, 5266, 2423, 593, 3755, 1120, 555]),
 Row(userId=16574, recommendations=[2857, 766, 3169, 36523, 965, 610, 858, 2294, 930, 945]),
 Row(userId=26087, recommendations=[450, 3578, 2943, 6957, 1263, 1095, 26774, 2921, 1295, 26505]),
 Row(userId=29054, recommendations=[1923, 8636, 7934, 86286, 8371, 79357, 2087, 1644, 81932, 61240]),
 Row(userId=32855, recommendations=[4573, 5609, 1897, 2762, 5685, 223, 2795, 163, 5670, 5049]),
 Row(userId=38311, recommendations=[2762, 60126, 364, 778, 3481, 1407, 2953, 597, 8132, 2478]),
 Row(userId=63155, recommendations=[5333, 2287, 51709, 26472, 6548, 417, 4280, 5, 532, 2005]),
 Row(userId=67861, recommendations=[106782, 2571, 6863, 41997, 60074, 3461, 7117, 1248, 5449, 120635]),
 Row(userId=74166, recommendations=[37720, 783, 1324, 8132, 1682, 8818, 588, 916, 571, 4246])]

In [33]:
for eval_df in [train_ratings_df, dev_ratings_df, test_ratings_df]:
    metrics = evaluate_recommendations_on(
        model=assosiations_rule_model,
        recommendations_map_fn=lambda rec: rec,
        df=eval_df)
    print(metrics)
    
ASSOSIATIONS_RULE_MODEL_HANDLE = '2.assosiations_rule_model'
all_metrics[ASSOSIATIONS_RULE_MODEL_HANDLE] = metrics

{'Precision@10': 0.0265177886272896, 'NDCG@10': 0.04880000870236168, 'Precision@5': 0.03246327168388626, 'Precision@1': 0.04582003403138452, 'MAP': 0.028668608080256733, 'NDCG@5': 0.04221954818256252, 'NDCG@1': 0.04582003403138452}
{'Precision@10': 0.00031956174389408815, 'NDCG@10': 0.0013589007601605663, 'Precision@5': 0.00031956174389408825, 'Precision@1': 0.00022825838849577763, 'MAP': 0.0008324184882048386, 'NDCG@5': 0.0008374123466361861, 'NDCG@1': 0.00022825838849577768}
{'Precision@10': 0.00043503480278422267, 'NDCG@10': 0.0024580978252383458, 'Precision@5': 0.0006960556844547569, 'Precision@1': 0.0008700696055684461, 'MAP': 0.0019109306522299544, 'NDCG@5': 0.002199650952371221, 'NDCG@1': 0.000870069605568446}


## Hyperparameter optimization

In [34]:
!pip3.5 install hyperopt



In [35]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [36]:
def als_objective(space):
    estimator = ALS(**space)
    print('SPACE: ' + str(space))
    success = False
    attempts = 0
    model = None
    while not success and attempts < 2:
        try:
            model = estimator.fit(train_ratings_df)
            success = True
        except Exception as e:
            attempts += 1
            print(e)
            print('Try again')
        
    dev_metrics = evaluate_recommendations_on(
        model=model,
        recommendations_map_fn=lambda rec: rec.movieId,
        df=dev_ratings_df)    
    print('METRICS: ' + str(dev_metrics))
    
    trial_status_dict = {
        'loss': -dev_metrics['MAP'],
        'status': STATUS_OK,
    }
    return trial_status_dict

In [37]:
def tune_als_paired_params(space, param1_name, param1_choice, param2_name, param2_choice):
    assert param1_name in space
    assert param2_name in space

    space[param1_name] = hp.choice(param1_name, param1_choice)
    space[param2_name] = hp.choice(param2_name, param2_choice)

    trials = Trials()
    best_choices = fmin(fn=als_objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=len(param1_choice)*len(param2_choice),
                        trials=trials)

    space[param1_name] = param1_choice[best_choices[param1_name]]
    space[param2_name] = param2_choice[best_choices[param2_name]]
    
    return space

In [38]:
space = get_baseline_als_space()

In [39]:
%%time

tune_als_paired_params(space, 'implicitPrefs', [True, False], 'alpha', [0.75, 1.0, 1.25, 1.5])

SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 0.1, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 10, 'seed': 90, 'maxIter': 10, 'alpha': 1.25, 'nonnegative': False}
METRICS: {'Precision@10': 0.0002739100661949326, 'NDCG@10': 0.00104311753961057, 'Precision@5': 0.000273910066194933, 'Precision@1': 0.0, 'MAP': 0.0005664793498621388, 'NDCG@5': 0.0006571868698819413, 'NDCG@1': 0.0}
SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 0.1, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 10, 'seed': 90, 'maxIter': 10, 'alpha': 0.75, 'nonnegative': False}
METRICS: {'Precision@10': 0.0001826067107966219, 'NDCG@10': 0.0008468902088005056, 'Precision@5': 0.0001826067107966219, 'Precision@1': 0.00022825838849577725, 'MAP': 0.0005598671028938094, 'NDCG@5': 0.0005588813327732005, 'NDCG@1': 0.00022825838849577728}
SPACE: 

{'alpha': 1.0,
 'coldStartStrategy': 'nan',
 'implicitPrefs': True,
 'itemCol': 'movieId',
 'maxIter': 10,
 'nonnegative': False,
 'numItemBlocks': 10,
 'numUserBlocks': 10,
 'rank': 10,
 'ratingCol': 'rating',
 'regParam': 0.1,
 'seed': 90,
 'userCol': 'userId'}

In [40]:
%%time

tune_als_paired_params(space, 'rank', [8, 10, 12, 15], 'nonnegative', [True, False])

SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 0.1, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 10, 'seed': 90, 'maxIter': 10, 'alpha': 1.0, 'nonnegative': True}
METRICS: {'Precision@10': 0.0003652134215932437, 'NDCG@10': 0.0013701094581588667, 'Precision@5': 0.00036521342159324374, 'Precision@1': 0.0004565167769915547, 'MAP': 0.000787400861584838, 'NDCG@5': 0.0008050741641001643, 'NDCG@1': 0.0004565167769915549}
SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 0.1, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 8, 'seed': 90, 'maxIter': 10, 'alpha': 1.0, 'nonnegative': False}
METRICS: {'Precision@10': 0.000342387582743666, 'NDCG@10': 0.0014411517838397697, 'Precision@5': 0.0005021684546907097, 'Precision@1': 0.00022825838849577722, 'MAP': 0.0008984503791625451, 'NDCG@5': 0.0011786680252591298, 'NDCG@1

{'alpha': 1.0,
 'coldStartStrategy': 'nan',
 'implicitPrefs': True,
 'itemCol': 'movieId',
 'maxIter': 10,
 'nonnegative': False,
 'numItemBlocks': 10,
 'numUserBlocks': 10,
 'rank': 8,
 'ratingCol': 'rating',
 'regParam': 0.1,
 'seed': 90,
 'userCol': 'userId'}

In [41]:
%%time

tune_als_paired_params(space, 'maxIter', [8, 10, 12], 'regParam', [1e-2, 1e-1, 1e-0])

SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 0.01, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 8, 'seed': 90, 'maxIter': 10, 'alpha': 1.0, 'nonnegative': False}
METRICS: {'Precision@10': 0.00036521342159324374, 'NDCG@10': 0.0015859450729445881, 'Precision@5': 0.0004565167769915548, 'Precision@1': 0.00045651677699155407, 'MAP': 0.0010302424249011785, 'NDCG@5': 0.001208769037279045, 'NDCG@1': 0.0004565167769915545}
SPACE: {'itemCol': 'movieId', 'ratingCol': 'rating', 'regParam': 1.0, 'userCol': 'userId', 'numItemBlocks': 10, 'coldStartStrategy': 'nan', 'numUserBlocks': 10, 'implicitPrefs': True, 'rank': 8, 'seed': 90, 'maxIter': 12, 'alpha': 1.0, 'nonnegative': False}
METRICS: {'Precision@10': 0.000410865099292399, 'NDCG@10': 0.0017359476109298946, 'Precision@5': 0.0004565167769915542, 'Precision@1': 0.0004565167769915544, 'MAP': 0.0010892091752625877, 'NDCG@5': 0.0011646058678419083, 'NDCG@

{'alpha': 1.0,
 'coldStartStrategy': 'nan',
 'implicitPrefs': True,
 'itemCol': 'movieId',
 'maxIter': 12,
 'nonnegative': False,
 'numItemBlocks': 10,
 'numUserBlocks': 10,
 'rank': 8,
 'ratingCol': 'rating',
 'regParam': 1.0,
 'seed': 90,
 'userCol': 'userId'}

In [42]:
als = ALS(**space)
tuned_als_model = als.fit(train_ratings_df)

In [43]:
for eval_df in [train_ratings_df, dev_ratings_df, test_ratings_df]:
    metrics = evaluate_recommendations_on(
        model=tuned_als_model,
        recommendations_map_fn=lambda rec: rec.movieId,
        df=eval_df)
    print(metrics)
    
TUNED_ALS_HANDLE = '3.als_tuned'
all_metrics[TUNED_ALS_HANDLE] = metrics

{'Precision@10': 0.03666381218234593, 'NDCG@10': 0.16977406574442344, 'Precision@5': 0.060798291757954606, 'Precision@1': 0.19885005060222208, 'MAP': 0.1344471993026565, 'NDCG@5': 0.16138844908134362, 'NDCG@1': 0.1988500506022222}
{'Precision@10': 0.0004108650992923991, 'NDCG@10': 0.0017359476109298948, 'Precision@5': 0.0004565167769915546, 'Precision@1': 0.000456516776991554, 'MAP': 0.001089209175262587, 'NDCG@5': 0.0011646058678419083, 'NDCG@1': 0.000456516776991555}
{'Precision@10': 0.00040603248259860807, 'NDCG@10': 0.002173119590180444, 'Precision@5': 0.0005800464037122969, 'Precision@1': 0.0008700696055684457, 'MAP': 0.0016051403159871845, 'NDCG@5': 0.0018172820429777632, 'NDCG@1': 0.0008700696055684457}


## Evaluation Results

Сравните реализованные методы с помощью выбранных метрик. Не забывайте про оптимизацию гиперпараметров.

In [44]:
get_ate(all_metrics, BASELINE_HANDLE)

Unnamed: 0,1.als_baseline,2.assosiations_rule_model,3.als_tuned
MAP,0.0,427.111111,342.761905
NDCG@1,0.0,200.0,200.0
NDCG@10,0.0,419.674209,359.426062
NDCG@5,0.0,658.439648,526.598848
Precision@1,0.0,200.0,200.0
Precision@10,0.0,400.0,366.666667
Precision@5,0.0,1100.0,900.0
