### <span style='font-family:Georgia'>The purpose</span>
Recommend the movies that are most popular among users similar to the target user and that the target user has not watched yet.

### <span style='font-family:Georgia'>The method</span>
Compute the similarity between the target user (identified by the `user_id` passed to the constructor when the object is initialized) and every other user.
Then predict the target user's rating of each movie as a weighted average of the ratings given to that movie by the other users, where each weight is the similarity between that user and the target user: $\hat{r}_m = \sum_u s_u \, r_{u,m} \,/\, \sum_u s_u$. The similarity is the Pearson correlation $\rho$ of the two users' ratings on the movies they have in common, rescaled to the $[0,1]$ interval: $s = (\rho + 1)/2$.

In [None]:
import os
import re
from collections import Counter
from typing import List, Dict
from tqdm.notebook import tqdm

import pandas as pd
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
import random

# Seed Python's built-in `random` generator with a fixed value.
# NOTE(review): this call seeds only the stdlib `random` module; confirm
# whether other sources of randomness used in this notebook need their own seed.
random.seed(10)

In [None]:
def transform_data_to_dict(data: List[str]) -> Dict[int, int]:
    """Convert ``"movie_id,rating,date"`` lines into a ``{movie_id: rating}`` dict.

    Args:
        data: Text lines, each holding exactly three comma-separated fields:
            movie id, rating and date. Blank lines are ignored.

    Returns:
        Mapping from integer movie id to integer rating. The date field is
        discarded. If the same movie id appears more than once, the last
        line wins.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its movie id / rating is not an integer.
    """
    ratings: Dict[int, int] = {}
    for row in data:
        if not row.strip():
            # Tolerate empty lines instead of failing on the tuple unpacking.
            continue
        movie_id, rating, _date = row.split(",")
        ratings[int(movie_id)] = int(rating)
    return ratings

def extract_features(user_id) -> Dict[int, int]:
    """Collect every rating given by one user across all files in ``PATHS``.

    Each file is read as a sequence of ``"<movie_id>:"`` header lines, each
    followed by ``"<user_id>,<rating>,<date>"`` lines belonging to that movie.

    Args:
        user_id: Id of the user whose ratings are wanted (anything accepted
            by ``int()``).

    Returns:
        Mapping from integer movie id to the integer rating this user gave,
        as produced by ``transform_data_to_dict``. The date is not kept.

    Raises:
        ValueError: If a rating line appears before any movie header, or a
            line cannot be parsed.
    """
    target_id = int(user_id)  # converted once instead of on every line
    data: List[str] = []
    movie_id = None
    for fname in PATHS:
        with open(fname) as f:
            for line in f:
                record = line.strip()
                if not record:
                    continue
                if re.match(r"\d+:", record):
                    # Header line: the rating lines that follow are for this movie.
                    movie_id = record.split(":")[0]
                    continue
                rater_id, _, rating_and_date = record.partition(",")
                if int(rater_id) != target_id:
                    continue
                if movie_id is None:
                    raise ValueError(
                        f"{fname}: rating line found before any movie header"
                    )
                # Swap only the leading user-id field for the movie id. A blanket
                # str.replace would also rewrite matching digits inside the
                # rating or the date.
                data.append(f"{movie_id},{rating_and_date}")
    return transform_data_to_dict(data)


In [None]:
class RecommenderSystem:
    """User-based recommender for a single target user.

    The target user's ratings are compared with those of other users found in
    the files listed in the module-level ``PATHS``. Movie ratings are then
    predicted as a similarity-weighted average of the other users' ratings.

    Relies on the module-level names ``extract_features``, ``PATHS``,
    ``LIMIT_USERS_CNT`` and ``LIMIT_MOVIES_CNT``.
    """

    def __init__(self, user_id: int, n_similar: int, min_common: int):
        """Load the target user's ratings.

        Args:
            user_id: Id of the user to produce recommendations for.
            n_similar: Stored on the instance.
                NOTE(review): no method of this class reads it.
            min_common: Minimum number of commonly rated movies required to
                compute a similarity; also used as the rater-count threshold
                in ``recommend``.
        """
        self.user_id = user_id
        self.n_similar = n_similar
        self.min_common = min_common
        self.features = extract_features(user_id)
        self.similar_users = []

    def compute_users_similarity(
        self, user1_preferences: Dict[int, int], user2_preferences: Dict[int, int]
    ) -> float:
        """Return the similarity of two users' ratings, in ``[0, 1]``.

        The similarity is the Pearson correlation of the ratings on the movies
        both users rated (restricted to ids below ``LIMIT_MOVIES_CNT``),
        rescaled from ``[-1, 1]`` to ``[0, 1]``.

        Returns ``0.0`` when fewer than ``self.min_common`` movies are shared,
        or when the correlation is undefined because either user gave the same
        rating to every shared movie.
        """
        shared = set(user1_preferences.keys()).intersection(user2_preferences.keys())
        common = [movie_id for movie_id in shared if movie_id < LIMIT_MOVIES_CNT]
        if len(common) < self.min_common:
            return 0.0
        ratings1 = [user1_preferences[k] for k in common]
        ratings2 = [user2_preferences[k] for k in common]
        if len(set(ratings1)) < 2 or len(set(ratings2)) < 2:
            # pearsonr is undefined for constant (or too short) input and would
            # yield NaN, which would then poison every weighted average.
            return 0.0
        return float((pearsonr(ratings1, ratings2)[0] + 1) / 2)

    def get_similar_users(self) -> List[Dict]:
        """Compute the similarity between the target user and other users.

        Scans the files in ``PATHS`` line by line. For each user id met for
        the first time (the target user excluded), loads that user's ratings
        and computes the similarity to the target user. Stops once
        ``LIMIT_USERS_CNT`` other users have been collected.

        Returns:
            A list of dicts with keys ``"user"`` (the other user's id),
            ``"similarity"`` (float in ``[0, 1]``) and ``"movies"`` (that
            user's ``{movie_id: rating}`` mapping). The same list is stored in
            ``self.similar_users``; it is rebuilt on every call.
        """
        # Start with the target user marked as seen so they are never compared
        # with themselves.
        seen = {int(self.user_id)}
        similar: List[Dict] = []
        self.similar_users = similar

        for file_name in PATHS:
            with open(file_name) as f:
                for line in f:
                    record = line.strip()
                    if not record or re.match(r"\d+:", record):
                        # Blank line or "<movie_id>:" header: no user on it.
                        continue
                    other_id = int(record.split(",")[0])
                    if other_id in seen:
                        continue
                    seen.add(other_id)
                    # NOTE: extract_features re-reads every file in PATHS, so
                    # each new user costs one full pass over the data.
                    features = extract_features(other_id)
                    similar.append(
                        {
                            "user": other_id,
                            "similarity": self.compute_users_similarity(
                                self.features, features
                            ),
                            "movies": features,
                        }
                    )
                    if len(similar) >= LIMIT_USERS_CNT:
                        return similar

        return similar

    def recommend(self) -> Dict[int, float]:
        """Predict ratings for movies the target user has not rated.

        Each prediction is the average of the other users' ratings of a movie,
        weighted by their similarity to the target user. Movies already present
        in ``self.features`` are skipped, and only movies rated by more than
        ``self.min_common`` of the compared users are returned.

        Returns:
            Mapping from movie id to predicted rating; ``0.0`` when the
            similarities of all users who rated the movie sum to zero.
        """
        totals: Dict[int, list] = {}
        for neighbour in tqdm(self.get_similar_users()):
            weight = neighbour["similarity"]
            for movie_id, rating in neighbour["movies"].items():
                if movie_id in self.features:
                    continue  # already rated by the target user
                # accumulator: [weighted rating sum, weight sum, rater count]
                acc = totals.setdefault(movie_id, [0.0, 0.0, 0])
                acc[0] += rating * weight
                acc[1] += weight
                acc[2] += 1
        return {
            movie_id: (weighted_sum / weight_sum if weight_sum > 0 else 0.0)
            for movie_id, (weighted_sum, weight_sum, raters) in totals.items()
            if raters > self.min_common
        }


In [None]:
# Target user and recommender parameters
USER_ID = 1248029
N_SIMILAR = 5
MIN_COMMON = 10

# The four rating files: combined_data_1.txt ... combined_data_4.txt
PATHS = [
    os.path.join("./data/netflix", "combined_data_{}.txt".format(part))
    for part in range(1, 5)
]

# Limits for shortening calculation time (both are modifiable parameters)
LIMIT_USERS_CNT = 500
LIMIT_MOVIES_CNT = float("inf")  # here: no limit


In [None]:
# Show the list of data file paths as the cell output.
PATHS

In [None]:
# Test

# Build the recommender for USER_ID (the constructor loads that user's ratings)
r = RecommenderSystem(
    user_id=USER_ID,
    n_similar=N_SIMILAR,
    min_common=MIN_COMMON,
)

# ids of the movies rated by USER_ID (the keys of r.features)
user_features = [*r.features]

In [None]:
# Split the user's rated movie ids into train and test subsets for the
# evaluation. random_state is fixed so the split is the same on every run.
train, test = train_test_split(user_features, random_state=10)
train_ids, test_ids = set(train), set(test)  # sets give O(1) membership tests
test_features = {k: v for k, v in r.features.items() if k in test_ids}
r.features = {k: v for k, v in r.features.items() if k in train_ids}

# calculate the recommendation (movie id -> predicted rating)
recommended = r.recommend()
recommended_series = pd.DataFrame(
    list(recommended.items()), columns=["movie", "recommendation"]
).sort_values("recommendation", ascending=False)

# evaluate: put the held-out (test) ratings next to the predictions and keep
# only the movies for which a held-out rating exists
recommended_series["test_rating"] = recommended_series["movie"].map(test_features)
recommended_series[recommended_series["test_rating"].notna()]
