# Details about the notebook:
* **Introduction:** This notebook implements a basic recommendation system using cosine similarity and a few basic concepts from linear algebra.
* **Load:** The first step is to import the required libraries and enter the data (each user's list of liked movies).

* **Functions:** Next, we define the mathematical functions that compute cosine similarity and use them to make recommendations from the entered data.

* **Results:** Running the code prints the recommendations for a particular user.




In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Nothing in the visible cells reads from "../input" (the movie data is typed
# in by hand further down), so this listing is informational only.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

[]


In [2]:
from typing import List
import math, random
from collections import defaultdict, Counter

# A vector is represented as a plain Python list of floats.
Vector = List[float]


def dot(v: Vector, w: Vector) -> float:
    """Return the dot product of ``v`` and ``w``.

    The result is the sum of the element-wise products
    ``v[0] * w[0] + ... + v[n-1] * w[n-1]``.

    Both vectors must have the same length; this is enforced with an
    ``assert``, so a mismatch raises ``AssertionError``.
    """
    assert len(v) == len(w), "vectors must be same length"

    # Multiply matching components lazily, then add everything up.
    pairwise_products = (left * right for left, right in zip(v, w))
    return sum(pairwise_products)

In [3]:
# One inner list per user, holding the movie titles that user likes.
# A user's ID is simply their position in this outer list (the functions
# below index it as users_interests[user_id]).
#
# NOTE: titles are compared by exact string equality everywhere in this
# notebook, so the leading/trailing spaces in some titles are significant
# (e.g. "Hangover" and " Hangover 2", or "bahubaali" and "bahubaali 2", are
# all distinct titles). The spaces also affect the sorted order of titles.
users_interests = [
    [" Hereditary", "  The Favourite", "Blindspotting", " Spider-Man: Into the Spider-Verse", "Mission: Impossible - Fallout", " Sorry to Bother You", "Roma"],
    ["Annihilation", "Avengers: Infinity War", "Roma", "Blindspotting", " Under the Silver Lake"],
    ["Upgrade", "wrong turn", "No Mercy", "La la Land", "bahubaali 2", "BPM"],
    ["Whose Streets? ", "Upgrade", "bahubaali", "Nocturama", "demons"],
    ["manhunt", "Nocturama", "River", "Brawl in Cell Block 99 "],
    ["Upgrade", "Whose Streets? ", " Spider-Man: Into the Spider-Verse", "Hangover", " Hangover 2", "Leech"],
    ["bahubaali", "demons", "The man who knew infinity", "The Big Sick"],
    ["manhunt", "wrong turn", "abcd", "Pale"],
    ["Pale", "deep learning", "  The Favourite", "Lame"],
    [" Hereditary", " Spider-Man: Into the Spider-Verse", "I", "  The Favourite"],
    ["bahubaali", "Whose Streets? ", "bahubaali 2"],
    ["Hangover", "deep learning", "Lame", "demons"],
    ["BPM", "Whose Streets? ", "Upgrade"],
    ["Guardians of the Galaxy Vol. 2 ", "Blindspotting", " Under the Silver Lake", "Columbus", "Avengers: Infinity War"],
    ["Brawl in Cell Block 99 ", "Nocturama", "The Post "]
]

Now that we have entered the data, we will find the most popular movies. Using a simple frequency count, we first compute the most popular interests overall, and then, for a given user, the 'most popular new interests': the popular movies that the user has not already listed.

In [4]:
# Count how many users list each title, then order the (title, count) pairs
# from the most frequently listed title to the least.
popular_interests = Counter(
    title
    for liked_titles in users_interests
    for title in liked_titles
).most_common()

print("___________Popular Interests :________________")
print(popular_interests)

def most_popular_new_interests(user_interests, max_results=5):
    """Suggest the most popular interests the user does not already have.

    Walks the global ``popular_interests`` list (pairs of
    ``(interest, frequency)``, most frequent first) and keeps every pair
    whose interest is not contained in ``user_interests``.

    Returns the first ``max_results`` of those pairs, in the same order as
    they appear in ``popular_interests``.
    """
    unseen = []
    for title, count in popular_interests:
        if title in user_interests:
            # The user already has this one; it is not a "new" interest.
            continue
        unseen.append((title, count))
    return unseen[:max_results]

print("___________Most Popular New Interests___________")
# Each entry pairs the label printed in front of a list of already-liked
# titles with that list; the same list is then used to ask for suggestions.
for label, liked in (
    ("already like:",
     ["Annihilation", "Avengers: Infinity War", "Roma", "Blindspotting", " Under the Silver Lake"]),
    ("___________already like:___________",
     ["Whose Streets? ", "Upgrade", "bahubaali", "Nocturama", "demons"]),
):
    print(label, liked)
    print(most_popular_new_interests(liked))
    print()


___________Popular Interests :________________
[('Upgrade', 4), ('Whose Streets? ', 4), ('  The Favourite', 3), ('Blindspotting', 3), (' Spider-Man: Into the Spider-Verse', 3), ('bahubaali', 3), ('Nocturama', 3), ('demons', 3), (' Hereditary', 2), ('Roma', 2), ('Avengers: Infinity War', 2), (' Under the Silver Lake', 2), ('wrong turn', 2), ('bahubaali 2', 2), ('BPM', 2), ('manhunt', 2), ('Brawl in Cell Block 99 ', 2), ('Hangover', 2), ('Pale', 2), ('deep learning', 2), ('Lame', 2), ('Mission: Impossible - Fallout', 1), (' Sorry to Bother You', 1), ('Annihilation', 1), ('No Mercy', 1), ('La la Land', 1), ('River', 1), (' Hangover 2', 1), ('Leech', 1), ('The man who knew infinity', 1), ('The Big Sick', 1), ('abcd', 1), ('I', 1), ('Guardians of the Galaxy Vol. 2 ', 1), ('Columbus', 1), ('The Post ', 1)]
___________Most Popular New Interests___________
already like: ['Annihilation', 'Avengers: Infinity War', 'Roma', 'Blindspotting', ' Under the Silver Lake']
[('Upgrade', 4), ('Whose Street

In [5]:
#
# user-based filtering
#

def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    If either vector is all zeros, the denominator is zero and the angle is
    undefined. In that case 0.0 ("no similarity") is returned instead of
    raising ``ZeroDivisionError``. Callers in this notebook keep only
    similarities greater than zero, so such vectors are simply ignored.

    ``v`` and ``w`` must have the same length (checked by ``dot``).
    """
    # Evaluate the numerator first so that the length check inside dot()
    # still runs even when one of the vectors is all zeros.
    numerator = dot(v, w)
    denominator = math.sqrt(dot(v, v) * dot(w, w))
    if denominator == 0:
        return 0.0
    return numerator / denominator

# Every distinct title that appears in any user's list, in sorted order.
# A title's position in this list serves as its interest ID below.
unique_interests = sorted({
    title
    for liked_titles in users_interests
    for title in liked_titles
})

def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector over ``unique_interests``.

    The returned list has one entry per element of the global
    ``unique_interests``: entry ``i`` is 1 when ``unique_interests[i]`` is
    contained in ``user_interests`` and 0 otherwise.
    """
    vector = []
    for known_interest in unique_interests:
        vector.append(int(known_interest in user_interests))
    return vector

# Row u is the 0/1 interest vector of user u.
user_interest_matrix = [make_user_interest_vector(liked_titles)
                        for liked_titles in users_interests]

# user_similarities[i][j] is the cosine similarity between the interest
# vectors of users i and j.
user_similarities = [
    [cosine_similarity(row, other_row) for other_row in user_interest_matrix]
    for row in user_interest_matrix
]

def most_similar_users_to(user_id):
    """Return the other users that share some similarity with ``user_id``.

    Reads row ``user_id`` of the global ``user_similarities`` and returns a
    list of ``(other_user_id, similarity)`` pairs for every other user whose
    similarity is greater than zero, sorted with the most similar first.
    """
    candidates = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        # Skip the user themself and anyone without positive similarity.
        if other_user_id == user_id or not similarity > 0:
            continue
        candidates.append((other_user_id, similarity))

    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates


def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for ``user_id`` based on what similar users like.

    Every interest of every similar user (see ``most_similar_users_to``)
    receives that user's similarity as weight; weights for the same interest
    are added together.

    Returns a list of ``(interest, weight)`` pairs sorted by weight, highest
    first. Unless ``include_current_interests`` is true, interests that
    ``user_id`` already has are left out.
    """
    weight_by_interest = defaultdict(float)
    for neighbour_id, neighbour_similarity in most_similar_users_to(user_id):
        for title in users_interests[neighbour_id]:
            weight_by_interest[title] += neighbour_similarity

    ranked = sorted(weight_by_interest.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    if include_current_interests:
        return ranked

    # Drop whatever the user has already listed.
    already_liked = users_interests[user_id]
    return [(title, weight)
            for title, weight in ranked
            if title not in already_liked]

In [6]:
# Transpose of user_interest_matrix: row j holds, for each user in turn,
# the 0/1 flag saying whether that user has interest j.
interest_user_matrix = [
    [interest_vector[interest_index] for interest_vector in user_interest_matrix]
    for interest_index in range(len(unique_interests))
]

# interest_similarities[i][j] is the cosine similarity between the user
# columns of interests i and j.
interest_similarities = [
    [cosine_similarity(column, other_column) for other_column in interest_user_matrix]
    for column in interest_user_matrix
]

def most_similar_interests_to(interest_id):
    """Return the interests most similar to the one with ID ``interest_id``.

    ``interest_id`` is an index into the global ``unique_interests``. The
    result is a list of ``(interest_name, similarity)`` pairs for every
    other interest whose similarity is greater than zero, sorted with the
    most similar first.
    """
    matches = []
    for other_interest_id, similarity in enumerate(interest_similarities[interest_id]):
        # Skip the interest itself and anything without positive similarity.
        if other_interest_id == interest_id or not similarity > 0:
            continue
        matches.append((unique_interests[other_interest_id], similarity))

    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches

def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for ``user_id`` based on item-to-item similarity.

    For each interest the user has (a 1 in their row of
    ``user_interest_matrix``), the similarities of all similar interests
    (see ``most_similar_interests_to``) are added up per interest.

    Returns a list of ``(interest, weight)`` pairs sorted by weight, highest
    first. Unless ``include_current_interests`` is true, interests that
    ``user_id`` already has are left out.
    """
    weight_by_interest = defaultdict(float)
    for interest_id, flag in enumerate(user_interest_matrix[user_id]):
        if flag != 1:
            continue
        for title, similarity in most_similar_interests_to(interest_id):
            weight_by_interest[title] += similarity

    ranked = sorted(weight_by_interest.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    if include_current_interests:
        return ranked

    # Drop whatever the user has already listed.
    already_liked = users_interests[user_id]
    return [(title, weight)
            for title, weight in ranked
            if title not in already_liked]

In [7]:

    # Demo: print user-based and item-based recommendations for user 0.
    # NOTE(review): the code in this cell is indented; it presumably relies on
    # the notebook front end stripping the common leading indent - confirm
    # before exporting the notebook to a plain .py script.
    print("User based similarity")
    print("most similar to 0")
    print(most_similar_users_to(0))

    print("Suggestions for 0")
    print(user_based_suggestions(0))
    print()

    print("Item based similarity")
    # Interest IDs are positions in the sorted unique_interests list, so the
    # title behind ID 0 depends on the data. Build the label from that list
    # so it always names the title that is actually being queried.
    print("most similar to '{}'".format(unique_interests[0]))
    print(most_similar_interests_to(0))
    print()

    print("suggestions for user 0")
    print(item_based_suggestions(0))

User based similarity
most similar to 0
[(9, 0.5669467095138409), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]
Suggestions for 0
[('I', 0.5669467095138409), ('Avengers: Infinity War', 0.50709255283711), (' Under the Silver Lake', 0.50709255283711), ('Annihilation', 0.3380617018914066), ('Pale', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('Lame', 0.1889822365046136), ('Guardians of the Galaxy Vol. 2 ', 0.1690308509457033), ('Columbus', 0.1690308509457033), ('Upgrade', 0.1543033499620919), ('Whose Streets? ', 0.1543033499620919), ('Hangover', 0.1543033499620919), (' Hangover 2', 0.1543033499620919), ('Leech', 0.1543033499620919)]

Item based similarity
most similar to '  The Favourite'
[(' Hereditary', 0.8164965809277261), (' Spider-Man: Into the Spider-Verse', 0.6666666666666666), (' Sorry to Bother You', 0.5773502691896258), ('I', 0.5773502691896258), ('Mission: Impossible - Fallout', 0.5773502691896258), ('Lame',