# Dit hebben we gebruikt om vooral overzichtelijk dingen te kunnen proberen en hebben we voor de zekerheid toegevoegd

In [1]:
# %load data.py

import pandas as pd


"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

import os
import json
import random

DATA_DIR = "data"


def load_cities():
    """
    Finds all cities (all directory names) in ./data
    Returns a list of city names
    """
    return os.listdir(DATA_DIR)


def load(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from ./data/<city>/<data_filename>.json
    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }
    """
    data = {}
    for city in cities:
        city_data = []
        with open(f"{DATA_DIR}/{city}/{data_filename}.json", "r") as f:
            for line in f:
                city_data.append(json.loads(line))
        data[city] = city_data
    return data


def get_business(city, business_id):
    """
    Given a city name and a business id, return that business's data.
    Returns a dictionary of the form:
        {
            name:str,
            business_id:str,
            stars:str,
            ...
        }
    """
    for business in BUSINESSES[city]:
        if business["business_id"] == business_id:
            return business
    raise IndexError(f"invalid business_id {business_id}")


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Given a city name and optionally a business id and/or auser id,
    return n reviews for that business/user combo in that city.
    Returns a dictionary of the form:
        {
            text:str,
            stars:str,
            ...
        }
    """
    def should_keep(review):
        if business_id and review["business_id"] != business_id:
            return False
        if user_id and review["user_id"] != user_id:
            return False
        return True

    reviews = REVIEWS[city]
    reviews = [review for review in reviews if should_keep(review)]
    return random.sample(reviews, min(n, len(reviews)))


def get_user(username):
    """
    Get a user by its username
    Returns a dictionary of the form:
        {
            user_id:str,
            name:str,
            ...
        }
    """
    for city, users in USERS.items():
        for user in users:
            if user["name"] == username:
                return user
    raise IndexError(f"invalid username {username}")


CITIES = load_cities()
CITIES = [city for city in CITIES if city != '.DS_Store']
USERS = load(CITIES, "user")
BUSINESSES = load(CITIES, "business")
REVIEWS = load(CITIES, "review")
TIPS = load(CITIES, "tip")
CHECKINS = load(CITIES, "checkin") 

In [2]:
def extract_genres(movies):
    """Create an unfolded genre dataframe. Unpacks genres seprated by a '|' into seperate rows.

    Arguments:
    movies -- a dataFrame containing at least the columns 'movieId' and 'genres' 
              where genres are seprated by '|'
    """
    genres_m = movies.apply(lambda row: pd.Series([row['business_id']] + row['categories'].lower().split(", ")), axis=1)
    stack_genres = genres_m.set_index(0).stack()
    df_stack_genres = stack_genres.to_frame()
    df_stack_genres['business_id'] = stack_genres.index.droplevel(1)
    df_stack_genres.columns = ['categorie', 'business_id']
    return df_stack_genres.reset_index()[['business_id', 'categorie']]




In [3]:
def la_place(frame):
    
    
    # We creeren een score per uniek business ID
    returnframe = frame["business_id"].unique()
    
    business_ratings = [] 
    
    # sla de business id op met hoe vaak elke score bij deze business voorkomt.
    for row in frame.groupby('business_id'):
        business_ratings.append((row[0], row[1]["stars"].value_counts()))
    
    
    finalscore = []
    
    # voor elke business
    
    for item in range(len(business_ratings)):
    
        scores = []
        
        # bereken voor elke value in de ratings van de business (5, 4, 3, 2, 1) zijn individuele la place
        
        for value in business_ratings[item][1].index:

            probability = value*((business_ratings[item][1][value]+1)/(sum(business_ratings[item][1])+5))
            scores.append(probability)
        
        # de score voor het bedrijf is dus deze losse termen gesommeerd. 
        
        score = sum(scores)

        # sla de business met zijn la place score op.  
        finalscore.append([business_ratings[item][0], score])
    
    df_laplace = pd.DataFrame(finalscore)
    df_laplace.columns = ['business_id', 'lapscore']
    return df_laplace

In [29]:

def pivot_genres(df):
    """Create a one-hot encoded matrix for genres.
    
    Arguments:
    df -- a dataFrame containing at least the columns 'movieId' and 'genre'
    
    Output:
    a matrix containing '0' or '1' in each cell.
    1: the movie has the genre
    0: the movie does not have the genre
    """
    return df.pivot_table(index = 'business_id', columns = 'categorie', aggfunc = 'size', fill_value=0)


In [None]:
def select_neighborhood(similarity_matrix, utility_matrix, target_business):
    """selects all items with similarity > 0"""
    similar = list(similarity_matrix[similarity_matrix[target_business] > 0].index)
    return similarity_matrix[target_business]


bus = 'zh_AIXt_wELJJLPfTmJcPw'



topneighborhood = select_neighborhood(df_similarity_categories, df_bus_utility, bus).drop(bus).sort_values(ascending=False)[:10].index

gesorteerde_topneighborhood = list(df_bus[df_bus['business_id'].isin(topneighborhood)].sort_values(by = 'stars', ascending = False).transpose().to_dict().values())


# Deel voor het testen van ons systeem

In [31]:
import sklearn.metrics.pairwise as pw
import pandas as pd
import numpy as np

In [32]:
def number_of_movies(ratings):
    """Determine the number of unique movie id's in the data.
    
    Arguments:
    ratings -- a dataFrame containing a column 'movieId'
    """
    return len(ratings['movieId'].unique())

def number_of_users(ratings):
    """Determine the number of unique user id's in the data.    
    
    Arguments:
    ratings -- a dataFrame containing a column 'userId'
    """
    return len(ratings['userId'].nique())

def number_of_ratings(ratings):
    """Count the number of ratings of a dataset.
    
    Arguments:
    ratings -- a dataFrame.
    """
    return ratings.shape[0]

def rating_density(ratings):
    """Compute the ratings given a dataset.
    
    Arguments:
    ratings -- a dataFrame contasining the columns 'userId' and 'movieId'
    """
    return number_of_ratings(ratings) / (number_of_movies(ratings) * number_of_users(ratings))

def split_data(data, d = 0.75):
    """Split data in a training and test set.
    
    Arguments:
    data -- any dataFrame.
    d    -- the fraction of data in the training set
    """
    np.random.seed(seed=5)
    mask_test = np.random.rand(data.shape[0]) < d
    return data[mask_test], data[~mask_test]

In [33]:
list_of_frames = []
for city in REVIEWS:
    frame = pd.DataFrame(REVIEWS[city])
    list_of_frames.append(frame)

df_totaal_rev = pd.concat(list_of_frames)
df_new = df_totaal_rev.groupby(['user_id']).size()
df_new = df_new[df_new > 10]
lijst_veel_reviews_users = list(df_new.index)
df_met_veel_reviews = df_totaal_rev[df_totaal_rev['user_id'].isin(lijst_veel_reviews_users)]
datapath = "ml-latest-small"

df_werkbaar = df_met_veel_reviews.drop(columns=['cool', 'funny', 'useful','text','review_id','date'])
df_werkbaar = df_werkbaar.drop_duplicates(subset=['business_id', 'user_id'], keep='first')

df_ratings_training, df_ratings_test = split_data(df_werkbaar, d=0.9)

In [34]:
def pivot_ratings(df):
    """Creates a utility matrix for user ratings for movies
    
    Arguments:
    df -- a dataFrame containing at least the columns 'movieId' and 'genres'
    
    Output:
    a matrix containing a rating in each cell. np.nan means that the user did not rate the movie
    """
    return df.pivot(values='stars', columns='user_id', index='business_id')

def create_similarity_matrix_cosine(matrix):
    """Creates a adjusted(/soft) cosine similarity matrix.
    
    Arguments:
    matrix -- a utility matrix
    
    Notes:
    Missing values are set to 0. This is technically not a 100% correct, but is more convenient 
    for computation and does not have a big effect on the outcome.
    """
    mc_matrix = matrix - matrix.mean(axis = 0)
    return pd.DataFrame(pw.cosine_similarity(mc_matrix.fillna(0)), index = matrix.index, columns = matrix.index)

In [35]:
df_utility_ratings = pivot_ratings(df_ratings_training)
df_similarity_ratings = create_similarity_matrix_cosine(df_utility_ratings)

In [36]:
def predict_ratings(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    # copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    # apply prediction to each row
    ratings_test_c['predicted rating'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['user_id'], row['business_id']), axis=1)
    return ratings_test_c

### Helper functions for predict_ratings_item_based ###

def predict_ids(similarity, utility, userId, itemId):
    # select right series from matrices and compute
    if userId in utility.columns and itemId in similarity.index:
        return predict_vectors(utility.loc[:,userId], similarity[itemId])
    return 0

def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm

In [37]:
def mse(predicted_ratings):
    """Computes the mean square error between actual ratings and predicted ratings
    
    Arguments:
    predicted_ratings -- a dataFrame containing the columns rating and predicted rating
    """
    diff = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    return (diff**2).mean()

# Utility matrix maken op basis van genres

In [38]:
list_of_frames_businesses = []
for city in BUSINESSES:
    frame = pd.DataFrame(BUSINESSES[city])
    list_of_frames_businesses.append(frame)

df_totaal_bus = pd.concat(list_of_frames_businesses)
df_totaal_bus = df_totaal_bus[df_totaal_bus['business_id'].isin(list(df_werkbaar['business_id'].values))]
df_totaal_bus.fillna(value='Not Available', inplace=True)
df_bus_for_utility = extract_genres(df_totaal_bus)
df_bus_for_utility_complete = pivot_genres(df_bus_for_utility)

In [39]:
import numpy as np
def create_similarity_matrix_categories(matrix):
    """Create a  """
    npu = matrix.values
    m1 = npu @ npu.T
    diag = np.diag(m1)
    m2 = m1 / diag
    m3 = np.minimum(m2, m2.T)
    return pd.DataFrame(m3, index = matrix.index, columns = matrix.index)

In [40]:
df_predicted_cf_item_based = predict_ratings(df_similarity_ratings, df_utility_ratings, df_ratings_test)

In [41]:
df_predicted_content_based = create_similarity_matrix_categories(df_bus_for_utility_complete)

In [42]:
from random import randrange
df_ratings_random = df_werkbaar.copy()
df_ratings_random['predicted rating'] = [randrange(1,5) for i in range(len(df_werkbaar))]
mse_random = mse(df_ratings_random)
print(f'mse for random prediction: {mse_random:.2f}')

mse for random prediction: 4.06


In [43]:
mse_cf_item_based = mse(df_predicted_cf_item_based)
print(f'mse for item based collaborative filtering {mse_cf_item_based:.2f}')

mse for item based collaborative filtering 5.02


In [44]:
df_predicted_on_genre = predict_ratings(df_predicted_content_based,df_utility_ratings, df_ratings_test)
mse_genres =  mse(df_predicted_on_genre)
print(f'mse for content based filtering: {mse_genres:.2f}')

mse for content based filtering: 2.93
