In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split

In [4]:
movie_data = pd.read_csv("../useful_dataset/recommendations/movies.csv")
rating_data = pd.read_csv("../useful_dataset/recommendations/ratings.csv")

In [6]:
svd_model = SVD(42)

# Prepare data for the Surprise library
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(rating_data[['userId', 'movieId', 'rating']], reader)

# Split into training and testing sets
trainset, _ = train_test_split(data, test_size=.2)

# Train the SVD model
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21812ba38f0>

In [7]:
# Train the KNN model
sim_options = {'name': 'cosine', 'user_based': False}  # Item-based similarity
knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x218437fc3b0>

In [None]:
def recommend_movies(self, liked_movie, top_n=10):
    # Find movieId of liked_movie
    # self._load_data()
    movie = self.movie_data[self.movie_data['title'].str.contains(liked_movie, case=False)]
    if len(movie) == 0:
        return "Movie not found in database"

    liked_movie_id = movie['movieId'].values[0]

    # Predict ratings for all movies
    all_movie_ids = self.movie_data['movieId'].unique()
    predicted_ratings = [(mid, self.svd_model.predict(uid=1, iid=mid).est) for mid in all_movie_ids]

    # Sort by estimated rating
    sorted_predictions = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)
    best_match = next((mid for mid, _ in sorted_predictions if mid != liked_movie_id), None)

    # Get the title of the best match
    best_match_title = self.movie_data[self.movie_data['movieId'] == best_match]['title'].values[0:top_n]

    # Filter out the liked movie and fetch titles
    # recommendations = [(self.movie_data[self.movie_data['movieId'] == mid]['title'].values[0], score) for mid, score
    #                    in sorted_predictions if mid != liked_movie_id][:top_n]

    return best_match_title

In [None]:
def recommend_movies_using_kkn(self, liked_movie, top_n=10):
    """Recommend movies similar to the liked_movie."""
    # Find the movie ID
    liked_movie_row = self.movie_data[self.movie_data['title'].str.contains(liked_movie, case=False)]
    if liked_movie_row.empty:
        return "Movie not found in database."

    liked_movie_id = liked_movie_row['movieId'].values[0]

    # Get the inner ID for the liked movie
    liked_movie_inner_id = self.knn_model.trainset.to_inner_iid(liked_movie_id)

    # Find neighbors
    neighbors = self.knn_model.get_neighbors(liked_movie_inner_id, k=top_n)

    # Convert inner IDs to movie IDs
    recommended_ids = [self.knn_model.trainset.to_raw_iid(inner_id) for inner_id in neighbors]

    # Fetch titles for recommended movie IDs
    recommended_titles = self.movie_data[self.movie_data['movieId'].isin(recommended_ids)]['title'].tolist()

    return recommended_titles

# Version 253156714532451


In [27]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import numpy as np

In [28]:
# Load the MovieLens dataset
ratings_file_path = "../useful_dataset/recommendations/ratings.csv"  # Path to MovieLens ratings.csv
movies_file_path = "../useful_dataset/recommendations/movies.csv"  # Path to MovieLens movies.csv
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(ratings_file_path, reader=reader)

In [29]:
# Load movies data to get movie names
df_movies = pd.read_csv(movies_file_path)
movie_id_to_name = dict(zip(df_movies['movieId'], df_movies['title']))
movie_name_to_id = dict(zip(df_movies['title'], df_movies['movieId']))

In [30]:
# Ensure selected movies are part of the training set
selected_movie_ids = [movie_name_to_id.get(name) for name in ["Toy Story (1995)", "Jumanji (1995)"] if name in movie_name_to_id]
filtered_ratings = data.raw_ratings
filtered_ratings = [rating for rating in filtered_ratings if rating[1] in selected_movie_ids]
data.raw_ratings = filtered_ratings + data.raw_ratings  # Adding the selected movies to ensure they're part of the dataset

In [31]:
# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

In [32]:
# Use KNNBasic algorithm to train a collaborative filtering model
sim_options = {
    'name': 'cosine',
    'user_based': False  # Item-based collaborative filtering
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x23c4c8ffdd0>

In [None]:
# Function to get movie recommendations
def get_movie_recommendations(movie_names, num_recommendations=5):
    # Convert movie names to movie IDs
    movie_ids = [movie_name_to_id.get(name) for name in movie_names if name in movie_name_to_id]

    # Find movie inner IDs from raw IDs
    movie_inner_ids = []
    for movie_id in movie_ids:
        try:
            movie_inner_ids.append(trainset.to_inner_iid(movie_id))
        except ValueError:
            print(f"Movie ID {movie_id} is not part of the trainset. Skipping this movie.")

    if not movie_inner_ids:
        print("No valid movie IDs provided.")
        return []

    # Calculate similarity scores for the provided movies
    similarities = np.zeros(trainset.n_items)
    for inner_id in movie_inner_ids:
        movie_similarities = model.sim[inner_id]
        similarities += movie_similarities

    # Sort the items based on similarity scores
    similar_items = np.argsort(similarities)[::-1]

    # Filter out the provided movies
    recommendations = [trainset.to_raw_iid(inner_id) for inner_id in similar_items if inner_id not in movie_inner_ids]

    # Get movie names for recommendations
    recommended_movie_names = [movie_id_to_name[int(movie_id)] for movie_id in recommendations[:num_recommendations] if int(movie_id) in movie_id_to_name]

    return recommended_movie_names

In [38]:
# Example usage
movie_names = ["Tomy Story (1995)", "Jumanji (1995)"]  # Provide movie names for which recommendations are needed
recommended_movies = get_movie_recommendations(movie_names, num_recommendations=5)
print(f'Recommended Movies: {recommended_movies}')

Movie ID 1 is not part of the trainset. Skipping this movie.
Movie ID 2 is not part of the trainset. Skipping this movie.
No valid movie IDs provided.
Recommended Movies: []


# Version 253156714532451 + 2439012348903491283672348981995

In [1]:
from app.config.enums import Environment
from app.services.sparql_graph import SPARQLGraph

In [2]:
sparql_graph = SPARQLGraph(Environment.PROD, False)  # Your initialized SPARQL graph

Metadata loaded successfully from JSON files.
Initializing SPARQLGraph
Graph loaded with 2056777 triples after 0:00:53.265310


In [4]:
def get_movie_attributes(movie_id: str) -> dict:
    """
    Queries the knowledge graph to get all attributes and values for a specific movie.
    """
    query = f"""PREFIX wd: <http://www.wikidata.org/entity/>
                    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                    SELECT ?property ?value
                    WHERE {{
                        wd:{movie_id} ?property ?value .
                    }}"""
    results = sparql_graph.execute_query_plain_answer(query)
    print("Movie Result attributes", results)
    attributes = {}
    for result in results:
        property_, value = result
        if property_ and value:
            attributes[property_] = value
    return attributes

def find_common_attributes(movies: list) -> dict:
    """
    Finds attributes that are common among multiple movies and have the same values.
    """
    movie_attributes = [get_movie_attributes(movie) for movie in movies]

    # Start with the attributes and values of the first movie
    common_attributes = movie_attributes[0]

    # Iterate over the remaining movies
    for attributes in movie_attributes[1:]:
        # Keep only attributes that exist in the current movie and have the same value
        common_attributes = {
            attr: value
            for attr, value in common_attributes.items()
            if attr in attributes and attributes[attr] == value
        }

    return common_attributes


def find_movie_by_attributes(common_attributes, movie_ids: list):
    # Construct SPARQL filters with proper handling for IRIs and literals
    filters = "\n".join(
        f"{format_sparql_value("?movie", attr, value, ".")}" for attr, value in common_attributes.items()
    )
    query = f"""PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT ?movie
                WHERE {{
                    {filters}
                    # Exclude specific movies
                    # FILTER (?movie NOT IN ({", ". join(["wd:" + item for item in movie_ids])}))
                }}
                LIMIT 10"""

    print(f"SPARQL Query:\n{query}")  # Debugging
    response = sparql_graph.execute_query(query)
    response_lbls = [sparql_graph.get_lbl_for_ent(item) for item in response.split("\n") if item]
    print(response_lbls)
    #response_lbl = sparql_graph.get_lbl_for_ent(response)
    print("Response:", response_lbls)  # Print response for debugging
    return response_lbls

def get_similar_movie(movie_ids: list) -> str:
    """
    Finds a movie similar to the given movies based on common attributes.
    """
    common_attributes = find_common_attributes(movie_ids)
    to_delete = []
    for key, value in common_attributes.items():
        relation = sparql_graph.get_lbl_for_ent(str(key))
        val = sparql_graph.get_lbl_for_ent(str(value))
        # if val == "Unknown Label":
        #     to_delete.append(key)
        #     continue

        print(f"{relation}: {val}")  # Debugging

    for key in to_delete:
        del common_attributes[key]

    return find_movie_by_attributes(common_attributes, movie_ids)

def format_sparql_value(prefix, relation, value, suffix):
    """
    Formats a value for use in a SPARQL query.
    - IRIs are enclosed in < >.
    - Strings are enclosed in quotes, optionally with language tags.
    - Dates and other literals use appropriate SPARQL formats.
    """

    # if "http://www.wikidata.org/entity/" in value:
    #     return f"{prefix} wd:{value.split("/")[-1]} {suffix}"
    # elif "http://www.wikidata.org/propt/direct/" in value:
    #     return f"{prefix} wdt:{value.split("/")[-1]} {suffix}"
    # else:
    #     return ""

    return f"{prefix} <{str(relation)}> <{str(value)}> {suffix}"

    # return f"{prefix} wdt:{str(relation).split("/")[-1]} wd:{str(value).split("/")[-1]} {suffix}"

movie_ids = [sparql_graph.get_id_for_movie("The Matrix").split("/")[-1], sparql_graph.get_id_for_movie("Pulp Fiction").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("The Lord of the Rings: The Fellowship of the Ring").split("/")[-1], sparql_graph.get_id_for_movie("The Princess Bride").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Avengers: Endgame").split("/")[-1], sparql_graph.get_id_for_movie("Thor: Ragnarok").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Inception").split("/")[-1], sparql_graph.get_id_for_movie("The Matrix").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Kill Bill: Volume 1").split("/")[-1], sparql_graph.get_id_for_movie("Django Unchained").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

Movie Result attributes <rdflib.plugins.sparql.processor.SPARQLResult object at 0x000002A3C7697140>
Movie Result attributes <rdflib.plugins.sparql.processor.SPARQLResult object at 0x000002A3C77F0590>
FSK film rating: FSK 16
aspect ratio: 2.35:1
instance of: film
original language of film or TV show: English
distribution format: video on demand
color: color
assessment: Bechdel test
distributed by: Netflix
SPARQL Query:
PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT ?movie
                WHERE {
                    ?movie <http://www.wikidata.org/prop/direct/P1981> <http://www.wikidata.org/entity/Q20644797> .
?movie <http://www.wikidata.org/prop/direct/P2061> <http://www.wikidata.org/entity/Q21062414> .
?movie <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q11424> .
?movie <http://www.wikidata.org/prop/direct/P364> <http://www.wikidata.org/entity/Q1860> .
?movie <http://www

ValueError: not enough values to unpack (expected 2, got 1)

In [17]:
def get_movie_attributes(movie_id: str) -> dict:
    """
    Queries the knowledge graph to get all attributes and values for a specific movie.
    """
    query = f"""PREFIX wd: <http://www.wikidata.org/entity/>
                    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                    SELECT ?property ?value
                    WHERE {{
                        wd:{movie_id} ?property ?value .
                    }}"""
    results = sparql_graph.execute_query_plain_answer(query)
    print("Movie Result attributes", results)
    attributes = {}
    for result in results:
        print(result)
        property_, value = result
        if property_ and value:
            attributes[property_] = value
    return attributes

def find_common_attributes(movies: list) -> dict:
    """
    Finds attributes that are common among multiple movies and have the same values.
    """
    movie_attributes = [get_movie_attributes(movie) for movie in movies]

    # Start with the attributes and values of the first movie
    common_attributes = movie_attributes[0]

    # Iterate over the remaining movies
    for attributes in movie_attributes[1:]:
        # Keep only attributes that exist in the current movie and have the same value
        common_attributes = {
            attr: value
            for attr, value in common_attributes.items()
            if attr in attributes and attributes[attr] == value
        }

    return common_attributes


def find_movie_by_attributes(common_attributes, movie_ids: list):
    # Construct SPARQL filters with proper handling for IRIs and literals
    filters = "\n".join(
        f"{format_sparql_value("?movie", attr, value, ".")}" for attr, value in common_attributes.items()
    )
    query = f"""PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT ?movie
                WHERE {{
                    {filters}
                    # Exclude specific movies
                    # FILTER (?movie NOT IN ({", ". join(["wd:" + item for item in movie_ids])}))
                }}
                LIMIT 10"""

    print(f"SPARQL Query:\n{query}")  # Debugging
    response = sparql_graph.execute_query(query)
    response_lbls = [sparql_graph.get_lbl_for_ent(item) for item in response.split("\n") if item]
    print(response_lbls)
    #response_lbl = sparql_graph.get_lbl_for_ent(response)
    print("Response:", response_lbls)  # Print response for debugging
    return response

def get_similar_movie(movie_ids: list) -> str:
    """
    Finds a movie similar to the given movies based on common attributes.
    """
    common_attributes = find_common_attributes(movie_ids)
    to_delete = []
    for key, value in common_attributes.items():
        relation = sparql_graph.get_lbl_for_ent(str(key))
        val = sparql_graph.get_lbl_for_ent(str(value))
        if "http" in val:
            to_delete.append(key)
            continue

        print(f"{relation}: {val}")  # Debugging

    for key in to_delete:
        del common_attributes[key]

    return find_movie_by_attributes(common_attributes, movie_ids)

def format_sparql_value(prefix, relation, value, suffix):
    """
    Formats a value for use in a SPARQL query.
    - IRIs are enclosed in < >.
    - Strings are enclosed in quotes, optionally with language tags.
    - Dates and other literals use appropriate SPARQL formats.
    """

    # if "http://www.wikidata.org/entity/" in value:
    #     return f"{prefix} wd:{value.split("/")[-1]} {suffix}"
    # elif "http://www.wikidata.org/propt/direct/" in value:
    #     return f"{prefix} wdt:{value.split("/")[-1]} {suffix}"
    # else:
    #     return ""

    if "http" in str(value):
        value = f"<{str(value)}>"
    else:
        value = f"\"{str(value)}\"@en"

    return f"{prefix} <{str(relation)}> {value} {suffix}"

    # return f"{prefix} wdt:{str(relation).split("/")[-1]} wd:{str(value).split("/")[-1]} {suffix}"

movie_ids = [sparql_graph.get_id_for_movie("The Matrix").split("/")[-1], sparql_graph.get_id_for_movie("Pulp Fiction").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("The Lord of the Rings: The Fellowship of the Ring").split("/")[-1], sparql_graph.get_id_for_movie("The Princess Bride").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Avengers: Endgame").split("/")[-1], sparql_graph.get_id_for_movie("Thor: Ragnarok").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Inception").split("/")[-1], sparql_graph.get_id_for_movie("The Matrix").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

movie_ids = [sparql_graph.get_id_for_movie("Kill Bill: Volume 1").split("/")[-1], sparql_graph.get_id_for_movie("Django Unchained").split("/")[-1]]
print(f"Found a similar movie: {get_similar_movie(movie_ids)}\n\n")

Movie Result attributes <rdflib.plugins.sparql.processor.SPARQLResult object at 0x000002A3C78F9A90>
(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.Literal('The Matrix', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('action', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('alternate_reality', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('atmospheric', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('boring', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('cult', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('dark', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('dramatic', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdflib.term.Literal('fantasy', lang='en'))
(rdflib.term.URIRef('http://ddis.ch/atai/tag'), rdfl

ValueError: not enough values to unpack (expected 2, got 1)

# Version 253156714532451 + 2439012348903491283672348981995 + 458349759384573904

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
ml_ratings = pd.read_csv('../useful_dataset/recommendations/ratings.csv', usecols=['userId', 'movieId', 'rating'])
ml_matrix = ml_ratings.pivot(index='movieId', columns='userId', values='rating').filln(0)
ml_csr = csr_matrix(ml_matrix.values)
movie_df = pd.read_csv('../useful_dataset/recommendations/movies.csv', usecols=['movieId', 'title'])

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=17)
knn.fit(ml_csr[:, :])


# Something totally different

In [1]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import pandas as pd

In [2]:
# Step 1: Load the dataset
def load_movielens_data():
    # Load the MovieLens 100k dataset (ratings.csv)
    ratings = pd.read_csv("../useful_dataset/recommendations/ratings.csv")
    movies = pd.read_csv("../useful_dataset/recommendations/movies.csv")

    # Merge ratings and movies
    data = pd.merge(ratings, movies, on='movieId')
    return data, ratings, movies

data, ratings, movies = load_movielens_data()

In [3]:
# Step 2: Train a collaborative filtering model
def train_model(ratings):
    reader = Reader(rating_scale=(0.5, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=0.25)

    # Use SVD (Singular Value Decomposition)
    algo = SVD()
    algo.fit(trainset)
    return algo

algo = train_model(ratings)

In [4]:
# Step 3: Recommend a movie based on input movies
def recommend_movie(movie_titles, algo, data, ratings, movies):
    # Find movieIds for given titles
    movie_ids = movies[movies['title'].isin(movie_titles)]['movieId'].tolist()
    if not movie_ids:
        return "Movies not found in the dataset."

    # Find users who rated these movies highly
    users = ratings[ratings['movieId'].isin(movie_ids) & (ratings['rating'] >= 4.0)]['userId'].unique()

    # Predict ratings for these users for all movies
    all_movie_ids = movies['movieId'].unique()
    predicted_ratings = []
    for user in users:
        for movie_id in all_movie_ids:
            if movie_id not in movie_ids:  # Avoid recommending input movies
                pred = algo.predict(user, movie_id)
                predicted_ratings.append((movie_id, pred.est))

    # Sort by predicted rating
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)

    # Return the top recommended movie
    recommended_movie_id = predicted_ratings[0][0]
    recommended_movie_title = movies[movies['movieId'] == recommended_movie_id]['title'].iloc[0]
    return recommended_movie_title

In [5]:
# Main script
input_movies = ["Toy Story (1995)", "Jumanji (1995)"]  # Example movies
recommended_movie = recommend_movie(input_movies, algo, data, ratings, movies)
print(f"Recommended Movie: {recommended_movie}")

Recommended Movie: Usual Suspects, The (1995)
