In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from zipfile import ZipFile

from mimesis import Person
from mimesis.locales import Locale



In [2]:
DUMMY_DATA_DIR = Path("assets/dummy_data")

BOOK_TABEL_FILENAME = "books_df.csv"

USER_TABEL_FILENAME = "users_df.csv"

RATING_TABEL_FILNAME = "ratings_df.csv"


In [3]:
DATA_DIR = Path("assets/books.zip")

with ZipFile(DATA_DIR) as z:
    files = (file for file in z.namelist() if Path(file).suffix == ".csv")
    data = {Path(file).stem.lower(): pd.read_csv(z.extract(file),low_memory=False) for file in files}
    

In [4]:
data.get("books").sample(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
221332,517571684,Mary Emmerling's American Country Classics: Th...,Mary Ellisor Emmerling,1990,Rutland Group,http://images.amazon.com/images/P/0517571684.0...,http://images.amazon.com/images/P/0517571684.0...,http://images.amazon.com/images/P/0517571684.0...
124140,449134164,The House Guests (Gm),John D. MacDonald,1973,Fawcett Books,http://images.amazon.com/images/P/0449134164.0...,http://images.amazon.com/images/P/0449134164.0...,http://images.amazon.com/images/P/0449134164.0...
45771,553212834,"Four Tragedies: Hamlet, Othello, King Lear, Ma...",William Shakespeare,1988,Bantam,http://images.amazon.com/images/P/0553212834.0...,http://images.amazon.com/images/P/0553212834.0...,http://images.amazon.com/images/P/0553212834.0...


In [5]:
def generate_names(unique_ids: set[str]) -> dict[str, str]:
    
    """Generate Fake Names matching unique ids
    """
    number_of_ids = len(unique_ids)
    person = Person(Locale.DA)
    unique_names = set()

    # Generate the specified number of female names
    while len(unique_names) < number_of_ids:
        name = person.full_name()
        unique_names.add(name)

    return {id:name for id, name in zip(unique_ids, unique_names)}


def filter_ratings(data, num_samples=40, random_state=40):
    ratings_df = data['ratings']
    books_list = ratings_df.sample(num_samples, random_state=random_state).ISBN.to_list()
    filtered_ratings_df = ratings_df[ratings_df['ISBN'].isin(books_list)]

    bins = [0, 5, 10]
    labels = ["Did Not Like", 'Liked']
    rating_cat = pd.cut(filtered_ratings_df["Book-Rating"], bins=bins, labels=labels, right=False)

    filtered_ratings_df.loc[:, 'Rating_cat'] = rating_cat
    filtered_ratings_df.loc[:, 'ISBN'] = filtered_ratings_df["ISBN"].replace("0440514428","038531258X") # mistake in data
    filtered_ratings_df.to_csv(f"{DUMMY_DATA_DIR}/{RATING_TABEL_FILNAME}", index=False)
    return filtered_ratings_df

def generate_name_mapping(unique_ids):
    id_to_name = generate_names(unique_ids=unique_ids)
    return id_to_name

def generate_random_numbers_and_age_groups(size):
    np.random.seed(0)
    random_numbers = np.random.normal(loc=50, scale=20, size=size)
    random_numbers = np.clip(random_numbers, 8, 90)

    bins = [0, 11, 18, 25, 55, 110]
    labels = ['Kid', 'Teen', 'Young Adult', 'Adult', 'Old Person']
    age_group = pd.cut(random_numbers, bins=bins, labels=labels, right=False)
    age = random_numbers.round()

    return age_group, age

def generate_user_data(data):
    users_df = data["users"]
    ratings_df = filter_ratings(data)
    users_list = ratings_df["User-ID"].to_list()

    filtered_users_df = users_df[users_df["User-ID"].isin(users_list)]

    unique_ids = filtered_users_df["User-ID"].unique()
    id_to_name = generate_name_mapping(unique_ids)

    age_group, age = generate_random_numbers_and_age_groups(len(filtered_users_df))

    filtered_users_df.loc[:, 'User-Name'] = filtered_users_df["User-ID"].map(id_to_name)
    filtered_users_df.loc[:, 'AgeGroup'] = age_group
    filtered_users_df.loc[:, 'Age'] = age

    filtered_users_df.to_csv(f"{DUMMY_DATA_DIR}/{USER_TABEL_FILENAME}", index=False)
    return filtered_users_df

def process_books_data(data, preprocessed_users_df):
    books_df = data["books"][['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
    filtered_ratings_df = filter_ratings(data)

    books_df = books_df[books_df['ISBN'].isin(filtered_ratings_df["ISBN"].tolist())]
    
    # Define a list of genres
    genres = ['Mystery', 'Science Fiction', 'Romance', 'Fantasy', 'Thriller']
    
    # Add a 'Genre' column with randomly assigned genres
    books_df['Genre'] = np.random.choice(genres, size=len(books_df))
    books_df.to_csv(f"{DUMMY_DATA_DIR}/{BOOK_TABEL_FILENAME}", index=False)
    return books_df

filtered_users_df = generate_user_data(data)
processed_books_df = process_books_data(data, filtered_users_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings_df.loc[:, 'Rating_cat'] = rating_cat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_users_df.loc[:, 'User-Name'] = filtered_users_df["User-ID"].map(id_to_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_users_df.loc[:, 'AgeGroup'] = age_group
A value is trying

In [6]:
import duckdb

key_1 = "ISBN"
key_2 = "User-ID"

# Create a new table and insert the results of the JOIN query into it
duckdb.sql(
    f"""
    CREATE TABLE joined_table AS
    SELECT *
    FROM "{DUMMY_DATA_DIR}/{RATING_TABEL_FILNAME}" AS ratings
    JOIN "{DUMMY_DATA_DIR}/{BOOK_TABEL_FILENAME}" AS books ON (books.{key_1} = ratings.{key_1})
    JOIN "{DUMMY_DATA_DIR}/{USER_TABEL_FILENAME}" AS users ON (users."{key_2}" = ratings."{key_2}")
    """
)

In [7]:
# Select the data from the newly created table
dataf = duckdb.sql(
    """
    SELECT ISBN, "Book-Title", "Book-Author", "Year-Of-Publication", Publisher, Genre, "User-ID", "User-Name",
            "Book-Rating", "Rating_cat" ,Location, Age, AgeGroup
    FROM joined_table
    """
).df()#.sample(500)
dataf.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Genre', 'User-ID', 'User-Name', 'Book-Rating', 'Rating_cat',
       'Location', 'Age', 'AgeGroup'],
      dtype='object')

In [8]:
from ast import literal_eval
import json
# from decouple import config ".py
from decouple import AutoConfig # notebooks
import duckdb
import numpy as np
import pandas as pd
from mimesis import Person
from mimesis.locales import Locale

from neomodel import (config as cf,
                      db,
                      StructuredNode,
                      StringProperty,
                      StructuredRel,
                      DateTimeProperty,
                      RelationshipTo,
                      RelationshipFrom,
                      UniqueIdProperty,
                      FloatProperty,


)

from neomodel.contrib.spatial_properties import PointProperty, NeomodelPoint

In [9]:
cf.AUTO_INSTALL_LABELS = True
cf.ENCRYPTED_CONNECTION = False

#NEO4J_BOLT_URL="bolt://neo4j:metterocks@localhost:7687"
db.set_connection("bolt://neo4j:metterocks@localhost:7687")

db._NODE_CLASS_REGISTRY
db._NODE_CLASS_REGISTRY = {}



In [10]:
#delete all nodes
db.cypher_query(
"MATCH (n) DETACH DELETE n"
)

#delete all nodes and relationships
db.cypher_query(
"MATCH () <- [n] - () DELETE n"
)

([], [])

In [11]:
class global_values(StructuredRel):
   average_rating  = FloatProperty()
   typical_num_ratings = FloatProperty()

class Rated(StructuredRel):
    rating = FloatProperty()

class Book(StructuredNode):
    title = StringProperty(unique_index=True)
    author = StringProperty(unique_index=True)
    genre = StringProperty(required=True)
    isbn = UniqueIdProperty()

    written_by = RelationshipTo('Author', 'WRITTEN_BY')
    belongs_to = RelationshipTo('Genre', 'BELONGS_TO')
    rated_by = RelationshipTo('User', 'RATED', model=Rated)  # Add this line


class AgeGroup(StructuredNode):
    age = StringProperty(required=True)

class User(StructuredNode):
    uid = UniqueIdProperty()
    name = StringProperty(required=True)

    read = RelationshipTo('Book', 'READ')
    is_in = RelationshipTo('AgeGroup', 'IS_IN')
    liked = RelationshipTo('Book', 'LIKED')
    hate = RelationshipTo('Book', 'HATE')
    rated = RelationshipTo('Book', 'RATED', model=Rated)  # Add this line


class Author(StructuredNode):
    name = StringProperty(unique_index=True)

class Genre(StructuredNode):
    name = StringProperty(unique_index=True)
     

 + Creating node unique constraint for title on label Book for class __main__.Book
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=18, name='constraint_unique_Book_title', type='UNIQUENESS', schema=(:Book {title}), ownedIndex=17 )'.}
 + Creating node unique constraint for author on label Book for class __main__.Book
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=22, name='constraint_unique_Book_author', type='UNIQUENESS', schema=(:Book {author}), ownedIndex=21 )'.}
 + Creating node unique constraint for isbn on label Book for class __main__.Book
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=10, name='constraint_unique_Book_isbn', type='UNIQUENESS', schema=(:Book {isbn}), ownedIndex=9 )'.}
 + Creating node unique constraint for uid on l

In [12]:
# a = Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
#        'Genre', 'User-ID', 'Book-Rating', 'Location', 'Age', 'AgeGroup'],
#       dtype='object')

In [13]:
# USERS
for user_name, user_id in (dataf[["User-Name", 'User-ID']]
                  .drop_duplicates().values.tolist()
):
    User(name=user_name, uid=user_id).save()

# BOOKS
for book_title, isbn, book_author, book_genre in (dataf[["Book-Title", "ISBN", "Book-Author", "Genre"]]
                  .drop_duplicates().values.tolist()

):
    Book(title=book_title, uid=isbn, author=book_author, genre=book_genre).save()

# AUTHORS
for book_author in (dataf["Book-Author"]
                  .drop_duplicates().values.tolist()
):
    Author(name=book_author).save()

# AGEGROUPS
for age_group in (dataf["AgeGroup"]
                  .drop_duplicates().values.tolist()
):
    AgeGroup(age=age_group).save()

# Genres
for genre in (dataf["Genre"]
                  .drop_duplicates().values.tolist()
):
    Genre(name=genre).save()

In [14]:
for book_title, author_name, user_id, age_group_value, genre_name, rating in dataf[["Book-Title", "Book-Author", "User-ID", "AgeGroup", "Genre", "Book-Rating"]].drop_duplicates().values.tolist():
    # Find all matching books, authors, users, age groups, and genres
    books = Book.nodes.filter(title=book_title)
    authors = Author.nodes.filter(name=author_name)
    users = User.nodes.filter(uid=user_id)
    ages = AgeGroup.nodes.filter(age=age_group_value)
    genres = Genre.nodes.filter(name=genre_name)

    # Create relationships between books and authors, books and genres
    for book_node in books:
        for author_node in authors:
            book_node.written_by.connect(author_node)

        for genre_node in genres:
            book_node.belongs_to.connect(genre_node)

    # Create relationships between users and books (read and rated), users and age groups
    for user_node in users:
        for book_node in books:
            # Connect read relationship
            user_node.read.connect(book_node)

            # Connect rated relationship, assuming a 'Rated' relationship class exists
            user_node.rated.connect(book_node, {'rating': rating})

        #for age_node in ages:
            #user_node.is_in.connect(age_node)


In [15]:
df_like = dataf[dataf["Rating_cat"]=="Liked"]

In [16]:
for book_title, user_id, in df_like[["Book-Title", "User-ID"]].drop_duplicates().values.tolist():
    books = Book.nodes.filter(title=book_title)
    users = User.nodes.filter(uid=user_id)
    
    for book_node in books:
        for user_node in users:
            user_node.liked.connect(book_node)


In [17]:
# Calculate Global values used for statistics
db.cypher_query(
    """
    // Calculate global values
    MATCH (book:Book)<-[r:RATED]-(:User)
    WITH COUNT(r) AS TotalRatings, SUM(r.rating) AS SumRatings
    MATCH (book:Book)
    WITH TotalRatings, SumRatings, COUNT(book) AS TotalBooks

    // Now carry forward the global values for the Bayesian calculation
    WITH SumRatings / TotalRatings AS GlobalAverageRating, TotalRatings / TotalBooks AS TypicalNumRatings

    // Create or update the GlobalValues node
    MERGE (g:GlobalValues {id: 1})
    SET g.average_rating = GlobalAverageRating, g.typical_num_ratings = TypicalNumRatings
    RETURN g
    """
)

([[<Node element_id='4:e178d927-297a-4570-8eae-8af6f2348cf6:17547' labels=frozenset({'GlobalValues'}) properties={'typical_num_ratings': 42, 'average_rating': 2.8495934959349594, 'id': 1}>]],
 ['g'])

In [18]:
# Calculating Bayasian Average Rating for ench Book

db.cypher_query(
    """
    // Match the GlobalValues node and carry forward its properties
    MATCH (g:GlobalValues)
    WITH g.average_rating AS GlobalAverageRating, g.typical_num_ratings AS TypicalNumRatings

    // Calculate Bayesian average for each book and set it as a property
    MATCH (book:Book)<-[r:RATED]-(:User)
    WITH book, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings, GlobalAverageRating, TypicalNumRatings
    SET book.BayesianAverage = ((TypicalNumRatings * GlobalAverageRating) + (AverageRating * TotalRatings)) / (TypicalNumRatings + TotalRatings)
    RETURN book.title AS BookTitle, AverageRating, TotalRatings, book.BayesianAverage
    ORDER BY book.BayesianAverage DESC
    """
)


([["Big Stone Gap: A Novel (Ballantine Reader's Circle)",
   3.791208791208791,
   182,
   3.614655923344948],
  ['How to Lose Friends &amp; Alienate People', 4.5, 20, 3.3819826907946497],
  ['The Witching Hour (Lives of the Mayfair Witches)',
   4.096774193548387,
   31,
   3.3792181757434014],
  ['What Looks Like Crazy On An Ordinary Day',
   3.4739336492890995,
   211,
   3.370288248337029],
  ['Meeting Luciano (Ballantine Readers Circle)', 4.5, 10, 3.1669793621013134],
  ['The Dice Man', 4.428571428571429, 7, 3.0751617720258837],
  ["Santa's Holiday Treats: A Wilton Book of Recipes &amp; Ideas",
   8.0,
   1,
   2.9693703913783325],
  ['TELECOSM: How Infinite Bandwidth will Revolutionize Our World',
   3.6666666666666665,
   6,
   2.9517276422764227],
  ['Astrotherapy: Astrology and the Realization of the Self (Arkana S.)',
   7.0,
   1,
   2.946114577424844],
  ['Middlesex: A Novel', 2.972477064220184, 109, 2.938297528670651],
  ['A Place of Execution', 2.9615384615384612, 52, 2.9

In [19]:
# book_ratings = db.cypher_query(
# """
# // Calculate global values
# MATCH (book:Book)<-[r:RATED]-(:User)
# WITH COUNT(r) AS TotalRatings, SUM(r.rating) AS SumRatings
# MATCH (book:Book)
# WITH TotalRatings, SumRatings, COUNT(book) AS TotalBooks

# // Now carry forward the global values for the Bayesian calculation
# WITH SumRatings / TotalRatings AS GlobalAverageRating, TotalRatings / TotalBooks AS TypicalNumRatings, TotalRatings

# // Calculate Bayesian average for each book and set it as a property
# MATCH (book:Book)<-[r:RATED]-(:User)
# WITH book, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings, GlobalAverageRating, TypicalNumRatings
# SET book.BayesianAverage = ((TypicalNumRatings * GlobalAverageRating) + (AverageRating * TotalRatings)) / (TypicalNumRatings + TotalRatings)
# RETURN book.title AS BookTitle, AverageRating, TotalRatings, book.BayesianAverage
# ORDER BY book.BayesianAverage DESC
# """
# )
# ratings_df = pd.DataFrame(book_ratings[0], columns=["BookTitle","AverageRating", "TotalRatings", "BayesianAverage"])

In [20]:
author_ratings = db.cypher_query(
"""
// Calculate global values
MATCH (book:Book)<-[r:RATED]-(:User)
WITH COUNT(r) AS TotalRatings, SUM(r.rating) AS SumRatings
MATCH (book:Book)
WITH TotalRatings, SumRatings, COUNT(book) AS TotalBooks

// Now carry forward the global values for the Bayesian calculation
WITH SumRatings / TotalRatings AS GlobalAverageRating, TotalRatings / TotalBooks AS TypicalNumRatings

// Calculate Bayesian average for each author and set it as a property
MATCH (author:Author)<-[:WRITTEN_BY]-(book:Book)<-[r:RATED]-(:User)
WITH author, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings, GlobalAverageRating, TypicalNumRatings
SET author.BayesianAverage = ((TypicalNumRatings * GlobalAverageRating) + (AverageRating * TotalRatings)) / (TypicalNumRatings + TotalRatings)
RETURN author.name AS AuthorName, AverageRating, TotalRatings, author.BayesianAverage
ORDER BY author.BayesianAverage DESC
"""
)


In [21]:
def get_random_user_name():
        result = db.cypher_query(
            """
            MATCH (user:User)
            WITH COLLECT(user.name) AS names
            RETURN apoc.coll.randomItem(names) AS RandomUserName
        """)
        return result[0][0][0]

In [31]:
USER_NAME = get_random_user_name()
USER_NAME

'Inga Korsholm'

In [33]:
USER_NAME = get_random_user_name()
book_list = db.cypher_query(
    """
    MATCH (userA:User {name: $USER_NAME})-[:LIKED]->(sharedBook:Book)<-[:LIKED]-(otherUser:User)
    WHERE userA <> otherUser
    WITH DISTINCT otherUser
    MATCH (otherUser)-[:LIKED]->(book:Book)
    WITH DISTINCT book
    RETURN COLLECT(book.title) AS UniqueBooksReadByOthers
    """,
    params={"USER_NAME": USER_NAME}
)[0][0][0]
book_list

['What Looks Like Crazy On An Ordinary Day',
 'Saint Maybe',
 'Call of the Wild',
 'Nisa: The Life and Words of a !Kung Woman',
 'Trading Places',
 "Big Stone Gap: A Novel (Ballantine Reader's Circle)"]

In [45]:
db.cypher_query(
"""
MATCH (userA:User {name: $USER_NAME})-[:LIKED]->(sharedBooks:Book)<-[:LIKED]-(otherUser:User)
WITH DISTINCT otherUser, sharedBooks 
MATCH (otherUser) -[:LIKED]->(otherBooks:Book) //-[]->(g:Genre)
WHERE sharedBooks <> otherBooks
WITH sharedBooks.genre, otherBooks.genre
     CASE WHEN sharedBooks.genre = otherBooks.genre THEN otherBooks.BayesianAverage * 100 ELSE otherBooks.BayesianAverage END AS Score
RETURN otherBook.title AS RecTitle, Score

//RETURN otherBooks.title AS BookTitle, otherBooks.BayesianAverage AS BayesianAvg, otherBooks.genre AS Genre, sharedBooks.genre, g.name AS readBook_Genre, Score //, collect(DISTINCT sharedBooks.title, )
""",
    params={"USER_NAME": USER_NAME}
)

WITH otherBooks.title AS RecTitle, otherBooks.BayesianAverage AS initialScore,
     CASE WHEN SharedBookGenre = OtherBookGenre 
          THEN otherBooks.BayesianAverage * 100 
          ELSE otherBooks.BayesianAverage 
     END AS Score

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'CASE': expected
  "!="
  "%"
  "*"
  "+"
  ","
  "-"
  "/"
  "::"
  "<"
  "<="
  "<>"
  "="
  "=~"
  ">"
  ">="
  "AND"
  "AS"
  "CALL"
  "CONTAINS"
  "CREATE"
  "DELETE"
  "DETACH"
  "ENDS"
  "FOREACH"
  "IN"
  "IS"
  "LIMIT"
  "LOAD"
  "MATCH"
  "MERGE"
  "OPTIONAL"
  "OR"
  "ORDER"
  "REMOVE"
  "RETURN"
  "SET"
  "SKIP"
  "STARTS"
  "UNION"
  "UNWIND"
  "USE"
  "WHERE"
  "WITH"
  "XOR"
  "^"
  <EOF> (line 7, column 6 (offset: 275))
"     CASE WHEN sharedBooks.genre = otherBooks.genre THEN otherBooks.BayesianAverage * 100 ELSE otherBooks.BayesianAverage END AS Score"
      ^}

In [39]:
USER_NAME

'Marten Højlund'

In [26]:
# Filter the DataFrame
filtered_df = ratings_df[ratings_df['BookTitle'].isin(book_list)].reset_index()

# Display the filtered DataFrame
filtered_df.head(10)

NameError: name 'ratings_df' is not defined

In [None]:
MATCH (userA:User {name: 'Gunnur Grønkjær'})-[:LIKED]->(sharedBooks:Book)<-[:LIKED]-(otherUser:User)
WITH DISTINCT otherUser, sharedBooks 
MATCH (otherUser) -[:LIKED]->(otherBooks:Book) -[]->(g:Genre)
WHERE sharedBooks <> otherBooks
RETURN otherBooks.title AS recTitle, otherBooks.BayesianAverage AS BayesianAvg
, otherBooks.genre AS Genre, sharedBooks.genre, g.name //, collect(DISTINCT sharedBooks.title, )

In [None]:

def calculate_bayesian_average(data):
    # Creating DataFrame
    df = pd.DataFrame(data[0], columns=["item", "avg_rating", "num_ratings"])

    # Calculate global average rating (M) and typical number of ratings per book (C)
    global_average_rating = df['avg_rating'].mean()
    print("global_average_rating", global_average_rating)
    typical_num_ratings = df['num_ratings'].mean()
    print("typical_num_ratings", typical_num_ratings)
    # Apply Bayesian average formula to each book
    df['BayesianAverage'] = ((typical_num_ratings * global_average_rating) + (df['avg_rating'] * df['num_ratings'])) / (typical_num_ratings + df['num_ratings'])
    
    # Sort the DataFrame by BayesianAverage and then convert to dictionary
    sorted_df = df.sort_values(by='BayesianAverage', ascending=False)
    bayesian_average_dict = sorted_df.set_index('item')['BayesianAverage'].to_dict()
    
    return bayesian_average_dict
#

In [None]:
book_ratings = db.cypher_query(
"""
MATCH (book:Book)<-[r:RATED]-(:User)
WITH book.title AS BookTitle, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings
RETURN BookTitle, AverageRating, TotalRatings
ORDER BY BookTitle
"""
)
# Calculate Bayesian averages
bayesian_df = calculate_bayesian_average(data=book_ratings)
bayesian_df
df = pd.DataFrame(book_ratings[0], columns=["item", "avg_rating", "num_ratings"])


In [None]:
genre_ratings = db.cypher_query(
"""
MATCH (genre:Genre)<-[:BELONGS_TO]-(book:Book)<-[r:RATED]-(:User)
WITH genre.name AS GenreName, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings
RETURN GenreName, AverageRating, TotalRatings
ORDER BY GenreName
"""
)
# Calculate Bayesian averages
bayesian_df = calculate_bayesian_average(data=genre_ratings)
bayesian_df

In [None]:
author_ratings = db.cypher_query(
"""
MATCH (author:Author)<-[:WRITTEN_BY]-(book:Book)<-[r:RATED]-(:User)
WITH author.name AS AuthorName, AVG(r.rating) AS AverageRating, COUNT(r) AS TotalRatings
RETURN AuthorName, AverageRating, TotalRatings
ORDER BY AuthorName

"""
)
# Calculate Bayesian averages
bayesian_df = calculate_bayesian_average(data=author_ratings)
bayesian_df

In [None]:
def get_random_user_name():
        result = db.cypher_query(
            """
            MATCH (user:User)
            WITH COLLECT(user.name) AS names
            RETURN apoc.coll.randomItem(names) AS RandomUserName
        """)
        return result[0][0][0]

In [None]:
USER_NAME = get_random_user_name()
USER_NAME

In [None]:
USER_NAME

In [None]:

book_list = db.cypher_query(
    """
    MATCH (userA:User {name: $USER_NAME})-[:LIKED]->(sharedBook:Book)
    MATCH (otherUser:User)-[:READ]->(sharedBook)
    WHERE userA <> otherUser
    WITH DISTINCT otherUser
    MATCH (otherUser)-[:LIKED]->(book:Book)
    WITH DISTINCT book
    RETURN COLLECT(book.title) AS UniqueBooksReadByOthers
    """,
    params={"USER_NAME": USER_NAME}
)[0][0][0]
book_list

In [None]:
# Existing dictionary from calculate_bayesian_average function
bayesian_average_dict = calculate_bayesian_average(data=book_ratings)  # Assuming data is already defined

# Filter the dictionary to keep only the keys present in filter_list
filtered_dict = {key: val for key, val in bayesian_average_dict.items() if key in book_list}
df_user = pd.DataFrame(list(filtered_dict.items()), columns=['BookTitle', 'Score'])

df_user

In [None]:
# Calculate average reads per book

db.cypher_query(
"""
// Step 1: Calculate total reads and book count for each genre
MATCH (book:Book)
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH COUNT(DISTINCT read) AS Reads, COUNT(DISTINCT book) AS BookCount
            MATCH (book:Book)
            SET book.AverageReadsPerBook = Reads/BookCount
RETURN Reads/BookCount AS AverageReadsPerBook
"""
)

In [None]:
# Calculate total reads and book count for each genre

db.cypher_query(
"""
// Step 1: Calculate total reads and book count for each genre
MATCH (genre:Genre)<-[:BELONGS_TO]-(book:Book)
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH genre, COUNT(DISTINCT book) AS BookCount, COUNT(DISTINCT read) AS Reads

// Step 2: Calculate the average reads per book for each genre
WITH genre, Reads, BookCount, CASE WHEN BookCount > 0 THEN toFloat(Reads) / BookCount ELSE 0 END AS AverageReadsPerBook

// Step 3: Update the genre nodes with the calculated average
SET genre.AverageReadsPerBook = AverageReadsPerBook
RETURN genre.name AS Genre, Reads, BookCount, genre.AverageReadsPerBook
ORDER BY genre.AverageReadsPerBook DESC
"""
)

In [None]:
# Calculate total reads and book count for each author

db.cypher_query(
"""
// Step 1: Calculate total reads and book count for each author
MATCH (author:Author)-[:WRITTEN_BY]-(book:Book)
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH author, COUNT(DISTINCT book) AS BookCount, COUNT(DISTINCT read) AS Reads

// Step 2: Calculate the average reads per book for each author
WITH author, Reads, BookCount, CASE WHEN BookCount > 0 THEN toFloat(Reads) / BookCount ELSE 0 END AS AverageReadsPerBook

// Step 3: Update the author nodes with the calculated average
SET author.AverageReadsPerBook = AverageReadsPerBook
RETURN author.name AS Author, Reads, BookCount, author.AverageReadsPerBook
"""
)


In [None]:
# set book likeability propperty

db.cypher_query(
"""
MATCH (book:Book)
OPTIONAL MATCH (book)<-[like:LIKED]-(:User)
WITH book, COUNT(DISTINCT like) AS Likes
OPTIONAL MATCH (book)<-[hate:HATE]-(:User)
WITH book, Likes, COUNT(DISTINCT hate) AS Hates
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH book, Likes, Hates, COUNT(DISTINCT read) AS Reads
WITH book, Likes, Hates, Reads, 
     CASE 
         WHEN Reads > 0 THEN ROUND(100 * ((book.AverageReadsPerBook * 0.5) + toFloat(Likes)) / (book.AverageReadsPerBook + toFloat(Likes))) / 100 
         ELSE 0 
     END AS LikeReadRatio
SET book.likeability = LikeReadRatio
RETURN book.title AS BookTitle, 
       Likes AS LikesBook, 
       Hates AS HatesBook, 
       Reads AS TotalReactionsBook,
       LikeReadRatio AS LikeReadRatioBook
    ORDER BY LikeReadRatio DESC
"""
)

In [None]:
# set genre likeability propperty

db.cypher_query(
"""
MATCH (genre:Genre)<-[:BELONGS_TO]-(book:Book)
OPTIONAL MATCH (book)<-[like:LIKED]-(:User)
WITH genre, book, COUNT(like) AS Likes
OPTIONAL MATCH (book)<-[hate:HATE]-(:User)
WITH genre, book, Likes, COUNT(hate) AS Hates
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH genre, book, Likes, Hates, COUNT(read) AS Reads
WITH genre, COLLECT({book: book.title, likes: Likes, hates: Hates, reads: Reads}) AS Books
UNWIND Books AS bookDetails
WITH genre, 
     SUM(bookDetails.likes) AS TotalLikes, 
     SUM(bookDetails.hates) AS TotalHates,
     SUM(bookDetails.reads) AS TotalReads
WITH genre, TotalLikes, TotalHates, TotalReads,
     CASE 
         WHEN TotalReads > 0 
         THEN ROUND(100 * ((genre.AverageReadsPerBook * 0.5) + toFloat(TotalLikes)) / (genre.AverageReadsPerBook + toFloat(TotalLikes))) / 100 
         ELSE 0 
     END AS LikeReadRatio
SET genre.likeability = LikeReadRatio
RETURN genre.name AS Genre, 
       TotalLikes AS TotalLikesGenre, 
       TotalHates AS TotalHatesGenre,
       TotalReads AS TotalReadsGenre,
       LikeReadRatio AS LikeReadRatioGenre
ORDER BY LikeReadRatio DESC
"""
)

In [None]:
# set author likeability propperty

db.cypher_query(
       """
MATCH (author:Author)<-[:WRITTEN_BY]-(book:Book)
OPTIONAL MATCH (book)<-[like:LIKED]-(:User)
WITH author, book, COUNT(DISTINCT like) AS Likes
OPTIONAL MATCH (book)<-[hate:HATE]-(:User)
WITH author, book, Likes, COUNT(DISTINCT hate) AS Hates
OPTIONAL MATCH (book)<-[read:READ]-(:User)
WITH author, book, Likes, Hates, COUNT(DISTINCT read) AS Reads
WITH author, SUM(Likes) AS TotalLikes, SUM(Hates) AS TotalHates, SUM(Reads) AS TotalReads
WITH author, TotalLikes, TotalHates, TotalReads,
     CASE 
         WHEN TotalReads > 0 
         THEN ROUND(100 * (author.AverageReadsPerBook * toFloat(TotalLikes)) / (author.AverageReadsPerBook + toFloat(TotalLikes))) / 100 
         ELSE 0
     END AS LikeReadRatio
SET author.likeability = LikeReadRatio
RETURN author.name AS AuthorName, 
       TotalLikes AS TotalLikesAuthor, 
       TotalHates AS TotalHatesAuthor,
       TotalReads AS TotalReadsAuthor,
       LikeReadRatio AS LikeReadRatioAuthor
// Add an ORDER BY clause if needed, for example:
// ORDER BY LikeReadRatioAuthor DESC
"""
)

In [None]:
# # set book likeability propperty

# db.cypher_query(
# """
# MATCH (book:Book)
# OPTIONAL MATCH (book)<-[like:LIKED]-(:User)
# WITH book, COUNT(DISTINCT like) AS Likes
# OPTIONAL MATCH (book)<-[hate:HATE]-(:User)
# WITH book, Likes, COUNT(DISTINCT hate) AS Hates
# OPTIONAL MATCH (book)<-[read:READ]-(:User)
# WITH book, Likes, Hates, COUNT(DISTINCT read) AS Reads
# WITH book, Likes, Hates, Reads, 
#      ROUND(100 * CASE WHEN Reads > 0 THEN toFloat(Likes) / Reads ELSE 0 END) / 100 AS LikeReadRatio
# SET book.likeability = LikeReadRatio
# RETURN book.title AS BookTitle, 
#        Likes, 
#        Hates, 
#        (Likes + Hates) AS TotalReactions,
#        LikeReadRatio
# """
# )

In [None]:
# set UserAuthor likeability propperty
mapper = 2
db.cypher_query(
"""
MATCH (user:User)-[:READ]->(book)-[:WRITTEN_BY]->(author:Author)
OPTIONAL MATCH (user)-[like:LIKED]->(book)
OPTIONAL MATCH (user)-[hate:HATE]->(book)
WITH user, author, 
     COUNT(DISTINCT like) AS Likes, 
     COUNT(DISTINCT hate) AS Hates
WITH user, 
     COLLECT({
       author: author.name, 
       likeRatio: CASE 
                    WHEN (Likes + Hates) > 0 
                    THEN ROUND((100 * toFloat(Likes) / (Likes + Hates)) * $mapper) / 100 
                    ELSE 0 
                  END
     }) AS AuthorLikeRatios
SET user.authorLikeRatios = apoc.convert.toJson(AuthorLikeRatios)
""",
params={"mapper": mapper}
)


In [None]:
# set UserGenre likeability propperty

db.cypher_query(
"""
MATCH (user:User)-[:READ]->(book)-[:BELONGS_TO]->(genre:Genre)
OPTIONAL MATCH (user)-[like:LIKED]->(book)
OPTIONAL MATCH (user)-[hate:HATE]->(book)
WITH user, genre, 
     COUNT(DISTINCT like) AS Likes, 
     COUNT(DISTINCT hate) AS Hates
WITH user, 
     COLLECT({genre: genre.name, likeRatio: CASE 
             WHEN (Likes + Hates) > 0 THEN ROUND((100 * toFloat(Likes) / (Likes + Hates)) * $mapper) / 100 
             ELSE 0 
         END}) AS GenreLikeRatios
SET user.genreLikeRatios = apoc.convert.toJson(GenreLikeRatios)
""",
params={"mapper": mapper}
)


In [None]:
# bookscore for new user

data = db.cypher_query(
"""
MATCH (book:Book)-[:BELONGS_TO]->(genre:Genre)
MATCH (book)-[:WRITTEN_BY]->(author:Author)
RETURN book.title AS BookTitle,
       genre.name AS BookGenre,
       author.name AS AuthorName,
       ROUND(100 * (COALESCE(book.likeability, 0) + COALESCE(genre.likeability, 0) + COALESCE(author.likeability, 0))) / 100 AS TotalLikeabilityScore
ORDER BY TotalLikeabilityScore DESC
"""
)[0]

In [None]:
def get_random_user_name():
        result = db.cypher_query("""
            MATCH (user:User)
            WITH COLLECT(user.name) AS names
            RETURN apoc.coll.randomItem(names) AS RandomUserName
        """)
        return result[0][0][0]

In [None]:

# Rekommendation for user:

USER_NAME = 'Hasten Koefoed'

USER_NAME = get_random_user_name()
MATCH (user:User {{name: '{USER_NAME}'}})

In [None]:
'Helgha Madsen'

In [None]:

query_string = f"""
MATCH (user:User {{name: '{USER_NAME}'}})
WITH user, 
     apoc.convert.fromJsonList(user.authorLikeRatios) AS authorRatiosList,
     apoc.convert.fromJsonList(user.genreLikeRatios) AS genreRatiosList

UNWIND authorRatiosList as authorRatios
UNWIND genreRatiosList as genreRatios

MATCH (book:Book)-[:BELONGS_TO]->(genre:Genre)
MATCH (book)-[:WRITTEN_BY]->(author:Author)
WHERE NOT (user)-[:READ]->(book)
WITH book, 
     genre, 
     author, 
     CASE WHEN author.name = authorRatios.author THEN authorRatios.likeRatio ELSE 1 END AS authorLikeRatio,
     CASE WHEN genre.name = genreRatios.genre THEN genreRatios.likeRatio ELSE 1 END AS genreLikeRatio
RETURN book.title AS BookTitle,
       genre.name AS BookGenre,
       author.name AS AuthorName,
       ROUND(100 * (COALESCE(book.likeability, 0) + COALESCE(genre.likeability, 0) * genreLikeRatio + COALESCE(author.likeability, 0) * authorLikeRatio)) / 100 AS TotalLikeabilityScore
ORDER BY TotalLikeabilityScore DESC
"""

# Execute the query
user_data = db.cypher_query(query_string)[0]


In [None]:
query_string = f"""
MATCH (user:User {{name: '{USER_NAME}'}})
OPTIONAL MATCH (user)-[:READ]->(book:Book)
RETURN user.authorLikeRatios, user.genreLikeRatios, COLLECT(book.title) AS ReadBooks
"""

# Execute the query
results = db.cypher_query(query_string)[0][0]
print("Author Like Ratios:", results[0])
print("Genre Like Ratios:", results[1])
print("Books Read by User:", results[2])

In [None]:
df = pd.DataFrame(data, columns=['BookTitle', 'Genre', 'Author', 'Score'])

In [None]:

df_user = pd.DataFrame(user_data, columns=['BookTitle', 'Genre', 'Author', 'Score_user'])

In [None]:
# Merging the dataframes on 'BookTitle', 'Genre', and 'Author'
merged_df = df.merge(df_user, on=['BookTitle', 'Genre', 'Author'])

# Assuming merged_df is your merged DataFrame
sorted_df = merged_df.sort_values(by='Score_user', ascending=False)

sorted_df.head(30)