In [None]:
# PART 1
import requests #We are using this library to request the content of a web page by sending a GET request to the provided URL
from bs4 import BeautifulSoup #we are using this as it creates parse trees that basically organise and almost filter the data of the website that we are trying to extract from
import pandas as pd # we are using it to organise the data from the recipe to store present into table columns
import json #this will allows us to take certain information from recipe that is basically easy for the program use 

# Configure the pandas display options so that tables are shown in full instead of being truncated.
pd.set_option('display.max_columns', None)  # no limit on the number of columns displayed
pd.set_option('display.width', None)  # do not impose a fixed output width on the rendered table
pd.set_option('display.max_colwidth', None)  # never shorten the contents of a cell, however long
pd.set_option('display.max_rows', None)  # no limit on the number of rows displayed

def _iso_duration_to_minutes(duration):
    """Convert an ISO 8601 duration such as 'PT1H30M' to whole minutes (None if missing/unparseable)."""
    import re  # local import: `re` is not part of the notebook's import cell

    if not isinstance(duration, str):
        return None
    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration.strip().upper(),
    )
    if match is None or not any(match.groups()):
        return None
    days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return days * 24 * 60 + hours * 60 + minutes + seconds // 60


def _format_minutes(total_minutes):
    """Render a number of minutes in the compact 'xHyM' notation (e.g. 70 -> '1H10M')."""
    hours, minutes = divmod(total_minutes, 60)
    text = f'{hours}H' if hours else ''
    if minutes or not hours:
        text += f'{minutes}M'
    return text


def collect_page_data(url, timeout=10):
    """Scrape the recipe metadata embedded in a web page into a one-row DataFrame.

    The page is expected to carry a ``<script type="application/ld+json">`` block
    describing the recipe. Every field that is absent from that block is
    reported with a placeholder ('Not specified', 'Not available', 'No image')
    instead of raising.

    Parameters
    ----------
    url : str
        Address of the recipe page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        A single row with the columns title, total_time, image, ingredients,
        rating_val, rating_count, category, cuisine, diet, vegan, vegetarian, url.
        ``total_time`` is the sum of cooking and preparation time in 'xHyM'
        notation.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    json.JSONDecodeError
        If the embedded JSON-LD block is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The structured description of the page lives in the JSON-LD script tag.
    script = soup.find('script', {'type': 'application/ld+json'})

    data = {}
    if script is not None and script.string:
        parsed = json.loads(script.string)
        if isinstance(parsed, list):
            # JSON-LD may hold several objects: prefer the one typed as a Recipe.
            candidates = [item for item in parsed if isinstance(item, dict)]
            recipes = [item for item in candidates if 'Recipe' in str(item.get('@type', ''))]
            parsed = (recipes or candidates or [{}])[0]
        if isinstance(parsed, dict):
            data = parsed

    title = data.get('name', 'Not specified')

    # Add the two durations numerically; a missing one simply contributes nothing.
    known_durations = [
        minutes
        for minutes in (
            _iso_duration_to_minutes(data.get('cookTime')),
            _iso_duration_to_minutes(data.get('prepTime')),
        )
        if minutes is not None
    ]
    total_time = _format_minutes(sum(known_durations)) if known_durations else 'Not specified'

    # `image` may be a list of URLs, a single URL, or an object with a `url` key.
    image_field = data.get('image')
    if isinstance(image_field, (list, tuple)):
        image_field = image_field[0] if image_field else None
    if isinstance(image_field, dict):
        image_field = image_field.get('url')
    if isinstance(image_field, str) and image_field:
        image = image_field.split('/')[-1]  # keep only the file name
    else:
        image = 'No image'

    ingredients_list = data.get('recipeIngredient') or []
    if isinstance(ingredients_list, str):
        ingredients_list = [ingredients_list]
    ingredients = ', '.join(str(item) for item in ingredients_list) if ingredients_list else 'Not specified'

    rating = data.get('aggregateRating')
    if not isinstance(rating, dict):
        rating = {}
    rating_val = rating.get('ratingValue', 'Not available')
    rating_count = rating.get('ratingCount', 'Not available')

    category = data.get('recipeCategory', 'Not specified')
    cuisine = data.get('recipeCuisine', 'Not specified')

    # Diets are given as schema.org URLs; compare on the last path component so
    # that both the http:// and https:// spellings are recognised.
    diet_list = data.get('suitableForDiet') or []
    if isinstance(diet_list, str):
        diet_list = [diet_list]
    diet_names = [str(entry).rstrip('/').split('/')[-1] for entry in diet_list]
    diet = ', '.join(diet_names) if diet_names else 'Not specified'
    vegan = 'Yes' if 'VeganDiet' in diet_names else 'No'
    vegetarian = 'Yes' if 'VegetarianDiet' in diet_names else 'No'

    # One list per column, each holding the single value for this recipe.
    data_df = {
        'title': [title],
        'total_time': [total_time],
        'image': [image],
        'ingredients': [ingredients],
        'rating_val': [rating_val],
        'rating_count': [rating_count],
        'category': [category],
        'cuisine': [cuisine],
        'diet': [diet],
        'vegan': [vegan],
        'vegetarian': [vegetarian],
        'url': [url]
    }

    return pd.DataFrame(data_df)

# Scrape one recipe page and keep the resulting one-row table in `df`.
url = 'https://www.bbc.co.uk/food/recipes/easiest_ever_banana_cake_42108'
df = collect_page_data(url)

# A bare expression on the last line of a cell is rendered by the notebook as a table.
df

: 

In [None]:
# PART 2 QUESTION 1
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Read the book catalogue and the ratings, then join them on their shared `bookId` key.
df1 = pd.read_csv("books_new.csv")
df2 = pd.read_csv("ratings.csv")
combined_df = df1.merge(df2, on='bookId')
print(combined_df)

# Descriptive statistics of the numeric columns.
summary_statistics = combined_df.describe()
print(summary_statistics)

# Count the missing entries per column, then discard every incomplete row.
missing_values = combined_df.isna().sum()
print(missing_values)

combined_df = combined_df.dropna()

In [None]:
# PART 2 QUESTION 2
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Mean rating per book, followed by the ten books with the highest mean.
ratings_by_book = combined_df.groupby('bookId')['rating']
average_ratings = ratings_by_book.mean()
top10_ratings = average_ratings.nlargest(n=10)
print(top10_ratings)

#compute a 95% confidence interval for the average ratings
def bootstrap_mean(data, n_samples=1000, sample_size=100, seed=42):
    """Bootstrap a 95% confidence interval for the mean of `data`.

    Parameters
    ----------
    data : array-like
        Observations to resample (with replacement).
    n_samples : int, optional
        Number of bootstrap resamples to draw (default 1000).
    sample_size : int or None, optional
        Number of observations per resample (default 100). Pass None to use
        ``len(data)``, which is the size a textbook bootstrap would use.
    seed : int, optional
        Seed of the private random generator (default 42), so results are
        reproducible.

    Returns
    -------
    numpy.ndarray
        Two values: the 2.5th and 97.5th percentile of the resampled means.

    Raises
    ------
    ValueError
        If `data` is empty.
    """
    observations = np.asarray(data)
    if observations.size == 0:
        raise ValueError("bootstrap_mean needs at least one observation")
    if sample_size is None:
        sample_size = observations.size

    # A private generator gives the same stream as seeding the global one,
    # without resetting NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    resampled_means = np.empty(n_samples)
    for i in range(n_samples):
        resampled_means[i] = rng.choice(observations, size=sample_size).mean()
    return np.percentile(resampled_means, [2.5, 97.5])

# Bootstrap a confidence interval for every book, keyed by bookId.
# NOTE(review): every resample has a fixed size of 100 regardless of how many
# ratings the book has — confirm this is the intended bootstrap design.
confidence_intervals = {}
for book_id, group in combined_df.groupby('bookId'):
    confidence_intervals[book_id] = bootstrap_mean(group['rating'].values, 1000, 100)

# Print the intervals of the ten best-rated books. Looking them up through
# `top10_ratings` is what ties the output to the ranking; slicing the first ten
# dictionary entries would only show the first ten bookIds.
for book_id in top10_ratings.index:
    conf_interval = confidence_intervals[book_id]
    print(f"\nBook: {book_id}, 95% Confidence Interval for the Average Ratings: ({conf_interval})")

In [None]:
# PART 2 QUESTION 3
# Per-book summary: the average rating and the number of ratings received.
book_stats = (
    combined_df.groupby('bookId')['rating']
    .agg(averageRating='mean', ratingCount='count')
    .reset_index()
)
rating_count = book_stats[['bookId', 'ratingCount']].copy()

# Attach the rating count to every row. Dropping a previous `ratingCount` first
# keeps the cell re-runnable (otherwise a second run creates ratingCount_x/_y).
combined_df = (
    combined_df
    .drop(columns=['ratingCount'], errors='ignore')
    .merge(rating_count, on='bookId')
)

# Relationship between a book's AVERAGE rating and its rating count, measured
# once per book so that heavily rated books are not counted many times over.
correlation = book_stats['averageRating'].corr(book_stats['ratingCount'])
print("\nCorrelation between Average Rating and Rating Count:", correlation)

# Suggested threshold: below this many ratings an average is considered not significant.
threshold = 50
# NOTE: despite its name this holds the rows BELOW the threshold, i.e. the
# ratings of books whose average is not considered significant.
significant_ratings = combined_df[combined_df['ratingCount'] < threshold]

# Print results
print("\nBooks with Rating Count Less Than Threshold:")
print(significant_ratings)

In [None]:
# PART 2 QUESTION 4

import pandas as pd # we are using it to organise the data from the recipe to store present into table columns
# Reload both CSV files and join them on `bookId`, the column they have in common.
df1, df2 = (pd.read_csv(path) for path in ("books_new.csv", "ratings.csv"))
combined_df = df1.merge(df2, on='bookId')


from sklearn.feature_extraction.text import CountVectorizer #we are using the CountVectorizer to convert text data in numerical data
from sklearn.metrics.pairwise import cosine_similarity#we are using the cosine_similarity to see the similarity between the books based on combined features

# Binary preference label: 1 (like) for a rating of at least 3.6, otherwise -1 (dislike).
combined_df['like_dislike'] = combined_df['rating'].map(lambda score: 1 if score >= 3.6 else -1)

# Text attributes that describe a book.
features_to_combine = ['Title', 'Author', 'SubGenre', 'Publisher']

# Replace missing values in those attributes with empty strings so they can be joined.
combined_df[features_to_combine] = combined_df[features_to_combine].fillna('')

# One space-separated string per row holding all of the attributes.
combined_df['combined_features'] = combined_df[features_to_combine].apply(' '.join, axis=1)

# Turn the combined text into a matrix of token counts.
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(combined_df['combined_features'])

# Pairwise cosine similarity between all rows of the count matrix.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

print(cosine_sim)

In [None]:
# PART 2 QUESTION 5

from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def vec_space_method(query_book, dataset, k=10):
    """Rank books by the cosine similarity of their TF-IDF title vectors.

    Parameters
    ----------
    query_book : str
        Exact title of the book to find neighbours for.
    dataset : pandas.DataFrame
        Book table with a 'Title' column.
    k : int, optional
        Number of similar books to return (default 10).

    Returns
    -------
    list of (str, float)
        Up to `k` (title, similarity) pairs, most similar first, never
        including the query book itself.

    Raises
    ------
    IndexError
        If `query_book` is not present in the 'Title' column.
    """
    # Work with positions 0..n-1 so that rows of the TF-IDF matrix and titles
    # line up even when the DataFrame index has gaps; missing titles become ''.
    titles = dataset['Title'].fillna('').astype(str).reset_index(drop=True)
    matches = titles.index[(titles == query_book).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"Book title not found in dataset: {query_book!r}")
    query_pos = int(matches[0])

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)

    # TF-IDF rows are L2-normalised, so the linear kernel equals cosine similarity.
    cosine_similarities = linear_kernel(tfidf_matrix[query_pos], tfidf_matrix).flatten()

    # Sort best-first, remove the query book, THEN cut to k — cutting first
    # would leave only k-1 results once the query itself is filtered out.
    ranked = cosine_similarities.argsort()[::-1]
    ranked = ranked[ranked != query_pos][:k]

    return [(titles.iloc[i], cosine_similarities[i]) for i in ranked]


# Load the catalogue and look up the neighbours of one example title.
books = pd.read_csv('books_new.csv')
query_book = 'Girl with the Dragon Tattoo'
similar_books = vec_space_method(query_book, books)

# Report each neighbour together with its similarity score.
print("10 most similar books to '{}' are:".format(query_book))
for pair in similar_books:
    print("\n-{}, Similarity: {:.2f}".format(*pair))

In [None]:
# PART 2 QUESTION 6
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def knn_similarity(query_book, dataset, k=10):
    """Find the `k` books whose TF-IDF title vectors are nearest to the query book.

    Parameters
    ----------
    query_book : str
        Exact title of the book to find neighbours for.
    dataset : pandas.DataFrame
        Book table with a 'Title' column.
    k : int, optional
        Number of neighbours to return (default 10).

    Returns
    -------
    list of (str, float)
        Up to `k` (title, similarity) pairs, nearest first, where similarity is
        1 minus the cosine distance. The query book itself is never included.

    Raises
    ------
    IndexError
        If `query_book` is not present in the 'Title' column.
    """
    # Positions 0..n-1 keep titles aligned with the rows of the TF-IDF matrix
    # even when the DataFrame index has gaps; missing titles become ''.
    titles = dataset['Title'].fillna('').astype(str).reset_index(drop=True)
    matches = titles.index[(titles == query_book).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"Book title not found in dataset: {query_book!r}")
    query_pos = int(matches[0])

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)

    # Ask for one extra neighbour (the query matches itself), but never for
    # more neighbours than there are books.
    n_neighbors = min(k + 1, tfidf_matrix.shape[0])
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn_model.fit(tfidf_matrix)

    distances, indices = knn_model.kneighbors(tfidf_matrix[query_pos])

    # Remove the query by position instead of assuming it is the first hit:
    # with duplicated titles another row can tie at distance 0.
    similar_books = [
        (titles.iloc[idx], 1 - distance)
        for distance, idx in zip(distances.flatten(), indices.flatten())
        if idx != query_pos
    ]

    return similar_books[:k]

# Example: neighbours of one title from the book catalogue.
books = pd.read_csv('books_new.csv')
query_book = 'Girl with the Dragon Tattoo'
similar_books = knn_similarity(query_book, books)

header = f"10 most similar books to '{query_book}' are:"
print(header)
for book, similarity in similar_books:
    line = f"\n- {book}, Similarity: {similarity:.2f}"
    print(line)

In [None]:
#Question 7

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

# Suppress all warnings.
# NOTE(review): this filter is global and stays active for the rest of the kernel
# session, so deprecation and numerical warnings are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

# Function to clean the data
def clean_data(bk):
    """Remove empty columns and incomplete rows from `bk`, in place.

    Columns that contain no values at all are removed first; only then are
    rows with a missing value dropped. Doing it the other way round would
    delete every row as soon as a single column is completely empty.
    The index is renumbered 0..n-1 afterwards so that row labels and row
    positions agree.

    Parameters
    ----------
    bk : pandas.DataFrame
        Table to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (now cleaned) DataFrame object.
    """
    bk.dropna(axis=1, how='all', inplace=True)  # Delete columns with all missing values
    bk.dropna(axis=0, inplace=True)  # Delete rows with missing values
    bk.reset_index(drop=True, inplace=True)  # Close the gaps left by the deleted rows
    return bk

# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two 1-D vectors.

    A vector of zero length has no direction; in that case 0.0 is returned
    instead of the NaN that a division by zero would produce.

    NOTE(review): this name is also used by scikit-learn's pairwise
    `cosine_similarity`; defining it here replaces that function in the
    notebook's global namespace for any cell run afterwards.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# Function to calculate coverage
def calculate_coverage(recommendations, total_books):
    """Return the percentage of the catalogue that appears among the recommendations.

    `recommendations` is a flat iterable of recommended items (duplicates are
    counted once); `total_books` is the size of the catalogue.
    """
    distinct_count = len({*recommendations})
    return distinct_count / total_books * 100

# Function to calculate personalization
def calculate_personalization(recommendations):
    """Measure how different the users' recommendation lists are from one another.

    The score is 1 minus the average pairwise cosine similarity between the
    users' lists, each list being treated as a set of items. It is 0.0 when
    every user receives the same list and 1.0 when no two users share an item,
    so a higher value means more personalised recommendations.

    Parameters
    ----------
    recommendations : list of list
        One list of recommended items per user; lists may differ in length.

    Returns
    -------
    float
        Personalization score in [0, 1]. With fewer than two users there is
        nothing to compare and 0.0 is returned.
    """
    rec_sets = [set(user_rec) for user_rec in recommendations]
    if len(rec_sets) < 2:
        return 0.0

    similarities = []
    for i in range(len(rec_sets)):
        for j in range(i + 1, len(rec_sets)):
            first, second = rec_sets[i], rec_sets[j]
            if first and second:
                # Cosine similarity of two binary item vectors.
                overlap = len(first & second) / (len(first) * len(second)) ** 0.5
            else:
                overlap = 0.0  # an empty list shares nothing with any other list
            similarities.append(overlap)

    return 1 - sum(similarities) / len(similarities)

# Function to perform KNN similarity
def knn_similarity(query_book, dataset, k=10):
    """Return the titles of the `k` books nearest to the query book.

    Books are compared through the cosine distance between the TF-IDF vectors
    of their titles.

    NOTE(review): a function with the same name is defined in an earlier cell
    and returns (title, similarity) pairs; this definition replaces it and
    returns titles only.

    Parameters
    ----------
    query_book : str
        Exact title of the book to find neighbours for.
    dataset : pandas.DataFrame
        Book table with a 'Title' column.
    k : int, optional
        Number of neighbours to return (default 10).

    Returns
    -------
    list of str
        Up to `k` titles, nearest first, never including the query book itself.

    Raises
    ------
    IndexError
        If `query_book` is not present in the 'Title' column.
    """
    # Positions 0..n-1 keep titles aligned with the rows of the TF-IDF matrix;
    # after rows have been dropped the DataFrame's own index labels no longer
    # match row positions.
    titles = dataset['Title'].fillna('').astype(str).reset_index(drop=True)
    matches = titles.index[(titles == query_book).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"Book title not found in dataset: {query_book!r}")
    query_pos = int(matches[0])

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)

    # One extra neighbour accounts for the query matching itself; never ask
    # for more neighbours than there are books.
    n_neighbors = min(k + 1, tfidf_matrix.shape[0])
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn_model.fit(tfidf_matrix)

    distances, indices = knn_model.kneighbors(tfidf_matrix[query_pos])

    # Remove the query by position rather than assuming it is the first hit.
    similar_books = [titles.iloc[idx] for idx in indices.flatten() if idx != query_pos]

    return similar_books[:k]

# Load the book catalogue and clean it straight away.
books_df = clean_data(pd.read_csv('books_new.csv'))

# TF-IDF representation of the book titles.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books_df['Title'])

# Test set: user id -> title of a book that user is known to like.
user_preferences_test = dict(enumerate(
    [
        'Fundamentals of Wavelets',
        'Orientalism',
        'How to Think Like Sherlock Holmes',
        'Data Scientists at Work',
    ],
    start=1,
))

# One list of recommendations per user will be collected for each method.
vector_space_recommendations, knn_recommendations = [], []

# Cosine nearest-neighbour model over the title vectors.
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine').fit(tfidf_matrix)

# Collect, for every test user, the recommendations of both methods.
for user_id, preferred_book in user_preferences_test.items():

    # Row POSITION of the preferred book. `tfidf_matrix` and `.iloc` are
    # positional, so the DataFrame's index label must not be used here.
    title_matches = np.flatnonzero((books_df['Title'] == preferred_book).to_numpy())
    if title_matches.size == 0:
        raise IndexError(f"Book title not found in dataset: {preferred_book!r}")
    query_index = int(title_matches[0])

    # Vector space recommendations: request one neighbour more than needed,
    # because the preferred book matches itself, so that this list has the
    # same length as the KNN list it is compared with.
    wanted = knn_model.n_neighbors
    distances, indices = knn_model.kneighbors(
        tfidf_matrix[query_index],
        n_neighbors=min(wanted + 1, tfidf_matrix.shape[0]),
    )
    similar_books = [
        books_df.iloc[idx]['Title'] for idx in indices.flatten() if idx != query_index
    ][:wanted]
    vector_space_recommendations.append(similar_books)

    # Get recommendations using the KNN method for the preferred book
    knn_rec = knn_similarity(preferred_book, books_df)
    knn_recommendations.append(knn_rec)

# Coverage: share of the catalogue (rows of books_df) that gets recommended at all.
total_unique_books = len(books_df)
vector_space_coverage = calculate_coverage(sum(vector_space_recommendations, []), total_unique_books)
knn_coverage = calculate_coverage(sum(knn_recommendations, []), total_unique_books)

print("Vector Space Method Coverage:", vector_space_coverage, "%")
print("KNN Method Coverage:", knn_coverage, "%")

# Personalization: computed from the per-user recommendation lists of each method.
personalization_vector_space = calculate_personalization(vector_space_recommendations)
personalization_knn = calculate_personalization(knn_recommendations)

print("Vector Space Method Personalization:", personalization_vector_space)
print("KNN Method Personalization:", personalization_knn)

# Analysis comments
#
# Review of coverage: The coverage of the two approaches differs. Compared with the
# KNN Method, the Vector Space Method covers a wider range of unique books. This
# implies that the Vector Space Method provides a more varied set of recommendations
# across users.
#
# Evaluation of personalisation: Compared with the KNN Method, the Vector Space
# Method shows more personalisation. This suggests that there is less overlap between
# the recommendation lists of different users, because the recommendations made by the
# Vector Space Method are more closely tailored to each user's interests. The KNN
# Method, on the other hand, displays less personalisation, indicating that each
# user's recommendations are not as customised.
#
# NOTE(review): in the code above both recommendation lists come from a cosine
# nearest-neighbour search over the same TF-IDF title matrix, so these conclusions
# should be re-checked against the values actually printed by the cell.

In [None]:
#Question 8

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
import warnings

# Suppress all warnings.
# NOTE(review): this filter is global and stays active for the rest of the kernel
# session, so deprecation and numerical warnings are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

# Read the datasets: `bk` holds the book catalogue, `rt` the user ratings.
bk = pd.read_csv('books_new.csv')
rt = pd.read_csv('ratings.csv')

# Function to clean the data
def clean_data(bk):
    """Remove empty columns and incomplete rows from `bk`, in place.

    Columns that contain no values at all are removed first; only then are
    rows with a missing value dropped. Doing it the other way round would
    delete every row as soon as a single column is completely empty.
    The index is renumbered 0..n-1 afterwards so that row labels and row
    positions agree.

    Parameters
    ----------
    bk : pandas.DataFrame
        Table to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (now cleaned) DataFrame object.
    """
    bk.dropna(axis=1, how='all', inplace=True)  # Delete columns with all missing values
    bk.dropna(axis=0, inplace=True)  # Delete rows with missing values
    bk.reset_index(drop=True, inplace=True)  # Close the gaps left by the deleted rows
    return bk

# Clean the book dataset
bk = clean_data(bk)

# One-hot encode the categorical book attributes.
categorical_columns = ['Title', 'Author', 'Genre']
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only know the previous keyword name `sparse`.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
bk_encoded = encoder.fit_transform(bk[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
bk_encoded = pd.DataFrame(bk_encoded, columns=encoded_columns)

# Row i of `bk_encoded` describes row i of `bk`, so the book ids can be attached
# by position. One feature row is kept per book id.
book_features = bk_encoded.copy()
book_features.insert(0, 'bookId', bk['bookId'].to_numpy())
book_features = book_features.drop_duplicates(subset='bookId')

# Join every rating with the features of the book it refers to. The join is made
# on `bookId`; placing the two tables side by side would pair unrelated rows.
merged_data = (
    rt.merge(book_features, on='bookId', how='inner')
    .dropna(subset=['rating'])
    .reset_index(drop=True)
)

# Features are the encoded book attributes only (identifiers such as bookId and
# user_id carry no content information); the target is the rating given.
X = merged_data[list(encoded_columns)]
y = merged_data['rating']

# Train the KNN model (never asking for more neighbours than there are ratings)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=min(5, len(X)))
knn_model.fit(X)

# Function to predict whether a user would like a book
def predict_like(query_book_features, like_threshold=3):
    """Predict 'like' (True) or 'dislike' (False) for the given book features.

    The ratings of the nearest rated neighbours are averaged; the book is
    predicted to be liked when that average reaches `like_threshold`.

    NOTE(review): the prediction depends on the book only, and the neighbours
    may include ratings of the very book being queried.
    """
    _, indices = knn_model.kneighbors(query_book_features)
    nearest_neighbor_ratings = y.iloc[indices.flatten()]  # Ratings of nearest neighbors

    prediction = np.mean(nearest_neighbor_ratings) >= like_threshold

    return prediction

# Lookup tables keyed by book id, so that books are found by id and not by row position.
feature_lookup = book_features.set_index('bookId')
title_lookup = bk.drop_duplicates(subset='bookId').set_index('bookId')['Title']
prediction_cache = {}  # book id -> prediction, computed once per book

# Iterate over each unique user ID in the ratings dataset and make predictions
unique_user_ids = rt['user_id'].unique()
for user_id in unique_user_ids:
    # Get all books rated by the current user
    user_books = rt[rt['user_id'] == user_id]['bookId']

    # Iterate over each book rated by the user and make predictions
    for book_id in user_books:
        if book_id not in feature_lookup.index:
            print(f"No matching book found for user '{user_id}' and book ID '{book_id}'.")
            continue

        if book_id not in prediction_cache:
            prediction_cache[book_id] = predict_like(feature_lookup.loc[[book_id]])
        prediction = prediction_cache[book_id]

        book_title = title_lookup.get(book_id)
        if book_title is not None:
            print(f"Prediction for book '{book_title}' by user '{user_id}': {prediction}")
        else:
            print(f"No title found for book ID '{book_id}'")