In [71]:
import numpy as np
import pandas as pd
import re


from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF

from sklearn.metrics import mean_squared_error

In [72]:
def clean_csv_data(input_csv, output_csv="cleaned_data.csv"):
    '''
    Args:
        input_csv (str): Path to the uncleaned CSV file.
        output_csv (str): Path to save the cleaned data (default: "cleaned_data.csv").
    Returns:
        pd.DataFrame: A cleaned DataFrame with pivoted categories.
    '''
    df = pd.read_csv(input_csv)
    
    # Ensure required columns exist
    required_columns = ["book_name", "author_name", "category_name", "category_description", "categories_data"]
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}")
    
    # Fill missing values and convert to string
    df["category_description"] = df["category_description"].fillna("").astype(str)
    df["categories_data"] = df["categories_data"].fillna("").astype(str)
    
    # Clean and split categories
    df["category_description"] = df["category_description"].str.replace("\n", "", regex=True).str.strip().str.split(", ")
    df["categories_data"] = df["categories_data"].str.replace("\n", "", regex=True).str.replace("%", "").str.strip().str.split(", ")
    
    # Ensure lists have matching lengths
    def adjust_lengths(row):
        names, values = row["category_description"], row["categories_data"]
        if isinstance(names, list) and isinstance(values, list):
            min_len = min(len(names), len(values))
            return names[:min_len], values[:min_len]
        return names, values
    
    df[["category_description", "categories_data"]] = df.apply(adjust_lengths, axis=1, result_type="expand")
    
    # Explode the DataFrame to make each category a row
    df_rows = df.explode(["category_description", "categories_data"])
    
    # Convert category data to numeric values
    df_rows["categories_data"] = pd.to_numeric(df_rows["categories_data"], errors="coerce")
    
    # Aggregate duplicates by taking the mean
    df_rows = df_rows.groupby(["book_name", "author_name", "category_description"], as_index=False).mean(numeric_only=True)
    
    # Pivot the categories into columns
    df_pivot = df_rows.pivot(index=['book_name', 'author_name'], columns="category_description", values="categories_data")
    
    # Reset index to bring "book_name" and "author_name" back as columns
    df_pivot.reset_index(inplace=True)
    
    # Fill NaN values with 0
    df_pivot.fillna(0, inplace=True)
    
    # Save to CSV
    df_pivot.to_csv(output_csv, index=False)
    
    return df_pivot

V = clean_csv_data("output_new.csv")
df = V
df

category_description,book_name,author_name,Accurate,Accurate.1,Action,Actionable,Adult themes,Adult themes.1,Advanced,Angry,...,Meandering,Medium,Medium paced,Mystery,Neutral,Recommend,Romantic,Slow paced,and figures,diagrams
0,A Christmas carol,\nCharles Dickens\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Adventures of Huckleberry Finn,\nMark Twain\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Adventures of Sherlock Holmes,\nArthur Conan Doyle\n\n,0.0,33.0,12.0,33.0,0.0,14.0,0.0,12.0,...,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,50.0,16.0
3,Animal Farm,\nGeorge Orwell\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Charlie and the Chocolate Factory,\nRoald Dahl\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Charlie and the Great Glass Elevator,\nRoald Dahl\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Cinquante nuances de Grey,\nE. L. James\n\n,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Daddy-Long-Legs,\nJean Webster\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,El Principito / The Little Prince,\nAntoine de Saint-Exupéry\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Emma,\nJane Austen\n\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
import pandas as pd

def clean_csv_data(input_csv, output_csv="cleaned_data.csv"):
    '''
    Args:
        input_csv (str): Path to the uncleaned CSV file.
        output_csv (str): Path to save the cleaned data (default: "cleaned_data.csv").
    Returns:
        pd.DataFrame: A cleaned DataFrame with pivoted categories.
    '''
    df = pd.read_csv(input_csv)
    
    # Ensure required columns exist
    required_columns = ["book_name", "author_name", "category_name", "category_description", "categories_data"]
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}")
    
    # Fill missing values and convert to string
    df["category_description"] = df["category_description"].fillna("").astype(str)
    df["categories_data"] = df["categories_data"].fillna("").astype(str)
    
    # Clean and split categories
    df["category_description"] = df["category_description"].str.replace("\n", "", regex=True).str.strip().str.split(", ")
    df["categories_data"] = df["categories_data"].str.replace("\n", "", regex=True).str.replace("%", "").str.strip().str.split(", ")
    
    # Ensure lists have matching lengths
    def adjust_lengths(row):
        names, values = row["category_description"], row["categories_data"]
        if isinstance(names, list) and isinstance(values, list):
            min_len = min(len(names), len(values))
            return names[:min_len], values[:min_len]
        return names, values
    
    df[["category_description", "categories_data"]] = df.apply(adjust_lengths, axis=1, result_type="expand")
    
    # Explode the DataFrame to make each category a row
    df_rows = df.explode(["category_description", "categories_data"])
    
    # Convert category data to numeric values
    df_rows["categories_data"] = pd.to_numeric(df_rows["categories_data"], errors="coerce")
    
    # Aggregate duplicates by taking the mean
    df_rows = df_rows.groupby(["book_name", "category_description"], as_index=False).mean(numeric_only=True)
    
    # Pivot the categories into columns (remove author_name)
    df_pivot = df_rows.pivot(index='book_name', columns="category_description", values="categories_data")
    
    # Reset index to bring "book_name" back as a column
    df_pivot.reset_index(inplace=True)
    
    # Fill NaN values with 0
    df_pivot.fillna(0, inplace=True)
    
    # Save to CSV (without author_name)
    df_pivot.to_csv(output_csv, index=False)
    
    return df_pivot

# Example of how to call the function
V = clean_csv_data("output_new.csv")
df = V
df


category_description,book_name,Accurate,Accurate.1,Action,Actionable,Adult themes,Adult themes.1,Advanced,Angry,Anthology,...,Meandering,Medium,Medium paced,Mystery,Neutral,Recommend,Romantic,Slow paced,and figures,diagrams
0,A Christmas carol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Adventures of Huckleberry Finn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Adventures of Sherlock Holmes,0.0,33.0,12.0,33.0,0.0,14.0,0.0,12.0,0.0,...,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,50.0,16.0
3,Animal Farm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Charlie and the Chocolate Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Charlie and the Great Glass Elevator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Cinquante nuances de Grey,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Daddy-Long-Legs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,El Principito / The Little Prince,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Emma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
import pandas as pd

def clean_csv_data(input_csv, output_csv="cleaned_data.csv"):
    '''
    Args:
        input_csv (str): Path to the uncleaned CSV file.
        output_csv (str): Path to save the cleaned data (default: "cleaned_data.csv").
    Returns:
        pd.DataFrame: A cleaned DataFrame with pivoted categories.
    '''
    df = pd.read_csv(input_csv)
    
    # Ensure required columns exist
    required_columns = ["book_name", "author_name", "category_name", "category_description", "categories_data"]
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}")
    
    # Fill missing values and convert to string
    df["category_description"] = df["category_description"].fillna("").astype(str)
    df["categories_data"] = df["categories_data"].fillna("").astype(str)
    
    # Clean and split categories
    df["category_description"] = df["category_description"].str.replace("\n", "", regex=True).str.strip().str.split(", ")
    df["categories_data"] = df["categories_data"].str.replace("\n", "", regex=True).str.replace("%", "").str.strip().str.split(", ")
    
    # Ensure lists have matching lengths
    def adjust_lengths(row):
        names, values = row["category_description"], row["categories_data"]
        if isinstance(names, list) and isinstance(values, list):
            min_len = min(len(names), len(values))
            return names[:min_len], values[:min_len]
        return names, values
    
    df[["category_description", "categories_data"]] = df.apply(adjust_lengths, axis=1, result_type="expand")
    
    # Explode the DataFrame to make each category a row
    df_rows = df.explode(["category_description", "categories_data"])
    
    # Convert category data to numeric values
    df_rows["categories_data"] = pd.to_numeric(df_rows["categories_data"], errors="coerce")
    
    # Aggregate duplicates by taking the mean
    df_rows = df_rows.groupby(["book_name", "category_description"], as_index=False).mean(numeric_only=True)
    
    # Pivot the categories into columns (remove author_name)
    df_pivot = df_rows.pivot(index='book_name', columns="category_description", values="categories_data")
    
    # Reset index to bring "book_name" back as a column
    df_pivot.reset_index(inplace=True)
    
    # Fill NaN values with 0
    df_pivot.fillna(0, inplace=True)
    
    # Save to CSV (without author_name)
    df_pivot.to_csv(output_csv, index=False)
    
    return df_pivot

# Example of how to call the function
V = clean_csv_data("output_new.csv")
df = V
df.set_index('book_name', inplace = True)
df

category_description,Accurate,Accurate,Action,Actionable,Adult themes,Adult themes,Advanced,Angry,Anthology,Based on a true story,...,Meandering,Medium,Medium paced,Mystery,Neutral,Recommend,Romantic,Slow paced,and figures,diagrams
book_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Christmas carol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adventures of Huckleberry Finn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adventures of Sherlock Holmes,0.0,33.0,12.0,33.0,0.0,14.0,0.0,12.0,0.0,12.0,...,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,50.0,16.0
Animal Farm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Charlie and the Chocolate Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Charlie and the Great Glass Elevator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cinquante nuances de Grey,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Daddy-Long-Legs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
El Principito / The Little Prince,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Emma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## RANK:

(66,
 array([[0.00000000e+00, 2.58698727e-13, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 3.30000000e+01, 1.20000000e+01, ...,
         2.89596834e-08, 5.00000000e+01, 1.60000000e+01],
        ...,
        [0.00000000e+00, 1.26138175e-05, 2.08475659e-07, ...,
         1.15349037e-04, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         6.54777798e-15, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 4.01721856e-12, ...,
         1.00000000e+02, 0.00000000e+00, 0.00000000e+00]]))


In [75]:
topics = []
for i in range(66):
    topics.append(f'Topic {i+1}')

In [76]:
# here we're able to define the model using the NMF package which we'll use to decompose W and H
model = NMF(n_components=66, init='random', random_state=0, max_iter = 3000) 

In [77]:
# creating W from V by taking in the rows of V and the columns from topics
W = model.fit_transform(V, 500)

# creating the other matrix H based on the topics and columns of V
H = pd.DataFrame(model.components_)

# Renaming the indices of H to reflect the topics
H.index = topics

# creating a new numpy array W_df to reflect the dataframe, not the model matrix
# !!!!!!!!!! I rounded to two decimal places just so I can see the data better right now !!!!!!!!!!
# also ensures the columns of W match the rows of H
W_df = pd.DataFrame(np.round(W, 2), columns = H.index)
W_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 57,Topic 58,Topic 59,Topic 60,Topic 61,Topic 62,Topic 63,Topic 64,Topic 65,Topic 66
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.12,0.0,0.0,2.94,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# basically repeating everything from W but respectively for H
H.index = topics
H.columns = V.columns
H_df = pd.DataFrame(np.round(H,2))
H_df

category_description,Accurate,Accurate.1,Action,Actionable,Adult themes,Adult themes.1,Advanced,Angry,Anthology,Based on a true story,...,Meandering,Medium,Medium paced,Mystery,Neutral,Recommend,Romantic,Slow paced,and figures,diagrams
Topic 1,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.87,0.0,0.0,0.0,0.0,0.00,0.0,0.0
Topic 2,0.0,10.07,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0
Topic 3,0.0,0.00,1.79,0.0,0.0,1.44,8.97,0.00,0.00,0.00,...,0.0,0.0,0.00,0.0,0.0,0.0,0.0,4.48,0.0,0.0
Topic 4,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0
Topic 5,0.0,8.02,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,6.15,0.0,0.0,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Topic 62,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.37,0.25,0.39,...,0.0,0.0,0.00,0.0,0.0,0.0,0.0,2.19,0.0,0.0
Topic 63,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0
Topic 64,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.00,5.1,0.0,0.0,0.0,0.00,0.0,0.0
Topic 65,0.0,2.08,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [79]:
# actually applying NMF again through matrix-multiplication to reconstruct the approximation of V
new_V = W @ H

# rounding decimals again to 5 for easier reading
new_V = pd.DataFrame(np.round(new_V,5), columns=V.columns)
new_V.index = V.index
new_V.index.name = "Title"
new_V

category_description,Accurate,Accurate,Action,Actionable,Adult themes,Adult themes,Advanced,Angry,Anthology,Based on a true story,...,Meandering,Medium,Medium paced,Mystery,Neutral,Recommend,Romantic,Slow paced,and figures,diagrams
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Christmas carol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adventures of Huckleberry Finn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adventures of Sherlock Holmes,0.0,33.0,12.0,33.0,0.0,14.0,0.0,12.0,0.0,12.0,...,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,50.0,16.0
Animal Farm,0.0,0.0013,0.0,5e-05,0.0,0.0,0.00096,0.0,0.0,0.0,...,0.0,0.0,0.1044,0.0,0.0,5e-05,0.0,0.00261,0.0,0.0
Charlie and the Chocolate Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.00026,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,2e-05,0.0,0.0
Charlie and the Great Glass Elevator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cinquante nuances de Grey,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Daddy-Long-Legs,0.0,0.0,0.0,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,1e-05,0.0,0.0,0.0,0.0,0.0
El Principito / The Little Prince,0.0,0.00029,0.0,0.0,0.0,0.0,0.00034,0.0,0.0,0.0,...,50.0,0.0,0.00022,0.0,0.0,0.0,0.0,0.00058,0.0,0.0
Emma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
def top_book_val(book, df):

    # check if the input is in new_V
    if book not in df.index:
        return f'{book} cannot be found in the DataFrame'

    # grabs all the cells for the input book
    book_scores = df.loc[book]

    # sets the largeset value to the first cell
    top_val = book_scores[0]

    # iterate through all the cells to find the largest value and replace top_val
    for i in book_scores:
        if i > top_val:
            top_val = i
    return top_val
ex = top_book_val('Animal Farm', new_V)
ex

100.0

In [81]:
def find_column(book, df, top_val):
    in_col = [column for column in df.columns if top_val in df[column].values]
    for i in in_col:  # `i` is already a column name
        for j in df.columns:  # Looping through all columns
            for k in df.index:  # Looping through all rows
                # Check if the value matches in the DataFrame
                if df.loc[k,j] == top_val:
                    our_genre = i  # Since `i` is already a column name
                    return our_genre  # Return the genre
    return None 
find_column('Animal Farm', new_V, ex)

'       Accurate'

In [82]:
def rec_values(book, df, top_val):
    
    # Initialize a list to store the top 5 values
    recs_vals = []

    # Get the row corresponding to the book
    book_row = df.loc[book]
    book_col = find_column(book, df, top_val)
    col_index = df.columns.get_loc(book_col)

    # Iterate through each value in the row
    # for value in df[book_col]:
    
    for value in range(len(df)):
        if df.iloc[value, col_index] != top_val:
            if len(recs_vals) < 5:
                # Add the value to the list if it has less than 5 elements
                recs_vals.append(df.iloc[value, col_index])
                recs_vals.sort()  # Ensure the list is sorted (smallest to largest)
            elif df.iloc[value, col_index] > recs_vals[0]:
                # Replace the smallest value if the current value is larger
                recs_vals[0] = df.iloc[value, col_index]
                recs_vals.sort()  # Keep the list sorted

    return recs_vals
r = rec_values('Animal Farm', new_V, ex)
r

[0.0, 0.0, 0.0, 0.0, 4e-05]

In [83]:
book_col = find_column('Animal Farm', new_V, ex)
def recommendations(book, df, recs_vals):
    rows = []

    for i in df.index:
        if df.loc[i, book_col] in recs_vals:
            rows.append(i)
    return rows
recommendations('Animal Farm', new_V, r)

['A Christmas carol',
 'Adventures of Huckleberry Finn',
 'Adventures of Sherlock Holmes',
 'Animal Farm',
 'Charlie and the Chocolate Factory',
 'Charlie and the Great Glass Elevator',
 'Cinquante nuances de Grey',
 'Daddy-Long-Legs',
 'El Principito / The Little Prince',
 'Emma',
 "Ender's Game",
 'Fahrenheit 451',
 'Frankenstein',
 'It',
 'James and the Giant Peach',
 'Koralina',
 'Lolita',
 'Matilda',
 'Momo',
 'New Moon',
 'Nineteen eighty-four',
 'Northern Lights',
 'Of Human Bondage',
 'Pet Sematary',
 'Pippi Longstocking',
 'Pollyanna',
 'Prince Caspian',
 'Ten Little Indians',
 'The Awakening',
 'The Case-Book of Sherlock Holmes',
 'The Dark Tower',
 'The Fellowship of the Ring',
 'The Great Gatsby',
 'The Green Mile',
 "The Handmaid's Tale",
 "The Hitch Hiker's Guide to the Galaxy",
 'The Hobbit',
 'The Kama Sutra of Vatsyayana',
 'The King in Yellow',
 'The Last Man',
 'The Lion, the Witch and the Wardrobe',
 "The Magician's Nephew",
 'The Pickwick papers',
 'The Prince and 