In [1]:
%pip install -q -r requirements.txt --no-cache-dir --upgrade --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

# Parse the three raw exports with one shared set of reader options:
# ISO-8859-1 text, ';' as separator, '"' as quote character, and any line that
# cannot be parsed is skipped instead of raising.
# NOTE(review): the file names here are all lowercase, while the cleaning cells
# further down read "Ratings.csv" and "Users.csv" — confirm the real names; on a
# case-sensitive file system only one spelling can exist.
_RAW_CSV_OPTIONS = {
    "encoding": 'ISO-8859-1',
    "sep": ';',
    "quotechar": '"',
    "on_bad_lines": 'skip',
}

BOOKS = pd.read_csv("/Users/nathanielani/library_dataset/books.csv", **_RAW_CSV_OPTIONS)
RATINGS = pd.read_csv("/Users/nathanielani/library_dataset/ratings.csv", **_RAW_CSV_OPTIONS)
USERS = pd.read_csv("/Users/nathanielani/library_dataset/users.csv", **_RAW_CSV_OPTIONS)

# Show each table's header so a mis-parsed file is noticed immediately
for label, frame in (("Books:", BOOKS), ("Ratings:", RATINGS), ("Users:", USERS)):
    print(label, frame.columns)


In [None]:
# Show the first rows of BOOKS (DataFrame.head() with its default row count)
# as plain text, to inspect how the CSV was parsed
print(BOOKS.head())


In [None]:
# Trim leading/trailing whitespace from every BOOKS column label and assign the
# result back as the frame's column index. Only whitespace is removed here.
BOOKS.columns = BOOKS.columns.str.strip()


In [None]:
# Emit every BOOKS column label between single quotes, one per line, so that
# stray leading/trailing characters in a label become visible
for col in BOOKS.columns:
    print("'{}'".format(col))


In [None]:
def _distinct_non_null(column_label):
    """Return the distinct values of one BOOKS column, with NaN entries removed first."""
    return BOOKS[column_label].dropna().unique()


# For each BOOKS column of interest: collect its distinct non-null values and
# report how many there are. The trailing comments record the counts the
# author expected to see.
isbns = _distinct_non_null("ISBN")
print("Unique ISBNs:", len(isbns))  # Expected: 271379

book_titles = _distinct_non_null("Book-Title")
print("Unique Book Titles:", len(book_titles))  # Expected: 242135

book_authors = _distinct_non_null("Book-Author")
print("Unique Book Authors:", len(book_authors))  # Expected: 102023

year_of_publications = _distinct_non_null("Year-Of-Publication")
print("Unique Year of Publications:", len(year_of_publications))  # Expected: 202

publisher = _distinct_non_null("Publisher")
print("Unique Publishers:", len(publisher))  # Expected: 16807

image_url_m = _distinct_non_null("Image-URL-M")
print("Unique Image URLs (M):", len(image_url_m))  # Expected: 271379

image_url_l = _distinct_non_null("Image-URL-L")
print("Unique Image URLs (L):", len(image_url_l))  # Expected: 271379


In [None]:
import pandas as pd
import os

# Source folder for the raw export and destination folder for cleaned output
INPUT_PATH = "/Users/nathanielani/library_dataset/"
OUTPUT_PATH = "/Users/nathanielani/library_dataset/processed/"

# Create the destination folder when it is missing
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Parse books.csv: ';'-separated, '"'-quoted, ISO-8859-1 text; lines that
# cannot be parsed are skipped
books_source = os.path.join(INPUT_PATH, "books.csv")
books_df = pd.read_csv(
    books_source,
    encoding='ISO-8859-1',
    sep=';',
    quotechar='"',
    on_bad_lines='skip',
    low_memory=False,
)

# Show the header as read from disk
print("Columns in Books.csv:", books_df.columns)

# Original header -> snake_case name intended for PostgreSQL.
# Labels that are absent from the frame are ignored by rename().
rename_dict = {
    "ISBN": "isbn",
    "Book-Title": "book_title",
    "Book-Author": "book_author",
    "Year-Of-Publication": "year_of_publication",
    "Publisher": "publisher",
    "Image-URL-S": "image_url_s",
    "Image-URL-M": "image_url_m",
    "Image-URL-L": "image_url_l",
}

# Rename, then keep only the first row seen for each ISBN
books_df = books_df.rename(columns=rename_dict).drop_duplicates(subset=["isbn"])

# Write the cleaned table without the pandas index column
cleaned_file_path = os.path.join(OUTPUT_PATH, "clean_books.csv")
books_df.to_csv(cleaned_file_path, index=False)

print(f"✅ Dataset has been cleaned and saved to: {cleaned_file_path}")


In [None]:
# Distinct values of each RATINGS column. There is no dropna() here, so a NaN,
# if present, is included among the distinct values.
user_id = RATINGS["User-ID"].unique()
isbn = RATINGS["ISBN"].unique()
book_ratings = RATINGS["Book-Rating"].unique()

# Counts noted by the author: 105283 users, 340556 ISBNs, 11 rating values
print("Unique User IDs:", len(user_id))
print("Unique ISBNs:", len(isbn))
print("Unique Book Ratings:", len(book_ratings))

In [None]:
import pandas as pd
import os

# Define paths: raw export folder and destination folder for cleaned output
INPUT_PATH = "/Users/nathanielani/library_dataset/"
OUTPUT_PATH = "/Users/nathanielani/library_dataset/processed/"

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Load Ratings.csv (';'-separated, '"'-quoted, ISO-8859-1); unparseable lines are skipped
ratings_df = pd.read_csv(os.path.join(INPUT_PATH, "Ratings.csv"), encoding='ISO-8859-1', sep=';', quotechar='"', on_bad_lines='skip', low_memory=False)

# Display columns to verify structure
print("Columns in RATINGS.csv:", ratings_df.columns)

# Rename columns for PostgreSQL compatibility
rename_dict = {
    "User-ID": "user_id",
    "ISBN": "isbn",
    "Book-Rating": "book_rating",
}

ratings_df = ratings_df.rename(columns=rename_dict)

# Remove duplicate ratings. A rating is identified by the (user, book) pair —
# the same pair the schema cell declares as the ratings primary key.
# De-duplicating on "isbn" alone would keep a single rating per book and throw
# away every other user's rating of it. The first occurrence of each pair is kept.
ratings_df = ratings_df.drop_duplicates(subset=["user_id", "isbn"])

# Save cleaned dataset (without the pandas index column)
cleaned_file_path = os.path.join(OUTPUT_PATH, "clean_ratings.csv")
ratings_df.to_csv(cleaned_file_path, index=False)

print(f"✅ Dataset has been cleaned and saved to: {cleaned_file_path}")


In [None]:
# Distinct values of each USERS column. There is no dropna() here, so a NaN,
# if present, is included among the distinct values.
user_id = USERS["User-ID"].unique()
location = USERS["Location"].unique()
age = USERS["Age"].unique()

# Counts noted by the author: 278858 users, 57339 locations, 166 ages
print("Unique User IDs:", len(user_id))
print("Unique Locations:", len(location))
print("Unique Ages:", len(age))

In [None]:
import pandas as pd
import os

# Source folder for the raw export and destination folder for cleaned output
INPUT_PATH = "/Users/nathanielani/library_dataset/"
OUTPUT_PATH = "/Users/nathanielani/library_dataset/processed/"

# Create the destination folder when it is missing
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Parse Users.csv: ';'-separated, '"'-quoted, ISO-8859-1 text; lines that
# cannot be parsed are skipped
users_source = os.path.join(INPUT_PATH, "Users.csv")
users_df = pd.read_csv(
    users_source,
    encoding='ISO-8859-1',
    sep=';',
    quotechar='"',
    low_memory=False,
    on_bad_lines='skip',
)

# Header as read from disk
print("Columns in Users.csv:", users_df.columns)

# Original header -> snake_case name intended for PostgreSQL
rename_dict = {"User-ID": "user_id", "Location": "location", "Age": "age"}
users_df = users_df.rename(columns=rename_dict)

# Header after the rename, to confirm it took effect
print("Columns after renaming:", users_df.columns)

# Keep only the first row seen for each user_id
users_df = users_df.drop_duplicates(subset=["user_id"])

# Write the cleaned table without the pandas index column
cleaned_file_path = os.path.join(OUTPUT_PATH, "clean_users.csv")
users_df.to_csv(cleaned_file_path, index=False)

print(f"✅ Dataset has been cleaned and saved to: {cleaned_file_path}")


In [None]:
import pandas as pd
import os

# Folder holding the cleaned CSV exports
PROCESSED_PATH = "/Users/nathanielani/library_dataset/processed/"

# Collect the names of the .csv files in that folder, in the order os.listdir returns them
processed_files = []
for entry in os.listdir(PROCESSED_PATH):
    if entry.endswith(".csv"):
        processed_files.append(entry)

# Report the header of every file; only the first five rows are parsed since
# just the column names are needed
for file in processed_files:
    file_path = os.path.join(PROCESSED_PATH, file)
    df = pd.read_csv(file_path, nrows=5)
    print(f"\n📂 {file} - Columns:")
    print(df.columns.tolist())

In [None]:
import pandas as pd
import os

# Define paths: cleaned CSVs are read from INPUT_PATH, generated .sql files go
# to OUTPUT_PATH. The SQL text is built with plain string formatting.
INPUT_PATH = "/Users/nathanielani/library_dataset/processed/"
OUTPUT_PATH = "/Users/nathanielani/library_dataset/sql-files/"


# Table name -> cleaned CSV file the table is derived from
csv_files = {"books": "clean_books.csv", "ratings": "clean_ratings.csv", "users": "clean_users.csv"}

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

for table_name, csv_file in csv_files.items():
    # Load cleaned CSV file; pandas infers one dtype per column
    df = pd.read_csv(os.path.join(INPUT_PATH, csv_file))

    # Define SQL file path
    sql_file_path = os.path.join(OUTPUT_PATH, f"{table_name}.sql")

    # Build one "<name> <SQL type>" line per column
    column_definitions = []
    for column in df.columns:
        # Compare on the lower-cased dtype name so capitalised dtype names
        # such as "Int64" / "Float64" are recognised as well
        data_type = str(df[column].dtype).lower()

        if "int" in data_type:
            sql_type = "INTEGER"
        elif "float" in data_type:
            sql_type = "FLOAT"
        else:
            # Fallback for "object" and every other dtype (bool, datetime,
            # dedicated string dtypes, ...). Without a fallback such columns
            # were silently left out of the CREATE TABLE statement.
            sql_type = "VARCHAR"

        column_definitions.append(f"    {column} {sql_type}")

    # Assemble the CREATE TABLE statement; joining avoids having to trim a
    # trailing comma afterwards
    create_table_sql = (
        f"CREATE TABLE IF NOT EXISTS {table_name} (\n"
        + ",\n".join(column_definitions)
        + "\n);"
    )

    # Save SQL file; the encoding is explicit so the result does not depend on
    # the platform default
    with open(sql_file_path, "w", encoding="utf-8") as f:
        f.write(create_table_sql)

    print(f"✅ {table_name} - SQL file has been created: {sql_file_path}")

In [None]:
import pandas as pd
import os

# Cleaned CSVs are read from INPUT_PATH; schema files are written to OUTPUT_PATH
INPUT_PATH = "/Users/nathanielani/library_dataset/processed/"
OUTPUT_PATH = "/Users/nathanielani/library_dataset/sql-files/"

# Create the schema folder when it is missing
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Table name -> cleaned CSV file that backs it
csv_files = dict(
    books="clean_books.csv",
    ratings="clean_ratings.csv",
    users="clean_users.csv",
)


# Function to map Pandas data types to SQL data types
def map_dtype(dtype):
    """Map a pandas/NumPy dtype (or its name) to a SQL column type.

    The decision is made on the lower-cased ``str(dtype)``, so capitalised
    dtype names such as ``"Int64"`` or ``"Float64"`` are treated like
    ``"int64"`` / ``"float64"`` instead of falling through to TEXT.

    Args:
        dtype: Anything whose ``str()`` is a dtype name, e.g. a Series dtype
            or a plain string.

    Returns:
        ``"BIGINT"`` when the name contains ``"int"``, ``"FLOAT"`` when it
        contains ``"float"``, otherwise ``"TEXT"``.
    """
    dtype_name = str(dtype).lower()
    if "int" in dtype_name:
        return "BIGINT"  # Use BIGINT for large user_id values
    if "float" in dtype_name:
        return "FLOAT"
    # "object" and every other dtype share the same flexible fallback
    return "TEXT"


# Primary-key column list per table (constant across iterations)
primary_keys = {"books": "isbn", "users": "user_id", "ratings": "user_id, isbn"}

# Write one DROP + CREATE TABLE script per cleaned CSV
for table_name, csv_file in csv_files.items():
    df = pd.read_csv(os.path.join(INPUT_PATH, csv_file))

    # Target script: <OUTPUT_PATH>/<table>.sql
    sql_file_path = os.path.join(OUTPUT_PATH, f"{table_name}.sql")

    # One indented "<column> <SQL type>" entry per DataFrame column
    column_definitions = [
        f"    {column} {map_dtype(df[column].dtype)}" for column in df.columns
    ]

    # Header, then the column list
    create_table_sql = (
        f"DROP TABLE IF EXISTS {table_name};\n"
        f"CREATE TABLE {table_name} (\n"
    ) + ",\n".join(column_definitions)

    # Append the PRIMARY KEY clause for tables that declare one
    key_columns = primary_keys.get(table_name)
    if key_columns is not None:
        create_table_sql += f",\n    PRIMARY KEY ({key_columns})"

    # Close the statement
    create_table_sql += "\n);\n"

    # Overwrite any existing script at that path
    with open(sql_file_path, "w") as f:
        f.write(create_table_sql)

    print(f"✅ {table_name}.sql file has been created: {sql_file_path}")

print("\n🚀 All SQL schema files have been generated successfully!")

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# NOTE(review): this path sits under /Users/timeless/, whereas the data cells
# use /Users/nathanielani/ — confirm which home directory is intended.
image_path = "/Users/timeless/library_dataset/recsys_taxonomy2.png"
img = Image.open(image_path)

# Draw the picture on its own axes with ticks and frame hidden, then render
fig, ax = plt.subplots()
ax.imshow(img)
ax.axis("off")
plt.show()

In [9]:
import hashlib
import os

class PasswordHasher:
    """Derive and verify password keys with PBKDF2-HMAC-SHA256.

    Every instance holds one salt and one iteration count. A key returned by
    ``hash_password`` can only be verified by an instance that uses the same
    salt and iteration count, so pass a previously stored salt to the
    constructor when verifying keys created earlier.
    """

    def __init__(self, salt=None, iterations=100000):
        """Create a hasher.

        Args:
            salt: Salt bytes to reuse. When omitted, 32 random bytes from
                ``os.urandom`` are generated.
            iterations: PBKDF2 iteration count. Defaults to 100000, the value
                that used to be hard-coded.
                NOTE(review): check current guidance for PBKDF2-HMAC-SHA256
                before using this outside a demo — a higher count may be advisable.
        """
        self.salt = os.urandom(32) if salt is None else salt
        self.iterations = iterations

    def hash_password(self, password):
        """Return the PBKDF2-HMAC-SHA256 key (bytes) derived from ``password``.

        Args:
            password: The password as ``str``; it is encoded as UTF-8.
        """
        return hashlib.pbkdf2_hmac(
            "sha256",  # The hash digest algorithm for HMAC
            password.encode("utf-8"),  # Convert the password to bytes
            self.salt,  # Provide the salt
            self.iterations,
        )

    def verify_password(self, hashed_password, provided_password):
        """Return True when ``provided_password`` derives to ``hashed_password``.

        Args:
            hashed_password: Key previously returned by ``hash_password``.
            provided_password: Candidate password as ``str``.

        Returns:
            bool. A ``hashed_password`` of an incompatible type yields False.
        """
        # Imported here so the class does not rely on another cell's imports
        import hmac

        new_hashed_password = self.hash_password(provided_password)

        try:
            # Constant-time comparison: the time taken does not reveal how
            # many leading bytes matched
            return hmac.compare_digest(new_hashed_password, hashed_password)
        except TypeError:
            # e.g. None or a str was passed; a plain == would have been False
            return False

    def get_user_input(self):
        """Prompt for a password, hash it, self-check the hash and return it.

        The password is read without echo and is never printed.

        Returns:
            The derived key (bytes) for the entered password.
        """
        # Imported here so the class does not rely on another cell's imports
        import getpass

        # getpass keeps the typed password off the screen and out of the
        # notebook's saved output
        password = getpass.getpass("Enter your password: ")

        # Hash the password
        hashed_password = self.hash_password(password)

        # Verify the password
        if self.verify_password(hashed_password, password):
            print("Password verified successfully!")
        else:
            print("Password verification failed!")

        return hashed_password

In [None]:
# New hasher instance; constructed without arguments it draws a fresh random salt
hasher = PasswordHasher()

# Prompt for a password; get_user_input() hashes it, checks the hash against
# the same password, prints the outcome and returns the derived key (bytes)
hashed_password = hasher.get_user_input()

# Shows the bytes repr of the derived key
print("Hashed Password:", hashed_password)

In [None]:
# Round trip with a fixed demo password: derive its key, then confirm the same
# hasher (same salt) accepts that password for that key
hasher = PasswordHasher()
password = "mysecretpassword"

hashed_password = hasher.hash_password(password)
verifify_password = hasher.verify_password(hashed_password, password)

# Report both results
print(f"Hashed Password: {hashed_password}")
print(f"Password Verified: {verifify_password}")

In [5]:
# Generate a strong password using the standard-library `secrets` module
import secrets
import string

def generate_password(length=12):
    """Return a random password of ``length`` characters.

    Each character is picked independently with ``secrets.choice`` from ASCII
    letters, digits and punctuation. A ``length`` of zero or less gives an
    empty string.
    """
    pool = string.ascii_letters + string.digits + string.punctuation
    picked = []
    for _ in range(length):
        picked.append(secrets.choice(pool))
    return "".join(picked)

# Generate one password with generate_password()'s default length and print it.
# NOTE(review): the printed value ends up in the notebook's saved cell output —
# do not reuse a password shown this way.
strong_password = generate_password()
print(f"Strong Password: {strong_password}")

Strong Password: {Xa'V@#|Pi%9
