In [1]:
#imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

import re, ast
from scipy.sparse import csr_matrix, hstack, vstack, issparse

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [3]:
# Load the preprocessed interactions table, then report its size, its columns
# and how many distinct books / users it contains.
df = pd.read_csv('../data/prep.csv')
print(
    f" Loaded! Shape: {df.shape}",
    f" Columns: {df.columns.tolist()}",
    f" Unique books: {df['parent_asin'].nunique():,}",
    f" Unique users: {df['user_id'].nunique():,}",
    sep="\n",
)

 Loaded! Shape: (4624615, 5)
 Columns: ['parent_asin', 'price', 'rating', 'user_id', 'merged_text']
 Unique books: 137,249
 Unique users: 2,766,656


In [6]:
def extract_unique_items(df):
    """Collapse the interaction table into one metadata row per ``parent_asin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``parent_asin``, ``price``, ``rating`` and
        ``merged_text``.

    Returns
    -------
    pandas.DataFrame
        One row per ``parent_asin`` with the columns ``parent_asin``,
        ``price`` (``first`` aggregate of ``price``), ``avg_rating`` (mean of
        ``rating``), ``num_ratings`` (count of ``rating``) and ``text``
        (``first`` aggregate of ``merged_text``).

    Raises
    ------
    ValueError
        If a required column is absent; the message names the first one
        missing, in the order listed above.
    """
    print("Extracting unique items and building metadata table...\n")

    # Fail fast on the first required column that is not present.
    needed = ('parent_asin', 'price', 'rating', 'merged_text')
    absent = next((name for name in needed if name not in df.columns), None)
    if absent is not None:
        raise ValueError(f"Missing required column: {absent}")

    # One output column per named aggregation, computed within each item group.
    per_item = df.groupby('parent_asin').agg(
        price=('price', 'first'),
        avg_rating=('rating', 'mean'),
        num_ratings=('rating', 'count'),
        text=('merged_text', 'first'),
    )
    # Move the group key back from the index into an ordinary column.
    item_df = per_item.reset_index()

    # Report how many items were found and show the first few rows.
    n_items = item_df.shape[0]
    print(f"Extracted {n_items:,} unique items.")
    print("Preview of item metadata:")
    print(item_df.head())

    return item_df

In [7]:
# Aggregate the interaction-level frame `df` into one row per `parent_asin`;
# the function also prints a short summary and a preview of the result.
item_df = extract_unique_items(df)

Extracting unique items and building metadata table...

Extracted 137,249 unique items.
Preview of item metadata:
  parent_asin  price  avg_rating  num_ratings  \
0  0007922582     51    2.000000            1   
1  0008288194     14    3.000000            1   
2  0028179714     15    4.666667            3   
3  0060501960      8    4.470588           17   
4  0063052164     20    5.000000            4   

                                                text  
0  The Sneetches and Other Stories    Too small w...  
1  The Creativity Code   Video Games    PC    Gam...  
2  The Autobiography of Miss Jane Pittman and Rel...  
3  Presidents  Day   Video Games    PC    Games  ...  
4  Stranger Planet AUTOGRAPHED   SIGNED BOOK   Vi...  


In [8]:
def build_item_representation(df, max_features=10000, min_df=2, ngram_range=(1,2)):
    """Build a hybrid (TF-IDF text + scaled numeric) feature matrix for items.

    Parameters
    ----------
    df : pandas.DataFrame
        Item-level table with the columns ``parent_asin``, ``text``,
        ``price``, ``avg_rating`` and ``num_ratings``.
    max_features, min_df, ngram_range
        Forwarded unchanged to ``TfidfVectorizer`` (which is also created
        with ``stop_words='english'``).

    Returns
    -------
    hybrid_matrix : scipy sparse matrix
        The TF-IDF columns followed by the three scaled numeric columns;
        row ``i`` describes row ``i`` of ``df``.
    tfidf : TfidfVectorizer
        The vectorizer fitted on ``df['text']``.
    tfidf_index : pandas.Series
        ``df['parent_asin']`` with a fresh 0..n-1 index, so position ``i``
        identifies the item behind row ``i`` of ``hybrid_matrix``.
    """
    print("Building hybrid item representations (text + numeric)...\n")

    # TF-IDF vectorization for textual metadata.
    # Use the frame that was passed in (not a notebook-level variable) so the
    # text rows, the numeric rows and the lookup index all come from the same
    # table and stay aligned. Missing text is treated as an empty document.
    tfidf = TfidfVectorizer(max_features=max_features, min_df=min_df, ngram_range=ngram_range, stop_words='english')
    text_data = df['text'].fillna('').astype(str)
    tfidf_matrix = tfidf.fit_transform(text_data)

    # Select and normalize numeric features
    numeric_features = ['price', 'avg_rating', 'num_ratings']
    scaler = StandardScaler()

    # Convert to numeric safely: unparseable entries become NaN, then 0.
    numeric_data = df[numeric_features].apply(pd.to_numeric, errors="coerce").fillna(0)
    numeric_scaled = scaler.fit_transform(numeric_data)
    print(f"Numeric features scaled (columns: {numeric_features})")

    # Convert to sparse format for concatenation (after replacing any
    # remaining non-finite values), so hstack receives only sparse blocks.
    numeric_sparse = csr_matrix(np.nan_to_num(numeric_scaled))

    # Concatenate text and numeric representations column-wise
    hybrid_matrix = hstack([tfidf_matrix, numeric_sparse])
    print(f"Final hybrid matrix shape: {hybrid_matrix.shape}")

    # Maintain item lookup for interpretation: positional index -> parent_asin
    tfidf_index = df["parent_asin"].reset_index(drop=True)
    print("Created lookup table linking vectors to book titles.\n")

    return hybrid_matrix, tfidf, tfidf_index