# Goodreads Fantasy & Paranormal Dataset - Advanced EDA

This notebook implements Phase 1 of exploratory data analysis and generates innovative features for the Goodreads Fantasy & Paranormal dataset.

In [3]:
# Import necessary libraries
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import gzip
from datetime import datetime
from tqdm import tqdm
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Configure logging
# Root logger at INFO; each record shows timestamp, logger name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Suppress warnings
# NOTE(review): this filter has no category or module argument, so it hides
# every warning raised in the kernel (deprecations included) — consider narrowing it.
warnings.filterwarnings('ignore')

# Set plot style
# Seaborn "whitegrid" theme plus a 12x8 default figure size; functions that pass
# their own figsize to plt.figure() override the default.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)


%matplotlib inline
# Typeset all figure text with LaTeX (Computer Modern serif, amsmath/amssymb loaded).
# NOTE(review): with "text.usetex" enabled, matplotlib hands every title, axis
# label and tick label to TeX, so a working LaTeX installation is presumably
# required and TeX special characters in labels (e.g. "_" or "%") must be escaped.
plt.rcParams.update(
    {
        "text.usetex": True,
        "font.family": "serif",
        "font.serif": ["Computer Modern Roman"],
        "text.latex.preamble": r"\usepackage{amsmath} \usepackage{amssymb}",
        "axes.labelsize": 12,
        "font.size": 11,
        "legend.fontsize": 10,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
    }
)

## 1. Data Loading Functions

Functions to load and process the Goodreads dataset files.

In [4]:
def load_json_gz(file_path, data_type, max_rows=None):
    """Load data from a gzipped JSON-lines file.

    The file is read line by line; every line is expected to hold one JSON
    document. Blank lines are skipped silently, lines that are not valid JSON
    are reported with a warning and skipped.

    Args:
        file_path (str): Path to the gzipped JSON file.
        data_type (str): Type of data being loaded (used in progress/log output).
        max_rows (int, optional): Stop reading once this many records have been
            loaded, so that a sample can be taken without decompressing the
            whole file. Defaults to None (read everything).

    Returns:
        pd.DataFrame: DataFrame containing one row per successfully parsed line.
    """
    print(f"Loading {data_type} from {file_path}")

    records = []
    # Text mode + explicit UTF-8 so decoding does not depend on the locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Loading {data_type}"):
            if max_rows is not None and len(records) >= max_rows:
                break
            if not line.strip():
                # A blank (e.g. trailing) line is not a malformed record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Warning: Error decoding JSON line in {file_path}")

    # Convert to DataFrame
    df = pd.DataFrame(records)

    print(f"Loaded {len(df)} {data_type}")
    return df

In [5]:
# Input locations (relative to the directory the notebook is started from)
DATA_DIR = "data/interim/goodreads/"
INTERACTIONS_PATH = os.path.join(
    DATA_DIR, "goodreads_interactions_fantasy_paranormal.json.gz"
)
BOOKS_PATH = os.path.join(
    DATA_DIR, "goodreads_books_fantasy_paranormal.json.gz"
)
REVIEWS_PATH = os.path.join(
    DATA_DIR, "goodreads_reviews_fantasy_paranormal.json.gz"
)

# Output locations: make sure the results folder and its sub-folders exist
OUTPUT_DIR = "./eda_results"
for _results_dir in (
    OUTPUT_DIR,
    os.path.join(OUTPUT_DIR, "plots"),
    os.path.join(OUTPUT_DIR, "features"),
):
    os.makedirs(_results_dir, exist_ok=True)
del _results_dir

## 2. Load the Data

Load the Goodreads dataset files. You can adjust the sample size or comment out this cell if you've already loaded the data.

In [6]:
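# Load sample data (adjust sample sizes as needed).
# For initial exploration, we'll use smaller samples.
# The load calls below are commented out by default: uncomment them to load the
# samples, and leave them commented if the DataFrames are already in memory.
# Note: each call reads the whole file before the slice keeps the first N rows.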
# interactions_df = load_json_gz(INTERACTIONS_PATH, "interactions")[:50000]
# books_df = load_json_gz(BOOKS_PATH, "books")[:10000]
# reviews_df = load_json_gz(REVIEWS_PATH, "reviews")[:50000]

## 3. Dataset Integrity Analysis

Analyze dataset integrity, including missing values and inconsistencies.

In [7]:
def dataset_integrity_analysis(books_df, reviews_df=None, interactions_df=None):
    """Analyze dataset integrity, including missing values and inconsistencies.

    For every frame that is supplied, the percentage of missing values per
    column is stored and drawn as a bar chart. Book metadata is checked for
    duplicate IDs and missing titles/authors. When ``reviews_df`` has the
    columns, ``date_added`` is checked for future or unparseable timestamps and
    ``rating`` for values outside 1-5, followed by a rating count plot.

    None of the input frames is modified.

    Args:
        books_df (pd.DataFrame): Books DataFrame; must contain the ``book_id``,
            ``title`` and ``authors`` columns.
        reviews_df (pd.DataFrame, optional): Reviews DataFrame. Defaults to None.
        interactions_df (pd.DataFrame, optional): Interactions DataFrame.
            Defaults to None.

    Returns:
        dict: Missing-value percentages per frame plus the integrity counters
        (``duplicate_books``, ``books_with_missing_titles``,
        ``books_with_missing_authors`` and, when available,
        ``invalid_timestamps``, ``reviews_without_timestamps``,
        ``rating_distribution``, ``invalid_ratings``).

    Raises:
        KeyError: If ``books_df`` lacks ``book_id``, ``title`` or ``authors``.
    """
    print("Performing dataset integrity analysis")

    results = {}

    def _tex_safe(label):
        """Escape TeX special characters in a tick label when usetex is on."""
        text = str(label)
        if plt.rcParams.get("text.usetex", False):
            # Column names such as "book_id" would otherwise abort LaTeX rendering.
            for char in "_%&#":
                text = text.replace(char, "\\" + char)
        return text

    def _report_missing(frame, dataset_name, result_key, figsize):
        """Store and plot the percentage of missing values per column."""
        missing = frame.isnull().sum() / len(frame) * 100
        results[result_key] = missing.to_dict()
        if missing.empty:
            # A frame without columns has nothing to draw.
            return

        plt.figure(figsize=figsize)
        missing.sort_values(ascending=False).rename(_tex_safe).plot(kind="bar")
        plt.title(rf"\textbf{{Missing Values in {dataset_name} Dataset (\%)}}")
        plt.xlabel(r"\textbf{Column}")
        plt.ylabel(r"\textbf{Percentage Missing}")
        plt.xticks(rotation=90)
        plt.tight_layout()

    # Check for missing values
    _report_missing(books_df, "Books", "books_missing_percentages", (12, 8))
    if reviews_df is not None:
        _report_missing(reviews_df, "Reviews", "reviews_missing_percentages", (12, 6))
    if interactions_df is not None:
        _report_missing(
            interactions_df, "Interactions", "interactions_missing_percentages", (12, 6)
        )

    # Identify inconsistencies in book metadata
    results["duplicate_books"] = int(books_df.duplicated(subset=["book_id"]).sum())
    results["books_with_missing_titles"] = int(books_df["title"].isnull().sum())
    results["books_with_missing_authors"] = int(books_df["authors"].isnull().sum())

    # Validate timestamps if available
    if reviews_df is not None and "date_added" in reviews_df.columns:
        # Parse into a local Series (the caller's column is left untouched) and
        # normalise to UTC so tz-aware and naive inputs compare against "now".
        date_added = pd.to_datetime(
            reviews_df["date_added"], errors="coerce", utc=True
        )
        now_utc = pd.Timestamp.now(tz="UTC")
        results["invalid_timestamps"] = int((date_added > now_utc).sum())
        results["reviews_without_timestamps"] = int(date_added.isnull().sum())

    # Examine rating distributions
    if reviews_df is not None and "rating" in reviews_df.columns:
        ratings = reviews_df["rating"]
        rating_distribution = ratings.value_counts(normalize=True).sort_index()
        results["rating_distribution"] = rating_distribution.to_dict()

        # Check for anomalies in ratings (missing ratings count as invalid too)
        results["invalid_ratings"] = int((~ratings.between(1, 5)).sum())

        # Plot rating distribution
        plt.figure(figsize=(10, 6))
        sns.countplot(x="rating", data=reviews_df)
        plt.title(r"\textbf{Rating Distribution}")
        plt.xlabel(r"$\textbf{Rating} \in [1,5]$")
        plt.ylabel(r"\textbf{Count}")

    # Print summary of findings
    print("\nDataset Integrity Analysis Summary:")
    print(f"Books with duplicate IDs: {results['duplicate_books']}")
    print(f"Books with missing titles: {results['books_with_missing_titles']}")
    print(f"Books with missing authors: {results['books_with_missing_authors']}")

    if "invalid_timestamps" in results:
        print(f"Reviews with invalid timestamps: {results['invalid_timestamps']}")
    if "invalid_ratings" in results:
        print(f"Reviews with invalid ratings: {results['invalid_ratings']}")

    return results


In [8]:
# Run dataset integrity analysis
# integrity_results = dataset_integrity_analysis(books_df, reviews_df, interactions_df)

## 4. User Behavior Profiling

Profile user behavior, including activity levels and rating patterns.

In [9]:
def dataset_integrity_analysis(books_df, reviews_df=None, interactions_df=None):
    """Analyze dataset integrity, including missing values and inconsistencies.

    For every frame that is supplied, the percentage of missing values per
    column is stored and drawn as a bar chart. Book metadata is checked for
    duplicate IDs and missing titles/authors. When ``reviews_df`` has the
    columns, ``date_added`` is checked for future or unparseable timestamps and
    ``rating`` for values outside 1-5, followed by a rating count plot.

    None of the input frames is modified.

    Args:
        books_df (pd.DataFrame): Books DataFrame; must contain the ``book_id``,
            ``title`` and ``authors`` columns.
        reviews_df (pd.DataFrame, optional): Reviews DataFrame. Defaults to None.
        interactions_df (pd.DataFrame, optional): Interactions DataFrame.
            Defaults to None.

    Returns:
        dict: Missing-value percentages per frame plus the integrity counters
        (``duplicate_books``, ``books_with_missing_titles``,
        ``books_with_missing_authors`` and, when available,
        ``invalid_timestamps``, ``reviews_without_timestamps``,
        ``rating_distribution``, ``invalid_ratings``).

    Raises:
        KeyError: If ``books_df`` lacks ``book_id``, ``title`` or ``authors``.
    """
    # NOTE(review): this cell sits under "4. User Behavior Profiling" yet defines
    # dataset_integrity_analysis a second time; being executed later, it is the
    # definition in effect afterwards. The user_behavior_profiling function
    # that the next cell refers to still needs to be written.
    print("Performing dataset integrity analysis")

    results = {}

    def _tex_safe(label):
        """Escape TeX special characters in a tick label when usetex is on."""
        text = str(label)
        if plt.rcParams.get("text.usetex", False):
            # Column names such as "book_id" would otherwise abort LaTeX rendering.
            for char in "_%&#":
                text = text.replace(char, "\\" + char)
        return text

    def _report_missing(frame, dataset_name, result_key, figsize):
        """Store and plot the percentage of missing values per column."""
        missing = frame.isnull().sum() / len(frame) * 100
        results[result_key] = missing.to_dict()
        if missing.empty:
            # A frame without columns has nothing to draw.
            return

        plt.figure(figsize=figsize)
        missing.sort_values(ascending=False).rename(_tex_safe).plot(kind="bar")
        plt.title(rf"\textbf{{Missing Values in {dataset_name} Dataset (\%)}}")
        plt.xlabel(r"\textbf{Column}")
        plt.ylabel(r"\textbf{Percentage Missing}")
        plt.xticks(rotation=90)
        plt.tight_layout()

    # Check for missing values
    _report_missing(books_df, "Books", "books_missing_percentages", (12, 8))
    if reviews_df is not None:
        _report_missing(reviews_df, "Reviews", "reviews_missing_percentages", (12, 6))
    if interactions_df is not None:
        _report_missing(
            interactions_df, "Interactions", "interactions_missing_percentages", (12, 6)
        )

    # Identify inconsistencies in book metadata
    results["duplicate_books"] = int(books_df.duplicated(subset=["book_id"]).sum())
    results["books_with_missing_titles"] = int(books_df["title"].isnull().sum())
    results["books_with_missing_authors"] = int(books_df["authors"].isnull().sum())

    # Validate timestamps if available
    if reviews_df is not None and "date_added" in reviews_df.columns:
        # Parse into a local Series (the caller's column is left untouched) and
        # normalise to UTC so tz-aware and naive inputs compare against "now".
        date_added = pd.to_datetime(
            reviews_df["date_added"], errors="coerce", utc=True
        )
        now_utc = pd.Timestamp.now(tz="UTC")
        results["invalid_timestamps"] = int((date_added > now_utc).sum())
        results["reviews_without_timestamps"] = int(date_added.isnull().sum())

    # Examine rating distributions
    if reviews_df is not None and "rating" in reviews_df.columns:
        ratings = reviews_df["rating"]
        rating_distribution = ratings.value_counts(normalize=True).sort_index()
        results["rating_distribution"] = rating_distribution.to_dict()

        # Check for anomalies in ratings (missing ratings count as invalid too)
        results["invalid_ratings"] = int((~ratings.between(1, 5)).sum())

        # Plot rating distribution
        plt.figure(figsize=(10, 6))
        sns.countplot(x="rating", data=reviews_df)
        plt.title(r"\textbf{Rating Distribution}")
        plt.xlabel(r"$\textbf{Rating} \in [1,5]$")
        plt.ylabel(r"\textbf{Count}")

    # Print summary of findings
    print("\nDataset Integrity Analysis Summary:")
    print(f"Books with duplicate IDs: {results['duplicate_books']}")
    print(f"Books with missing titles: {results['books_with_missing_titles']}")
    print(f"Books with missing authors: {results['books_with_missing_authors']}")

    if "invalid_timestamps" in results:
        print(f"Reviews with invalid timestamps: {results['invalid_timestamps']}")
    if "invalid_ratings" in results:
        print(f"Reviews with invalid ratings: {results['invalid_ratings']}")

    return results


In [10]:
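# Run user behavior profiling
# NOTE: user_behavior_profiling is not defined anywhere above this cell (the
# preceding cell defines dataset_integrity_analysis instead), so it has to be
# defined before the call below is uncommented.
# user_behavior_results = user_behavior_profiling(reviews_df, interactions_df)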

## 5. Content Landscape Mapping

Map the content landscape, including genre analysis and author statistics.

In [11]:
def content_landscape_mapping(books_df):
    """Map the content landscape, including genre analysis and author statistics.

    Three optional analyses run depending on the columns present:

    * ``popular_shelves``: shelf frequencies (number of books carrying each
      shelf), the 50 most frequent shelves as "top genres", and up to three of
      them per book stored in a new ``primary_genres`` column.
    * ``authors``: per-author book count, mean rating and summed ratings count,
      plus the correlation between book count and log(1 + total ratings).
      Requires ``book_id``, ``average_rating`` and ``ratings_count``.
    * ``publication_year``: books per year and mean rating per year for years
      between 1800 and the current year.

    Side effects: ``books_df`` is updated in place (JSON-encoded
    ``popular_shelves``/``authors`` strings are decoded, ``publication_year``
    is coerced to numeric, ``primary_genres`` is added) and the figures are
    saved under ``OUTPUT_DIR/plots``.

    Args:
        books_df (pd.DataFrame): Books DataFrame.

    Returns:
        dict: Dictionary containing content landscape mapping results.
    """
    print("Performing content landscape mapping")

    results = {}

    def _decode_json_column(column):
        """Decode JSON-encoded strings of ``books_df[column]`` in place."""
        values = books_df[column]
        # Same probe as before (first value only), but safe on an empty frame.
        if values.empty or not isinstance(values.iloc[0], str):
            return
        try:
            books_df[column] = values.apply(
                lambda x: json.loads(x) if isinstance(x, str) else x
            )
        except (ValueError, TypeError):
            # json.JSONDecodeError is a ValueError; anything else should surface.
            print(f"Could not parse {column} as JSON")

    def _entry_labels(entries, key):
        """Return the hashable labels of a list of shelf/author entries.

        Plain values are used as they are; dict entries contribute ``entry[key]``.
        """
        labels = []
        for entry in entries:
            label = entry.get(key) if isinstance(entry, dict) else entry
            if label is not None and not isinstance(label, (dict, list)):
                labels.append(label)
        return labels

    def _tex_safe(label):
        """Escape TeX special characters in a tick label when usetex is on."""
        text = str(label)
        if plt.rcParams.get("text.usetex", False):
            for char in "_%&#":
                text = text.replace(char, "\\" + char)
        return text

    # Extract and analyze genres from popular_shelves
    if 'popular_shelves' in books_df.columns:
        _decode_json_column('popular_shelves')

        # NOTE(review): entries are assumed to be either shelf-name strings or
        # dicts carrying the shelf name under "name" — confirm against the data.
        shelf_counts = Counter()
        for shelves in books_df['popular_shelves']:
            if isinstance(shelves, list):
                shelf_counts.update(_entry_labels(shelves, 'name'))
            elif isinstance(shelves, str):
                shelf_counts[shelves] += 1

        # Get top genres
        top_genres = (
            pd.Series(shelf_counts, dtype='int64')
            .sort_values(ascending=False)
            .head(50)
        )
        results['top_genres'] = top_genres.to_dict()

        # Plot top genres (skipped when there is nothing to draw)
        if not top_genres.empty:
            plt.figure(figsize=(15, 10))
            top_genres.head(20).rename(_tex_safe).plot(kind='barh')
            plt.title('Top 20 Genres')
            plt.xlabel('Count')
            plt.ylabel('Genre')
            plt.tight_layout()
            plt.savefig(os.path.join(OUTPUT_DIR, "plots", "top_genres.png"))
            plt.show()

        # Extract primary genres for each book
        top_genre_names = set(top_genres.index)

        def extract_genres(shelves, top_n=3):
            if not isinstance(shelves, list):
                return []
            matching = [
                name for name in _entry_labels(shelves, 'name')
                if name in top_genre_names
            ]
            return matching[:top_n]

        books_df['primary_genres'] = books_df['popular_shelves'].apply(extract_genres)

        # Count books per genre
        genre_counts = Counter()
        for genres in books_df['primary_genres']:
            genre_counts.update(genres)

        results['books_per_genre'] = dict(genre_counts)

    # Analyze author productivity and popularity
    if 'authors' in books_df.columns:
        _decode_json_column('authors')

        def _author_labels(authors):
            # NOTE(review): dict entries are assumed to identify the author via
            # "author_id" — confirm against the data.
            if isinstance(authors, list):
                return _entry_labels(authors, 'author_id')
            if isinstance(authors, dict):
                return authors.get('author_id')
            return authors

        # One row per (book, author); built on a copy so that books_df keeps
        # its original authors/rating columns. Ratings are coerced to numbers
        # because mean/sum on string columns would fail or concatenate.
        authors_df = books_df.assign(
            authors=books_df['authors'].apply(_author_labels),
            average_rating=pd.to_numeric(books_df['average_rating'], errors='coerce'),
            ratings_count=pd.to_numeric(books_df['ratings_count'], errors='coerce'),
        ).explode('authors')

        # Group by author and calculate statistics
        author_stats = authors_df.groupby('authors').agg(
            book_count=('book_id', 'count'),
            avg_rating=('average_rating', 'mean'),
            total_ratings=('ratings_count', 'sum')
        )

        # Get top authors by book count
        top_authors_by_books = author_stats.sort_values('book_count', ascending=False).head(20)
        results['top_authors_by_books'] = top_authors_by_books['book_count'].to_dict()

        # Get top authors by average rating (with minimum 3 books)
        top_authors_by_rating = (
            author_stats[author_stats['book_count'] >= 3]
            .sort_values('avg_rating', ascending=False)
            .head(20)
        )
        results['top_authors_by_rating'] = top_authors_by_rating['avg_rating'].to_dict()

        # Plot top authors by book count
        if not top_authors_by_books.empty:
            plt.figure(figsize=(15, 10))
            top_authors_by_books['book_count'].rename(_tex_safe).plot(kind='barh')
            plt.title('Top 20 Authors by Book Count')
            plt.xlabel('Number of Books')
            plt.ylabel('Author')
            plt.tight_layout()
            plt.savefig(os.path.join(OUTPUT_DIR, "plots", "top_authors_by_books.png"))
            plt.show()

        # Analyze correlation between author productivity and popularity
        author_stats['log_total_ratings'] = np.log1p(author_stats['total_ratings'])
        correlation = author_stats['book_count'].corr(author_stats['log_total_ratings'])
        results['author_productivity_popularity_correlation'] = correlation

        # Plot correlation
        plt.figure(figsize=(10, 8))
        sns.scatterplot(x='book_count', y='log_total_ratings', data=author_stats)
        plt.title(f'Author Productivity vs. Popularity (Correlation: {correlation:.2f})')
        plt.xlabel('Number of Books')
        plt.ylabel('Log(Total Ratings)')
        plt.savefig(os.path.join(OUTPUT_DIR, "plots", "author_productivity_popularity.png"))
        plt.show()

    # Analyze publication years
    if 'publication_year' in books_df.columns:
        # Convert to numeric and handle errors
        books_df['publication_year'] = pd.to_numeric(books_df['publication_year'], errors='coerce')

        # Filter out invalid years
        year = books_df['publication_year']
        valid_years = books_df[(year >= 1800) & (year <= datetime.now().year)]

        # Count books per year
        year_counts = valid_years['publication_year'].value_counts().sort_index()
        results['books_per_year'] = year_counts.to_dict()

        # Plot publication year distribution
        if not year_counts.empty:
            plt.figure(figsize=(15, 6))
            year_counts.plot()
            plt.title('Books Published per Year')
            plt.xlabel('Year')
            plt.ylabel('Number of Books')
            plt.savefig(os.path.join(OUTPUT_DIR, "plots", "publication_year_distribution.png"))
            plt.show()

        # Analyze ratings by publication year
        if 'average_rating' in books_df.columns:
            year_ratings = (
                valid_years
                .assign(average_rating=pd.to_numeric(valid_years['average_rating'], errors='coerce'))
                .groupby('publication_year')['average_rating']
                .mean()
            )
            results['avg_rating_by_year'] = year_ratings.to_dict()

            # Plot ratings by year
            if not year_ratings.empty:
                plt.figure(figsize=(15, 6))
                year_ratings.plot()
                plt.title('Average Rating by Publication Year')
                plt.xlabel('Year')
                plt.ylabel('Average Rating')
                plt.savefig(os.path.join(OUTPUT_DIR, "plots", "ratings_by_publication_year.png"))
                plt.show()

    # Print summary of findings
    print("\nContent Landscape Mapping Summary:")

    if 'top_genres' in results:
        print("\nTop 10 Genres:")
        for genre, count in list(results['top_genres'].items())[:10]:
            print(f"  {genre}: {count:,} books")

    if 'top_authors_by_books' in results:
        print("\nTop 5 Most Prolific Authors:")
        for author, count in list(results['top_authors_by_books'].items())[:5]:
            print(f"  {author}: {count:,} books")

    if 'author_productivity_popularity_correlation' in results:
        print(f"\nAuthor productivity-popularity correlation: {results['author_productivity_popularity_correlation']:.2f}")

    return results

In [12]:
# Run content landscape mapping
# content_landscape_results = content_landscape_mapping(books_df)

## 6. Innovative Feature Generation

Generate innovative features for the Goodreads dataset.

### 6.1 Reading Pattern Fingerprints

Create a unique signature of a user's reading habits, capturing temporal patterns.

In [13]:
def generate_reading_pattern_fingerprints(reviews_df, min_interactions=10):
    """Generate reading pattern fingerprints for users.

    A fingerprint holds, for one user, the normalised distribution of the hour,
    weekday and month of ``date_added`` and (when a ``rating`` column exists)
    of the ratings given. Only rows with a parseable ``date_added`` are used,
    and only users with at least ``min_interactions`` such rows are kept.
    The fingerprint of the most active user is plotted and saved to
    ``OUTPUT_DIR/plots/sample_reading_pattern.png``.

    ``reviews_df`` itself is not modified.

    Args:
        reviews_df (pd.DataFrame): Reviews DataFrame with ``user_id`` and
            ``date_added`` columns.
        min_interactions (int, optional): Minimum number of interactions per user. Defaults to 10.

    Returns:
        dict: Dictionary mapping user IDs (as strings) to reading pattern fingerprints.
    """
    print("Generating reading pattern fingerprints")

    fingerprints = {}

    # Ensure date_added is available
    if 'date_added' not in reviews_df.columns:
        print("No date_added column found for generating reading pattern fingerprints")
        return fingerprints

    # Parse date_added into a local Series instead of overwriting the caller's column.
    try:
        parsed_dates = pd.to_datetime(reviews_df['date_added'], errors='coerce')
    except ValueError:
        parsed_dates = None
    if parsed_dates is None or not pd.api.types.is_datetime64_any_dtype(parsed_dates):
        # Plain parsing did not yield a datetime column (presumably mixed UTC
        # offsets in the input); fall back to UTC so the .dt accessor works.
        # NOTE(review): hours are then expressed in UTC, not local time.
        parsed_dates = pd.to_datetime(reviews_df['date_added'], errors='coerce', utc=True)
    dated_reviews = reviews_df.assign(date_added=parsed_dates).dropna(subset=['date_added'])

    # Get active users (with at least min_interactions interactions)
    interaction_counts = dated_reviews['user_id'].value_counts()
    active_users = interaction_counts[interaction_counts >= min_interactions].index

    # Group once so every user is a cheap lookup rather than a full-frame scan.
    reviews_by_user = dated_reviews.groupby('user_id', sort=False)

    # Generate fingerprints for active users
    for user_id in tqdm(active_users, desc="Generating reading pattern fingerprints"):
        user_reviews = reviews_by_user.get_group(user_id)
        added = user_reviews['date_added'].dt

        # Rating distribution pattern (if available)
        rating_pattern = {}
        if 'rating' in user_reviews.columns:
            rating_pattern = user_reviews['rating'].value_counts(normalize=True).to_dict()

        # Combine into fingerprint
        fingerprints[str(user_id)] = {
            'time_pattern': added.hour.value_counts(normalize=True).to_dict(),
            'day_pattern': added.dayofweek.value_counts(normalize=True).to_dict(),
            'month_pattern': added.month.value_counts(normalize=True).to_dict(),
            'rating_pattern': rating_pattern
        }

    # Visualize a sample fingerprint (first entry = most active user)
    if fingerprints:
        sample_fingerprint = next(iter(fingerprints.values()))
        panels = [
            ('time_pattern', 'Time of Day Pattern', 'Hour'),
            ('day_pattern', 'Day of Week Pattern', 'Day (0=Monday)'),
            ('month_pattern', 'Month Pattern', 'Month'),
        ]

        plt.figure(figsize=(15, 5))
        for position, (pattern_key, title, xlabel) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            pattern = pd.Series(sample_fingerprint[pattern_key])
            pattern.index = pd.to_numeric(pattern.index)
            pattern.sort_index().plot(kind='bar')
            plt.title(title)
            plt.xlabel(xlabel)
            plt.ylabel('Proportion')

        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, "plots", "sample_reading_pattern.png"))
        plt.show()

    print(f"Generated reading pattern fingerprints for {len(fingerprints)} users")
    return fingerprints

In [14]:
# Generate reading pattern fingerprints
# reading_fingerprints = generate_reading_pattern_fingerprints(reviews_df)

### 6.2 Genre Exploration Index

Measure how much a user explores different genres.

In [15]:
def generate_genre_exploration_indices(reviews_df, books_df, min_interactions=5):
    """Generate genre exploration indices for users.

    For every user with at least ``min_interactions`` reviews, the primary
    genres of the reviewed books are pooled (one contribution per review) and
    summarised as the number of distinct genres, the Shannon entropy (natural
    log) of the genre distribution, and their product, the exploration index.
    Users whose books have no known genres are left out.

    Two figures are saved under ``OUTPUT_DIR/plots``.

    Args:
        reviews_df (pd.DataFrame): Reviews DataFrame with ``user_id`` and ``book_id``.
        books_df (pd.DataFrame): Books DataFrame with ``book_id`` and
            ``primary_genres`` columns.
        min_interactions (int, optional): Minimum number of interactions per user. Defaults to 5.

    Returns:
        dict: Dictionary mapping user IDs (as strings) to genre exploration indices.
    """
    print("Generating genre exploration indices")

    indices = {}

    # Ensure primary_genres is available
    if 'primary_genres' not in books_df.columns:
        print("No primary_genres column found. Run content_landscape_mapping first.")
        return indices

    # book_id -> genres lookup built once; for duplicated book IDs the first
    # row wins, as it did with the former per-review frame scan.
    genres_by_book = {}
    for book_id, genres in zip(books_df['book_id'], books_df['primary_genres']):
        genres_by_book.setdefault(book_id, genres)

    # Get active users (with at least min_interactions interactions)
    interaction_counts = reviews_df['user_id'].value_counts()
    active_users = interaction_counts[interaction_counts >= min_interactions].index

    # Group once so each user's books are a lookup rather than a frame scan.
    books_by_user = reviews_df.groupby('user_id', sort=False)['book_id']

    # Generate indices for active users
    for user_id in tqdm(active_users, desc="Generating genre exploration indices"):
        genre_counts = Counter()
        for book_id in books_by_user.get_group(user_id):
            book_genres = genres_by_book.get(book_id)
            if isinstance(book_genres, list):
                genre_counts.update(book_genres)

        # Skip if no genres found
        if not genre_counts:
            continue

        distinct_genres = len(genre_counts)

        # Calculate entropy of genre distribution
        total = sum(genre_counts.values())
        genre_probs = [count / total for count in genre_counts.values()]
        entropy = -sum(p * np.log(p) for p in genre_probs if p > 0)
        # Adding 0.0 turns the "-0.0" of a single-genre reader into a plain 0.0.
        entropy = float(entropy) + 0.0

        # Store results
        indices[str(user_id)] = {
            'unique_genres': distinct_genres,
            'genre_entropy': entropy,
            'exploration_index': distinct_genres * entropy,
            'genre_counts': dict(genre_counts)
        }

    # Visualize distribution of exploration indices
    if indices:
        exploration_values = [data['exploration_index'] for data in indices.values()]

        plt.figure(figsize=(12, 6))
        sns.histplot(exploration_values, bins=50, kde=True)
        plt.title('Distribution of Genre Exploration Indices')
        plt.xlabel('Exploration Index')
        plt.ylabel('Number of Users')
        plt.savefig(os.path.join(OUTPUT_DIR, "plots", "genre_exploration_distribution.png"))
        plt.show()

        # Show correlation between unique genres and entropy
        genre_totals = [data['unique_genres'] for data in indices.values()]
        entropies = [data['genre_entropy'] for data in indices.values()]

        plt.figure(figsize=(10, 8))
        plt.scatter(genre_totals, entropies, alpha=0.5)
        plt.title('Relationship Between Unique Genres and Genre Entropy')
        plt.xlabel('Number of Unique Genres')
        plt.ylabel('Genre Entropy')
        plt.savefig(os.path.join(OUTPUT_DIR, "plots", "genre_exploration_components.png"))
        plt.show()

    print(f"Generated genre exploration indices for {len(indices)} users")
    return indices

In [16]:
# Generate genre exploration indices
# Note: Run content_landscape_mapping first to create primary_genres
# genre_indices = generate_genre_exploration_indices(reviews_df, books_df)

### 6.3 Narrative Complexity Score

Analyze book descriptions for complexity indicators.

In [17]:
def generate_narrative_complexity_scores(books_df):
    """Generate narrative complexity scores for books based on descriptions.

    Every book with a non-null ``description`` gets a score in [0, 1] that
    combines average sentence length (weight 0.3), vocabulary richness
    (0.4) and the share of capitalised words that do not open a sentence (0.3).
    Descriptions with fewer than 3 sentences or 20 words score 0. Each score is
    also ranked as the percentage of scored books with a score less than or
    equal to it.

    Two figures are saved under ``OUTPUT_DIR/plots`` (the second only when
    ``books_df`` has an ``average_rating`` column).

    Args:
        books_df (pd.DataFrame): Books DataFrame with ``book_id`` and
            ``description`` columns.

    Returns:
        dict: Dictionary mapping book IDs (as strings) to
        ``{'complexity_score': ..., 'complexity_percentile': ...}``.
    """
    print("Generating narrative complexity scores")

    complexity_scores = {}

    # Ensure description is available
    if 'description' not in books_df.columns:
        print("No description column found for generating narrative complexity scores")
        return complexity_scores

    # Filter books with descriptions
    books_with_desc = books_df.dropna(subset=['description'])

    # Define complexity metrics
    def calculate_complexity(description):
        if not isinstance(description, str):
            return 0

        # Clean description
        description = re.sub(r'<.*?>', '', description)  # Remove HTML tags

        # Split into non-empty sentences
        sentences = [s.strip() for s in re.split(r'[.!?]+', description) if s.strip()]
        sentence_count = len(sentences)

        # Count words
        words = re.findall(r'\b\w+\b', description.lower())
        word_count = len(words)

        # Skip if too short
        if sentence_count < 3 or word_count < 20:
            return 0

        avg_sentence_length = word_count / sentence_count

        # Vocabulary richness (unique words ratio)
        vocabulary_richness = len(set(words)) / word_count

        # Named-entity approximation: capitalised words that are NOT the first
        # word of their sentence. (The former lookbehind on ". " never matched
        # because the sentence punctuation had already been split away, so
        # every sentence opener was counted as an entity.)
        entity_count = 0
        for sentence in sentences:
            parts = sentence.split(None, 1)
            if len(parts) == 2:
                entity_count += len(re.findall(r'\b[A-Z][a-z]+\b', parts[1]))
        entity_ratio = entity_count / word_count

        # Calculate complexity score
        return (
            0.3 * min(avg_sentence_length / 20, 1) +  # Normalize to 0-1
            0.4 * vocabulary_richness +
            0.3 * min(entity_ratio * 10, 1)  # Normalize to 0-1
        )

    # Calculate complexity scores (column-wise iteration instead of iterrows)
    id_description_pairs = zip(books_with_desc['book_id'], books_with_desc['description'])
    for book_id, description in tqdm(
        id_description_pairs,
        total=len(books_with_desc),
        desc="Calculating narrative complexity",
    ):
        complexity_scores[str(book_id)] = {
            'complexity_score': calculate_complexity(description)
        }

    # Calculate percentiles: one sort plus a binary search per score gives the
    # number of scores <= each score without comparing every pair of books.
    scored_ids = list(complexity_scores)
    scores_array = np.array(
        [complexity_scores[book_id]['complexity_score'] for book_id in scored_ids],
        dtype=float,
    )
    counts_at_or_below = np.searchsorted(np.sort(scores_array), scores_array, side='right')
    percentiles = {
        book_id: {
            'complexity_score': complexity_scores[book_id]['complexity_score'],
            'complexity_percentile': float(count) / len(scores_array) * 100
        }
        for book_id, count in zip(scored_ids, counts_at_or_below)
    }

    # Visualize distribution of complexity scores
    if complexity_scores:
        complexity_values = [data['complexity_score'] for data in complexity_scores.values()]

        plt.figure(figsize=(12, 6))
        sns.histplot(complexity_values, bins=50, kde=True)
        plt.title('Distribution of Narrative Complexity Scores')
        plt.xlabel('Complexity Score')
        plt.ylabel('Number of Books')
        plt.savefig(os.path.join(OUTPUT_DIR, "plots", "narrative_complexity_distribution.png"))
        plt.show()

        # Show relationship between complexity and ratings if available
        if 'average_rating' in books_df.columns:
            # Create a DataFrame with complexity scores and ratings
            complexity_df = pd.DataFrame({
                'book_id': scored_ids,
                'complexity_score': complexity_values
            })

            # The score keys are strings, so the join key on the books side is
            # cast to str as well (merging str with numeric IDs raises).
            # Ratings are coerced to numbers for the scatter plot.
            ratings = pd.DataFrame({
                'book_id': books_df['book_id'].astype(str),
                'average_rating': pd.to_numeric(books_df['average_rating'], errors='coerce'),
            })
            complexity_df = complexity_df.merge(ratings, on='book_id', how='inner')

            # Plot relationship
            plt.figure(figsize=(10, 8))
            sns.scatterplot(x='complexity_score', y='average_rating', data=complexity_df, alpha=0.5)
            plt.title('Relationship Between Narrative Complexity and Ratings')
            plt.xlabel('Complexity Score')
            plt.ylabel('Average Rating')
            plt.savefig(os.path.join(OUTPUT_DIR, "plots", "complexity_vs_ratings.png"))
            plt.show()

    print(f"Generated narrative complexity scores for {len(complexity_scores)} books")
    return percentiles

In [18]:
# Generate narrative complexity scores
# complexity_scores = generate_narrative_complexity_scores(books_df)

## 7. Identify Most Promising Features

Identify the most promising features based on analysis.

In [19]:
def identify_promising_features(reading_fingerprints=None, genre_indices=None, complexity_scores=None):
    """Identify the most promising features based on analysis.

    Builds one summary entry per supplied feature family, attaches a promise
    score and rationale to each, and ranks every entry by that score. The
    promise scores are fixed constants written in this function; they are not
    derived from the data passed in.

    Args:
        reading_fingerprints (dict, optional): Reading pattern fingerprints, keyed
            by user. Each value is read via ``.get`` for the 'time_pattern',
            'day_pattern' and 'month_pattern' sub-dicts. Defaults to None.
        genre_indices (dict, optional): Genre exploration indices. Each value is
            read via ``.get('exploration_index', 0)``. Defaults to None.
        complexity_scores (dict, optional): Narrative complexity scores. Each value
            is read via ``.get('complexity_score', 0)``. Defaults to None.

    Returns:
        dict: One entry per feature family (each holding at least 'promise_score'
        and 'rationale'), plus 'ranked_features': a list of
        ``{'name', 'score', 'rationale'}`` dicts sorted by descending score.
    """
    print("Identifying promising features")

    promising_features = {}

    # 1. Analyze reading pattern fingerprints
    if reading_fingerprints:
        promising_features['reading_patterns'] = {
            'time_patterns': _aggregate_pattern_shares(reading_fingerprints, 'time_pattern'),
            'day_patterns': _aggregate_pattern_shares(reading_fingerprints, 'day_pattern'),
            'month_patterns': _aggregate_pattern_shares(reading_fingerprints, 'month_pattern'),
            'user_count': len(reading_fingerprints),
            'promise_score': 8.5,
            'rationale': "Reading pattern fingerprints capture temporal behavior that's highly predictive of user engagement and can help optimize recommendation timing."
        }

    # 2. Analyze genre exploration indices
    if genre_indices:
        exploration_stats = _score_statistics(genre_indices, 'exploration_index')
        if exploration_stats:
            promising_features['genre_exploration'] = {
                **exploration_stats,
                'user_count': len(genre_indices),
                'promise_score': 9.0,
                'rationale': "Genre exploration index effectively captures user openness to diverse recommendations and can help balance familiarity with discovery."
            }

    # 3. Analyze narrative complexity scores
    if complexity_scores:
        complexity_stats = _score_statistics(complexity_scores, 'complexity_score')
        if complexity_stats:
            promising_features['narrative_complexity'] = {
                **complexity_stats,
                'book_count': len(complexity_scores),
                'promise_score': 7.5,
                'rationale': "Narrative complexity provides a dimension beyond genre for matching books to reader preferences and can help identify content similarity not captured by metadata."
            }

    # Add other promising features (no data-driven statistics; score and rationale only)
    promising_features['content_behavior_alignment'] = {
        'promise_score': 9.5,
        'rationale': "Content-behavior alignment reveals the gap between stated and revealed preferences, which is crucial for understanding user satisfaction with recommendations."
    }

    promising_features['trend_adoption'] = {
        'promise_score': 8.0,
        'rationale': "Trend adoption timing identifies early adopters who can serve as recommendation seeds and helps tailor content freshness to user preferences."
    }

    # Rank features by promise score. The ranking is computed before the
    # 'ranked_features' key is added, so only real feature entries are sorted.
    ranked_entries = sorted(
        promising_features.items(),
        key=lambda item: item[1]['promise_score'],
        reverse=True
    )

    promising_features['ranked_features'] = [
        {'name': name, 'score': entry['promise_score'], 'rationale': entry['rationale']}
        for name, entry in ranked_entries
    ]

    # Print ranked features
    print("\nMost Promising Features (Ranked):")
    for i, feature in enumerate(promising_features['ranked_features'], 1):
        print(f"\n{i}. {feature['name']} (Score: {feature['score']}/10)")
        print(f"   {feature['rationale']}")

    return promising_features


def _aggregate_pattern_shares(fingerprints, pattern_key):
    """Sum one pattern across all users and scale the sums to shares of the total."""
    totals = {}
    for user_data in fingerprints.values():
        for bucket, value in user_data.get(pattern_key, {}).items():
            totals[bucket] = totals.get(bucket, 0) + value

    grand_total = sum(totals.values())
    if not grand_total:
        # Empty pattern or all-zero counts: return the raw sums instead of
        # dividing by zero.
        return totals

    return {bucket: value / grand_total for bucket, value in totals.items()}


def _score_statistics(entries, score_key):
    """Return mean/median/std of ``score_key`` across entries, or {} when there are none."""
    scores = [entry.get(score_key, 0) for entry in entries.values()]
    if not scores:
        return {}

    return {
        f'mean_{score_key}': np.mean(scores),
        f'median_{score_key}': np.median(scores),
        f'std_{score_key}': np.std(scores),
    }

In [20]:
# Identify promising features
# Note: Run the feature generation cells first
# promising_features = identify_promising_features(
#     reading_fingerprints=reading_fingerprints,
#     genre_indices=genre_indices,
#     complexity_scores=complexity_scores
# )

## 8. Run Full Analysis

Run the full exploratory data analysis pipeline.

In [21]:
def run_full_analysis(books_df, reviews_df=None, interactions_df=None):
    """Execute every stage of the exploratory data analysis in sequence.

    Runs the three Phase 1 analyses, generates the three feature sets, ranks the
    features, writes the summary report, and hands back everything collected.

    Args:
        books_df (pd.DataFrame): Books DataFrame.
        reviews_df (pd.DataFrame, optional): Reviews DataFrame. Defaults to None.
        interactions_df (pd.DataFrame, optional): Interactions DataFrame. Defaults to None.

    Returns:
        dict: Results keyed by stage name: 'integrity_analysis',
        'user_behavior_profiling', 'content_landscape_mapping',
        'reading_fingerprints', 'genre_indices', 'complexity_scores' and
        'promising_features'.
    """
    print("Running full exploratory data analysis")

    # Phase 1 stages; dict entries are evaluated top to bottom, so the stages
    # run in the order listed.
    results = {
        'integrity_analysis': dataset_integrity_analysis(books_df, reviews_df, interactions_df),
        'user_behavior_profiling': user_behavior_profiling(reviews_df, interactions_df),
        'content_landscape_mapping': content_landscape_mapping(books_df),
    }

    # Innovative feature generation
    fingerprints = generate_reading_pattern_fingerprints(reviews_df)
    exploration = generate_genre_exploration_indices(reviews_df, books_df)
    complexity = generate_narrative_complexity_scores(books_df)

    results.update(
        reading_fingerprints=fingerprints,
        genre_indices=exploration,
        complexity_scores=complexity,
    )

    # Feature ranking
    results['promising_features'] = identify_promising_features(
        reading_fingerprints=fingerprints,
        genre_indices=exploration,
        complexity_scores=complexity,
    )

    # Persist the Markdown summary
    generate_summary_report(results, books_df, reviews_df, interactions_df)

    print("Full exploratory data analysis completed")
    return results

In [22]:
def generate_summary_report(results, books_df, reviews_df=None, interactions_df=None):
    """Generate a Markdown summary report of the exploratory data analysis.

    The report is assembled section by section from whichever keys are present
    in ``results`` and written as UTF-8 to ``analysis_report.md`` inside
    ``OUTPUT_DIR``. The output directory is created if it does not exist.

    Args:
        results (dict): Dictionary containing all analysis results.
        books_df (pd.DataFrame): Books DataFrame.
        reviews_df (pd.DataFrame, optional): Reviews DataFrame. Defaults to None.
        interactions_df (pd.DataFrame, optional): Interactions DataFrame. Defaults to None.
    """
    print("Generating summary report")

    report = []

    # Add header
    report.append("# Goodreads Fantasy & Paranormal Dataset Analysis Report")
    report.append("\n## Summary")

    # Add dataset summary
    report.append(f"\n- **Books**: {len(books_df):,}")

    if interactions_df is not None:
        report.append(f"- **Interactions**: {len(interactions_df):,}")

    if reviews_df is not None:
        report.append(f"- **Reviews**: {len(reviews_df):,}")

        if 'user_id' in reviews_df.columns:
            report.append(f"- **Users**: {reviews_df['user_id'].nunique():,}")

    # Add integrity analysis summary
    if 'integrity_analysis' in results:
        integrity = results['integrity_analysis']
        report.append("\n## Dataset Integrity")

        if 'books_missing_percentages' in integrity:
            missing_books = integrity['books_missing_percentages']
            report.append("\n### Missing Data in Books")
            # Most-incomplete fields first; fields with no missing data are omitted.
            for field, pct in sorted(missing_books.items(), key=lambda x: x[1], reverse=True):
                if pct > 0:
                    report.append(f"- **{field}**: {pct:.2f}%")

        if 'duplicate_books' in integrity:
            report.append(f"\n- **Duplicate Books**: {integrity['duplicate_books']}")

    # Add user behavior summary
    if 'user_behavior_profiling' in results:
        behavior = results['user_behavior_profiling']
        report.append("\n## User Behavior")

        if 'user_segments' in behavior:
            report.append("\n### User Segments")
            for segment, count in behavior['user_segments'].items():
                report.append(f"- **{segment}**: {int(count):,} users")

        if 'avg_books_per_month' in behavior:
            report.append(f"\n- **Average Books per Month**: {behavior['avg_books_per_month']:.2f}")

    # Add content landscape summary
    if 'content_landscape_mapping' in results:
        landscape = results['content_landscape_mapping']
        report.append("\n## Content Landscape")

        if 'top_genres' in landscape:
            report.append("\n### Top Genres")
            # Counts are coerced with int() before sorting and formatting.
            genres_by_count = sorted(
                landscape['top_genres'].items(), key=lambda x: int(x[1]), reverse=True
            )
            for genre, count in genres_by_count[:10]:
                report.append(f"- **{genre}**: {int(count):,} books")

        if 'author_productivity_popularity_correlation' in landscape:
            corr = landscape['author_productivity_popularity_correlation']
            report.append(f"\n- **Author Productivity-Popularity Correlation**: {corr:.2f}")

    # Add promising features summary
    if 'promising_features' in results and 'ranked_features' in results['promising_features']:
        report.append("\n## Most Promising Features")

        for feature in results['promising_features']['ranked_features']:
            report.append(f"\n### {feature['name']} (Score: {feature['score']:.1f}/10)")
            report.append(f"\n{feature['rationale']}")

    # Write report to file
    report_path = os.path.join(OUTPUT_DIR, "analysis_report.md")
    # Make sure the target directory exists; fall back to '.' when the path has
    # no directory component.
    os.makedirs(os.path.dirname(report_path) or '.', exist_ok=True)
    # Explicit UTF-8 so non-ASCII genre or field names are written the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report))

    print(f"Summary report generated and saved to {report_path}")

In [23]:
# Run the full analysis
# Note: This will run all the previous analyses and may take some time
# all_results = run_full_analysis(books_df, reviews_df, interactions_df)

## 9. Conclusion

This notebook has implemented Phase 1 of exploratory data analysis for the Goodreads Fantasy & Paranormal dataset and generated innovative features that can be used for recommendation systems.

The most promising features, ranked by the promise scores assigned in Section 7, are:

1. **Content-Behavior Alignment**: Reveals the gap between stated and revealed preferences
2. **Genre Exploration Index**: Captures user openness to diverse recommendations
3. **Reading Pattern Fingerprints**: Captures temporal behavior patterns
4. **Trend Adoption Timing**: Identifies early adopters vs. late majority readers
5. **Narrative Complexity**: Provides a dimension beyond genre for matching books

These features can be integrated into recommendation models to improve personalization and recommendation quality.