In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


<br>
Goodreads Fantasy Books Exploratory Data Analysis<br>
------------------------------------------------<br>
This script explores various aspects of the Goodreads fantasy books dataset:<br>
1. Basic Statistics and Distributions<br>
2. User Behavior Analysis<br>
3. Book Characteristics<br>
4. Network Analysis<br>
Author: Nicolas<br>


In [None]:
import datetime as dt
import json
import os
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

Set visualization style

In [None]:
# Notebook-wide plotting defaults: ggplot style sheet and a 12x8 inch default figure.
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (12, 8)
# NOTE(review): sns.set() applies seaborn's own default theme after the ggplot
# style sheet was selected, so it presumably overrides part of that styling -
# confirm which look is intended.
sns.set(font_scale=1.2)

Pandas display options

In [None]:
# Never truncate the column list when a frame is displayed, and allow
# 200-character-wide output before pandas wraps a frame onto several lines.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

Data paths

In [None]:
# Location of the filtered Goodreads files, relative to the working directory
# the notebook is run from.
DATA_DIR = "data/interim/goodreads/"
# Each file is read line by line by load_json_in_chunks (one JSON document per line).
BOOKS_FILE = os.path.join(DATA_DIR, "fantasy_books_filtered.json")
REVIEWS_FILE = os.path.join(DATA_DIR, "fantasy_reviews_filtered.json")
INTERACTIONS_FILE = os.path.join(DATA_DIR, "fantasy_interactions_filtered.json")

In [None]:
def load_json_in_chunks(filename, chunk_size=10000, max_rows=None):
    """Load a JSON-lines file (one JSON document per line) into a list.

    Args:
        filename: Path of the file to read.
        chunk_size: Accepted for backward compatibility only. Every record ends
            up in the single returned list, so buffering in chunks never
            changed the result or the peak memory use.
        max_rows: Maximum number of input lines to scan. Blank and malformed
            lines count towards the limit. ``None`` reads the whole file and
            ``0`` reads nothing.

    Returns:
        list: The decoded JSON documents, in file order. Blank lines and lines
        that are not valid JSON are skipped; the number of malformed lines is
        reported on stdout when it is non-zero.
    """
    records = []
    skipped = 0
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding (which is not UTF-8 on every system).
    with open(filename, "r", encoding="utf-8") as f:
        progress = tqdm(f, desc=f"Loading {os.path.basename(filename)}")
        for i, line in enumerate(progress):
            # Compare against None explicitly so that max_rows=0 means
            # "load nothing" rather than "no limit".
            if max_rows is not None and i >= max_rows:
                break
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort loading: keep going, but remember the loss so it
                # can be reported instead of disappearing silently.
                skipped += 1
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {os.path.basename(filename)}")
    return records

In [None]:
def load_sample_data(sample_size=1000):
    """Read the first ``sample_size`` lines of each dataset file.

    Returns:
        tuple: ``(df_books, df_reviews, df_interactions)`` DataFrames built
        from BOOKS_FILE, REVIEWS_FILE and INTERACTIONS_FILE respectively.
    """
    print("Loading samples of each dataset...")

    # Files are visited in the same order as the returned tuple.
    sources = (BOOKS_FILE, REVIEWS_FILE, INTERACTIONS_FILE)
    df_books, df_reviews, df_interactions = (
        pd.DataFrame(load_json_in_chunks(path, max_rows=sample_size))
        for path in sources
    )
    return df_books, df_reviews, df_interactions

In [None]:
def load_larger_data(books_max=None, reviews_max=50000, interactions_max=100000):
    """Load the working datasets, capping the number of lines read per file.

    Args:
        books_max: Line limit for BOOKS_FILE (``None`` passes no limit on).
        reviews_max: Line limit for REVIEWS_FILE.
        interactions_max: Line limit for INTERACTIONS_FILE.

    Returns:
        tuple: ``(df_books, df_reviews, df_interactions)``.
    """
    print("Loading larger dataset samples...")

    def read_frame(path, limit, label):
        # Load one file into a DataFrame and report how many rows it produced.
        frame = pd.DataFrame(load_json_in_chunks(path, max_rows=limit))
        print(f"Loaded {len(frame)} {label}")
        return frame

    # Read order is books, interactions, reviews; the return order differs.
    df_books = read_frame(BOOKS_FILE, books_max, "books")
    df_interactions = read_frame(INTERACTIONS_FILE, interactions_max, "interactions")
    df_reviews = read_frame(REVIEWS_FILE, reviews_max, "reviews")
    return df_books, df_reviews, df_interactions

In [None]:
def explore_data_structure(df_books, df_reviews, df_interactions):
    """Print the column names and the first two rows of each DataFrame.

    All three schemas are printed first, followed by the three row samples.
    """
    print("\n--- DATA STRUCTURE EXPLORATION ---")

    # (capitalised label, lower-case label, frame) in reporting order.
    datasets = (
        ("Books", "books", df_books),
        ("Reviews", "reviews", df_reviews),
        ("Interactions", "interactions", df_interactions),
    )

    for heading, _, frame in datasets:
        print(f"\n{heading} dataset schema:")
        print(frame.columns.tolist())

    for _, label, frame in datasets:
        print(f"\nSample {label} data:")
        print(frame.head(2).to_string())

In [None]:
def analyze_user_activity(df_interactions, df_reviews):
    """Summarise and plot how many reviews each user wrote.

    Activity is counted from ``df_reviews["user_id"]``; ``df_interactions`` is
    accepted but not read. Prints summary statistics and the ten most active
    users, and writes a two-panel histogram to
    ``user_activity_distribution.png`` in the working directory.
    """
    print("\n--- 1.1 USER ACTIVITY ANALYSIS ---")

    # value_counts() orders by frequency, so the head holds the busiest users.
    per_user = df_reviews["user_id"].value_counts()

    print("\nUser Interaction Statistics:")
    print(f"Total unique users: {len(per_user)}")
    print(per_user.describe())

    # (subplot slot, extra histplot options, title, y-axis label)
    # The second panel uses a log-scaled count axis to expose the tail.
    panels = (
        (1, {}, "Distribution of Interactions per User", "Count of Users"),
        (
            2,
            {"log_scale": (False, True)},
            "Distribution of Interactions per User (Log Scale)",
            "Count of Users (Log Scale)",
        ),
    )
    plt.figure(figsize=(12, 6))
    for slot, extra_options, title, y_label in panels:
        plt.subplot(1, 2, slot)
        sns.histplot(per_user, bins=50, kde=True, **extra_options)
        plt.title(title)
        plt.xlabel("Number of Interactions")
        plt.ylabel(y_label)
    plt.tight_layout()
    plt.savefig("user_activity_distribution.png")
    plt.close()

    print("\nTop 10 Most Active Users:")
    print(per_user.head(10))

In [None]:
def analyze_book_popularity(df_interactions, df_books, df_reviews):
    """Summarise and plot how many reviews each book received.

    Popularity is counted from ``df_reviews["book_id"]``; ``df_interactions``
    is accepted but not read. Writes a two-panel histogram to
    ``book_popularity_distribution.png`` and prints the most reviewed books
    that can be found in ``df_books``.
    """
    print("\n--- 1.2 BOOK POPULARITY ANALYSIS ---")

    per_book = df_reviews["book_id"].value_counts()

    print("\nBook Interaction Statistics:")
    print(f"Total unique books with interactions: {len(per_book)}")
    print(per_book.describe())

    # (subplot slot, extra histplot options, title, y-axis label)
    panels = (
        (1, {}, "Distribution of Interactions per Book", "Count of Books"),
        (
            2,
            {"log_scale": (False, True)},
            "Distribution of Interactions per Book (Log Scale)",
            "Count of Books (Log Scale)",
        ),
    )
    plt.figure(figsize=(12, 6))
    for slot, extra_options, title, y_label in panels:
        plt.subplot(1, 2, slot)
        sns.histplot(per_book, bins=50, kde=True, **extra_options)
        plt.title(title)
        plt.xlabel("Number of Interactions")
        plt.ylabel(y_label)
    plt.tight_layout()
    plt.savefig("book_popularity_distribution.png")
    plt.close()

    # Attach titles/authors to the ten most reviewed book ids. A missing
    # column in df_books is reported rather than raised.
    try:
        top_ids = per_book.head(10).index.tolist()
        matches = df_books[df_books["book_id"].isin(top_ids)]
        ranked = (
            matches[["book_id", "title", "authors"]]
            .assign(
                interaction_count=lambda frame: frame["book_id"].apply(
                    lambda book_id: per_book.get(book_id, 0)
                )
            )
            .sort_values("interaction_count", ascending=False)
        )
        print("\nTop 10 Most Popular Books:")
        print(ranked.to_string())
    except KeyError as e:
        print(f"Could not identify popular books due to: {e}")
        print("Available columns in books dataset:", df_books.columns.tolist())

In [None]:
def analyze_rating_distribution(df_interactions, df_reviews):
    """Print summary statistics for ratings and plot their distribution.

    The ``rating`` column is taken from ``df_interactions`` when present there,
    otherwise from ``df_reviews``. When neither frame has one, a message is
    printed and nothing is plotted. The figure is written to
    ``rating_distribution.png``.
    """
    print("\n--- 1.3 RATING DISTRIBUTION ANALYSIS ---")

    # First frame that carries a rating column wins; interactions has priority.
    candidates = (("interactions", df_interactions), ("reviews", df_reviews))
    for rating_source, frame in candidates:
        if "rating" in frame.columns:
            ratings = frame["rating"]
            break
    else:
        print("No rating column found in either interactions or reviews.")
        return

    print(f"\nRating Statistics (from {rating_source}):")
    print(ratings.describe())

    plt.figure(figsize=(12, 6))

    # Left panel: histogram with a density estimate.
    plt.subplot(1, 2, 1)
    sns.histplot(ratings, bins=10, kde=True)
    plt.title("Distribution of Ratings")
    plt.xlabel("Rating")
    plt.ylabel("Count")

    # Right panel: one bar per distinct rating value, in ascending order.
    plt.subplot(1, 2, 2)
    counts_by_rating = ratings.value_counts().sort_index()
    sns.barplot(x=counts_by_rating.index, y=counts_by_rating.values)
    plt.title("Rating Distribution")
    plt.xlabel("Rating")
    plt.ylabel("Count")

    plt.tight_layout()
    plt.savefig("rating_distribution.png")
    plt.close()

In [None]:
def analyze_temporal_patterns(df_interactions, df_reviews):
    """Plot how many reviews were added per month and per year.

    Dates are read from ``df_reviews["date_added"]``; ``df_interactions`` is
    accepted but not read. Neither input frame is modified: the dates are
    parsed into a separate Series.

    Args:
        df_interactions: Unused; kept so all analysis functions share a shape.
        df_reviews: DataFrame expected to carry a ``date_added`` column.

    Side effects:
        Writes ``monthly_interactions.png`` and ``yearly_interactions.png`` to
        the working directory. Any failure during parsing or plotting is
        reported on stdout instead of being raised (best-effort analysis).
    """
    print("\n--- 1.4 TEMPORAL PATTERNS ANALYSIS ---")

    date_col = "date_added"
    if date_col not in df_reviews.columns:
        print("No date/time column found for temporal analysis.")
        return
    try:
        # utc=True makes values with differing UTC offsets parse into a single
        # datetime64 column (otherwise the .dt accessor is unavailable).
        # Unparseable values become NaT and are dropped.
        parsed = pd.to_datetime(df_reviews[date_col], errors="coerce", utc=True)
        # Drop the timezone afterwards so period conversion works on plain
        # timestamps; values that carried no offset keep their wall-clock time.
        timestamps = parsed.dropna().dt.tz_localize(None)
        if timestamps.empty:
            print("No valid dates found for temporal analysis.")
            return

        # Monthly trend: count per calendar month, in chronological order.
        monthly_counts = timestamps.dt.to_period("M").value_counts().sort_index()
        monthly_counts.index = monthly_counts.index.to_timestamp()

        plt.figure(figsize=(14, 6))
        plt.plot(
            monthly_counts.index,
            monthly_counts,
            marker="o",
            linestyle="-",
            alpha=0.7,
        )
        plt.title("Monthly Interactions with Fantasy Books")
        plt.xlabel("Date")
        plt.ylabel("Number of Interactions")
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("monthly_interactions.png")
        plt.close()

        # Yearly trend: pandas draws the bars onto the figure opened just above.
        yearly_counts = timestamps.dt.year.value_counts().sort_index()
        plt.figure(figsize=(12, 6))
        yearly_counts.plot(kind="bar")
        plt.title("Yearly Interactions with Fantasy Books")
        plt.xlabel("Year")
        plt.ylabel("Number of Interactions")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig("yearly_interactions.png")
        plt.close()
        print("\nTemporal analysis completed and saved as images.")
    except Exception as e:
        print(f"Error in temporal analysis: {e}")

In [None]:
def analyze_user_reading_patterns(df_reviews):
    """Report how many books each user reviewed per month and per year.

    Dates are read from ``df_reviews["date_added"]``. The input frame is not
    modified: dates are parsed into a separate Series and the grouping keys
    live on a private working copy.

    Args:
        df_reviews: DataFrame expected to carry ``user_id``, ``book_id`` and
            ``date_added`` columns.

    Side effects:
        Prints mean/median counts and writes
        ``books_per_month_distribution.png`` to the working directory. Any
        failure (including a missing column) is reported on stdout instead of
        being raised (best-effort analysis).
    """
    print("\n--- 2.1 USER READING PATTERNS ANALYSIS ---")

    date_col = "date_added"
    if date_col not in df_reviews.columns:
        print("No date/time column found for reading pattern analysis.")
        return
    try:
        # utc=True makes values with differing UTC offsets parse into a single
        # datetime64 column; unparseable values become NaT.
        parsed = pd.to_datetime(df_reviews[date_col], errors="coerce", utc=True)
        has_date = parsed.notna()
        # Drop the timezone so period conversion works on plain timestamps.
        stamps = parsed[has_date].dt.tz_localize(None)
        if stamps.empty:
            print("No valid dates found for reading pattern analysis.")
            return

        # assign() works on a copy, so df_reviews never gains these columns.
        dated = df_reviews.loc[has_date, ["user_id", "book_id"]].assign(
            year_month=stamps.dt.to_period("M"),
            year=stamps.dt.year,
        )

        # Monthly analysis
        books_per_user_per_month = (
            dated.groupby(["user_id", "year_month"])["book_id"]
            .count()
            .reset_index(name="books_count")
        )
        avg_books_per_month = books_per_user_per_month["books_count"].mean()
        median_books_per_month = books_per_user_per_month["books_count"].median()
        print(f"\nAverage books per user per month: {avg_books_per_month:.2f}")
        print(f"Median books per user per month: {median_books_per_month:.2f}")

        plt.figure(figsize=(12, 6))
        sns.histplot(books_per_user_per_month["books_count"], bins=30, kde=True)
        plt.axvline(
            avg_books_per_month,
            color="red",
            linestyle="--",
            label=f"Mean = {avg_books_per_month:.2f}",
        )
        plt.axvline(
            median_books_per_month,
            color="green",
            linestyle="-",
            label=f"Median = {median_books_per_month:.2f}",
        )
        plt.title("Distribution of Books Read per User per Month")
        plt.xlabel("Number of Books")
        plt.ylabel("Count")
        plt.legend()
        plt.savefig("books_per_month_distribution.png")
        plt.close()

        # Yearly analysis
        books_per_user_per_year = (
            dated.groupby(["user_id", "year"])["book_id"]
            .count()
            .reset_index(name="books_count")
        )
        avg_books_per_year = books_per_user_per_year["books_count"].mean()
        median_books_per_year = books_per_user_per_year["books_count"].median()
        print(f"\nAverage books per user per year: {avg_books_per_year:.2f}")
        print(f"Median books per user per year: {median_books_per_year:.2f}")
        print("\nReading patterns analysis completed and saved as images.")
    except Exception as e:
        print(f"Error in reading patterns analysis: {e}")

In [None]:
def analyze_user_rating_behavior(df_interactions, df_reviews):
    """Describe how individual users rate: volume, mean, spread and bias.

    Ratings come from ``df_interactions`` when it has both ``rating`` and
    ``user_id`` columns, otherwise from ``df_reviews`` under the same
    condition. Only users with at least five ratings enter the per-user
    statistics; the overall mean used for the bias is taken over every
    available rating.

    Args:
        df_interactions: Preferred source of ``user_id`` / ``rating`` pairs.
        df_reviews: Fallback source of ``user_id`` / ``rating`` pairs.

    Side effects:
        Prints the statistics and writes ``user_mean_ratings_distribution.png``
        and ``user_rating_std_distribution.png`` to the working directory.
        Failures during the analysis are reported on stdout, not raised.
    """
    print("\n--- 2.2 USER RATING BEHAVIOR ANALYSIS ---")

    # Only these two columns are used below. Requiring nothing else keeps the
    # analysis usable on frames that lack unrelated columns such as book_id.
    needed = ["user_id", "rating"]
    min_ratings = 5  # users with fewer ratings give unstable mean/std values

    if all(col in df_interactions.columns for col in needed):
        df_ratings = df_interactions[needed].dropna(subset=["rating"])
        rating_source = "interactions"
    elif all(col in df_reviews.columns for col in needed):
        df_ratings = df_reviews[needed].dropna(subset=["rating"])
        rating_source = "reviews"
    else:
        print("No rating data available for user rating behavior analysis.")
        return
    try:
        per_user = (
            df_ratings.groupby("user_id")["rating"]
            .agg(["count", "mean", "std", "min", "max"])
            .reset_index()
        )
        # .copy() so that adding rating_bias below writes to an independent
        # frame instead of a filtered view of per_user.
        user_rating_stats = per_user[per_user["count"] >= min_ratings].copy()
        print(f"\nUser Rating Behavior Statistics (from {rating_source}):")
        print(user_rating_stats.describe().to_string())

        # Plot distribution of mean ratings
        plt.figure(figsize=(12, 6))
        sns.histplot(user_rating_stats["mean"], bins=30, kde=True)
        plt.title("Distribution of User Average Ratings")
        plt.xlabel("Average Rating")
        plt.ylabel("Count of Users")
        plt.savefig("user_mean_ratings_distribution.png")
        plt.close()

        # Plot distribution of rating standard deviations
        plt.figure(figsize=(12, 6))
        sns.histplot(user_rating_stats["std"].dropna(), bins=30, kde=True)
        plt.title("Distribution of User Rating Standard Deviations")
        plt.xlabel("Standard Deviation")
        plt.ylabel("Count of Users")
        plt.savefig("user_rating_std_distribution.png")
        plt.close()

        # Bias = how far a user's own mean sits from the mean of all ratings.
        overall_mean_rating = df_ratings["rating"].mean()
        user_rating_stats["rating_bias"] = (
            user_rating_stats["mean"] - overall_mean_rating
        )
        report_cols = ["user_id", "count", "mean", "rating_bias"]
        print(f"\nOverall mean rating: {overall_mean_rating:.2f}")
        print(
            "\nTop 5 Users with Positive Rating Bias (rate books higher than average):"
        )
        print(user_rating_stats.nlargest(5, "rating_bias")[report_cols].to_string())
        print(
            "\nTop 5 Users with Negative Rating Bias (rate books lower than average):"
        )
        print(user_rating_stats.nsmallest(5, "rating_bias")[report_cols].to_string())
        print("\nUser rating behavior analysis completed and saved as images.")
    except Exception as e:
        print(f"Error in user rating behavior analysis: {e}")

In [None]:
def _collect_genre_labels(value):
    """Return the genre labels held in one cell of the genre column."""
    if isinstance(value, str):
        try:
            decoded = json.loads(value)
        except json.JSONDecodeError:
            # Plain text, not JSON: the string itself is the label.
            return [value]
        if not isinstance(decoded, list):
            return [value]
        value = decoded
    if isinstance(value, dict):
        return list(value.keys())
    if not isinstance(value, list):
        # NaN / None / other scalars carry no genre information.
        return []
    labels = []
    for item in value:
        if isinstance(item, dict):
            # Dicts are unhashable and cannot be counted directly.
            # NOTE(review): presumably shelf/tag records shaped like
            # {"name": ..., "count": ...} - confirm against the dataset.
            if "name" in item:
                labels.append(item["name"])
            else:
                labels.extend(item.keys())
        else:
            labels.append(item)
    return labels


def analyze_genre_distribution(df_books):
    """Count genre labels across books and plot the twenty most common.

    The first column whose name contains ``genre`` or ``categor`` is used. If
    there is none, the first column whose name contains ``tag``, ``shelf`` or
    ``shelves`` is used instead. Each cell may be a list, a dict (its keys are
    the labels), a JSON-encoded list, or a plain string.

    Args:
        df_books: Books DataFrame; it is only read.

    Side effects:
        Prints the top labels and writes ``genre_distribution.png`` to the
        working directory. Failures during the analysis are reported on
        stdout, not raised.
    """
    print("\n--- 3.1 GENRE DISTRIBUTION ANALYSIS ---")

    def first_column_containing(*fragments):
        # First column whose lower-cased name contains any of the fragments.
        for col in df_books.columns:
            name = str(col).lower()
            if any(fragment in name for fragment in fragments):
                return col
        return None

    genre_col = first_column_containing("genre", "categor")
    if genre_col is None:
        print("No genre information found in books dataset.")
        # Fall back to tags or shelves. Both "shelf" and "shelves" are listed
        # because the plural does not contain the singular as a substring.
        genre_col = first_column_containing("tag", "shelf", "shelves")
        if genre_col is None:
            print("No alternative genre information found.")
            return
    try:
        print(f"Using '{genre_col}' for genre analysis.")

        genre_counts = Counter()
        for cell in df_books[genre_col]:
            genre_counts.update(_collect_genre_labels(cell))

        if not genre_counts:
            print(f"No genre values found in '{genre_col}'.")
            return

        top_genres = genre_counts.most_common(20)
        print("\nTop 20 Genres:")
        for genre, count in top_genres:
            print(f"{genre}: {count}")

        # Plot genre distribution
        plt.figure(figsize=(14, 8))
        genre_df = pd.DataFrame(top_genres, columns=["Genre", "Count"])
        sns.barplot(x="Count", y="Genre", data=genre_df)
        plt.title("Top 20 Genres in Fantasy Books")
        plt.xlabel("Count")
        plt.ylabel("Genre")
        plt.tight_layout()
        plt.savefig("genre_distribution.png")
        plt.close()
        print("\nGenre distribution analysis completed and saved as image.")
    except Exception as e:
        print(f"Error in genre distribution analysis: {e}")

In [None]:
def analyze_publication_trends(df_books):
    """Describe publication years and, when available, ratings per decade.

    The year column is the first one whose name contains both ``year`` and
    ``pub``, or contains ``published``. Years are coerced to numbers and kept
    only when later than 1800 and not after the current year.

    Args:
        df_books: Books DataFrame. It is only read; the decade grouping is
            computed on a private frame, so no column is added to it.

    Side effects:
        Prints the statistics and writes ``publication_year_distribution.png``
        and, when ``average_rating`` exists, ``ratings_by_decade.png`` to the
        working directory. Failures during the analysis are reported on
        stdout, not raised.
    """
    print("\n--- 3.2 PUBLICATION YEAR TRENDS ---")

    pub_year_col = next(
        (
            col
            for col in df_books.columns
            if ("year" in col.lower() and "pub" in col.lower())
            or "published" in col.lower()
        ),
        None,
    )
    if pub_year_col is None:
        print("No publication year column found.")
        return
    try:
        # Non-numeric entries become NaN and fail both comparisons below.
        years = pd.to_numeric(df_books[pub_year_col], errors="coerce")
        is_valid = (years > 1800) & (years <= dt.datetime.now().year)
        valid_years = years[is_valid]
        print("\nPublication Year Statistics:")
        print(valid_years.describe())

        # Plot publication year distribution
        plt.figure(figsize=(14, 6))
        sns.histplot(valid_years, bins=30, kde=True)
        plt.title("Distribution of Publication Years")
        plt.xlabel("Publication Year")
        plt.ylabel("Count of Books")
        plt.savefig("publication_year_distribution.png")
        plt.close()

        if "average_rating" in df_books.columns:
            # Coerce ratings the same way as the years so that text values do
            # not break the mean. Rows are paired by position on a scratch
            # frame, leaving df_books untouched.
            ratings = pd.to_numeric(df_books["average_rating"], errors="coerce")
            by_decade = pd.DataFrame(
                {
                    "publication_decade": ((valid_years // 10) * 10)
                    .astype(int)
                    .to_numpy(),
                    "average_rating": ratings[is_valid].to_numpy(),
                }
            )
            decade_ratings = by_decade.groupby("publication_decade")[
                "average_rating"
            ].mean()
            print("\nAverage Rating by Publication Decade:")
            print(decade_ratings)

            plt.figure(figsize=(12, 6))
            decade_ratings.plot(kind="bar")
            plt.title("Average Rating by Publication Decade")
            plt.xlabel("Publication Decade")
            plt.ylabel("Average Rating")
            plt.tight_layout()
            plt.savefig("ratings_by_decade.png")
            plt.close()
        print("\nPublication trends analysis completed and saved as images.")
    except Exception as e:
        print(f"Error in publication trends analysis: {e}")

## 1. Explore the Data Structure


In [None]:
# Read a small sample of each file (load_sample_data defaults to 1,000 lines)
# and print its columns and first rows, then load the larger working frames.
df_books_sample, df_reviews_sample, df_interactions_sample = load_sample_data()
explore_data_structure(df_books_sample, df_reviews_sample, df_interactions_sample)
df_books, df_reviews, df_interactions = load_larger_data()




## 2. Exploratory Data Analysis

In [None]:
# NOTE(review): these three analyses run on the *_sample frames, not on the
# larger df_reviews / df_interactions frames returned by load_larger_data() -
# confirm that the small sample is intended here.
analyze_user_activity(df_interactions_sample, df_reviews_sample)
analyze_book_popularity(df_interactions_sample, df_books_sample, df_reviews_sample)
analyze_rating_distribution(df_interactions_sample, df_reviews_sample)

## 3. Book Characteristics Analysis

In [None]:
# Book-level analyses use df_books from load_larger_data(), whose books_max
# default is None (no line limit).
analyze_genre_distribution(df_books)
analyze_publication_trends(df_books)

## User Behaviour

In [None]:
def main():
    """Run the complete exploratory analysis from loading to the last plot.

    Creates a ``plots`` directory when none exists, prints the structure of a
    small sample of each file, loads the larger frames and runs every analysis
    function in section order. Any exception is printed and then re-raised.
    """
    print("=== GOODREADS FANTASY BOOKS EXPLORATORY DATA ANALYSIS ===")

    # NOTE(review): the analysis functions in this file call plt.savefig with
    # bare file names, so their images land in the working directory rather
    # than in this folder - confirm where the plots are meant to go.
    output_dir = "plots"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        # Small samples first, purely to display the schema of each file.
        sample_frames = load_sample_data()
        explore_data_structure(*sample_frames)

        # Larger frames feed every analysis below.
        books, reviews, interactions = load_larger_data()

        # 1. Basic Statistics and Distributions
        analyze_user_activity(interactions, reviews)
        analyze_book_popularity(interactions, books, reviews)
        analyze_rating_distribution(interactions, reviews)
        analyze_temporal_patterns(interactions, reviews)

        # 2. User Behavior Analysis
        analyze_user_reading_patterns(reviews)
        analyze_user_rating_behavior(interactions, reviews)

        # 3. Book Characteristics Analysis
        analyze_genre_distribution(books)
        analyze_publication_trends(books)

        print("\n=== ANALYSIS COMPLETE ===")
        print("All visualizations have been saved as PNG files.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
        raise