In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Minimum number of reviews a title must have to stay in the analysis set.
MIN_REVIEWS = 648

# --- Load -------------------------------------------------------------------
df = pd.read_csv("kindle_data.csv")

# Strip whitespace from the column names (per the original note, editing the
# CSV with the Rainbow extension introduced stray whitespace).
df.columns = df.columns.str.strip()

# --- Clean ------------------------------------------------------------------
# Drop the URL columns; `df` itself stays untouched so the raw data remains
# available for re-running this cell.
df_new = df.drop(columns=["imgUrl", "productURL"])

# Turn literal "NaN" strings into real missing values so that isnull()/dropna()
# recognise them.
# NOTE(review): read_csv's default na_values presumably already covers "NaN";
# the replace is kept as a safeguard - confirm whether it is still needed.
df_new = df_new.replace("NaN", np.nan)

# Missing values per column. This is checked on the cleaned frame (df_new), not
# on the raw `df`, so the counts reflect the replacement above and describe the
# rows that dropna() is about to remove.
print(df_new.isnull().sum())

# Remove every row that still contains a missing value (no inplace mutation,
# so the cell gives the same result when re-run).
df_new = df_new.dropna()

# --- Duplicate titles -------------------------------------------------------
# Count how often each title occurs; titles occurring more than once are duplicates.
title_counts = df_new["title"].value_counts()
num_duplicates = int((title_counts > 1).sum())

print(title_counts)

print(f"\nEs sind {num_duplicates} doppelte Titel enthalten")

# Keep only the first row for each title.
df_new_unique = df_new.drop_duplicates(subset=["title"])

# --- Filter -----------------------------------------------------------------
# Keep titles with at least MIN_REVIEWS reviews.
df_new_unique = df_new_unique[df_new_unique["reviews"] >= MIN_REVIEWS]

# Display the result ordered by review count, most-reviewed first
# (display only; df_new_unique itself is not re-ordered).
df_new_unique.sort_values("reviews", ascending=False)

In [None]:
# The 20 authors with the most titles in the filtered data, most frequent first.
top_authors = df_new_unique["author"].value_counts().head(20).index

# Restrict the data to the rows written by one of those authors.
top_author_rows = df_new_unique[df_new_unique["author"].isin(top_authors)]

# Tall figure so that all 20 author labels fit on the y-axis.
plt.figure(figsize=(10, 15))
sns.countplot(
    data=top_author_rows,
    y="author",
    order=top_authors,  # draw the bars in ranking order
)

plt.title("Top 20 Authors by Number of Bestsellers")
plt.xlabel("Number of Bestsellers")
plt.ylabel("Author")

plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px

# Number of titles per author as a two-column frame; value_counts() returns
# the authors ordered from most to fewest titles.
author_counts = df_new_unique["author"].value_counts().reset_index()
author_counts.columns = ["Author", "Number of Bestsellers"]

# Horizontal bar chart of the first 100 authors in that ranking.
top_100_authors = author_counts.head(100)
fig = px.bar(
    top_100_authors,
    x="Number of Bestsellers",
    y="Author",
    orientation="h",
    title="Top Authors by Number of Bestsellers",
)

# Order the author axis by bar total and use a fixed figure height of 800 px.
layout_options = {
    "yaxis": {"categoryorder": "total ascending"},
    "height": 800,
}
fig.update_layout(**layout_options)

fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Titles per author, most frequent first, as a frame with the columns
# "Author" and "Number of Bestsellers".
author_counts = (
    df_new_unique["author"]
    .value_counts()
    .rename_axis("Author")
    .reset_index(name="Number of Bestsellers")
)

# Pagination state: position of the first row of the current page and the
# number of authors per page. paginate() reads and updates these globals.
start_index = 0
page_size = 100

def plot_page(start_index=0, page_size=100):
    """Plot one page of the author ranking as a horizontal bar chart.

    Reads the module-level ``author_counts`` frame (columns ``Author`` and
    ``Number of Bestsellers``) and shows the rows
    ``start_index`` to ``start_index + page_size - 1`` of it.

    Parameters
    ----------
    start_index : int
        Zero-based position of the first row to show.
    page_size : int
        Maximum number of rows (authors) on the page.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``. If the requested page
        contains no rows, a message is printed and nothing is plotted.
    """
    total_authors = len(author_counts)

    # Rows belonging to the requested page (iloc clips at the end of the frame).
    author_subset = author_counts.iloc[start_index:start_index + page_size]

    # An out-of-range page would otherwise yield an empty chart whose title
    # claims a nonsensical range such as "Showing 301 to 250".
    if author_subset.empty:
        print(f"No authors to show from position {start_index + 1} (only {total_authors} available).")
        return

    first_shown = start_index + 1
    last_shown = min(start_index + page_size, total_authors)

    fig = px.bar(author_subset,
                 y='Author',
                 x='Number of Bestsellers',
                 orientation='h',
                 title=f'Top Authors by Number of Bestsellers (Showing {first_shown} to {last_shown})')

    fig.update_layout(
        # 'total ascending' alone puts the largest bar at the top of a
        # horizontal chart. Combining it with autorange='reversed' flips the
        # axis again and pushes the top-ranked author to the bottom.
        yaxis={'categoryorder': 'total ascending'},
        height=800  # fixed figure height in pixels
    )

    fig.show()

def paginate(action):
    """Move the paginated author chart one page forward or back and redraw it.

    Updates the module-level ``start_index`` (using the module-level
    ``page_size`` and ``author_counts``) and then calls ``plot_page``.

    Parameters
    ----------
    action : str
        ``'next'`` to advance by one page, ``'prev'`` to go back by one page.

    Raises
    ------
    ValueError
        If ``action`` is neither ``'next'`` nor ``'prev'``. Previously such a
        value was reported as "No more pages in this direction.", which hid
        typos in the argument.
    """
    global start_index

    if action not in ('next', 'prev'):
        raise ValueError(f"action must be 'next' or 'prev', got {action!r}")

    if action == 'next' and start_index + page_size < len(author_counts):
        start_index += page_size
    elif action == 'prev' and start_index > 0:
        # Clamp at 0 so the first page stays reachable even when start_index
        # is not a multiple of page_size.
        start_index = max(start_index - page_size, 0)
    else:
        print("No more pages in this direction.")

    # Redraw; at a boundary this re-displays the current page.
    plot_page(start_index, page_size)

# Draw the page that starts at the current start_index (the first page on a fresh run).
plot_page(start_index, page_size)


In [None]:
# Advance by one page (page_size authors) and redraw the chart.
paginate(action='next')



### 1. **Pagination**
   - **Concept**: Display data in smaller chunks (pages) rather than all at once. Users can navigate through these pages.
   - **Implementation**: The `plot_page` and `paginate` functions defined above show how to implement pagination with Plotly in Python.
   - **Pros**: Allows users to interact with large datasets without overwhelming the viewer or system resources.
   - **Cons**: Not suitable for an overview; it's more for detailed, segmented analysis.

### 2. **Zooming and Panning**
   - **Concept**: Allow users to zoom into specific areas of the graph and pan across the data. Most modern libraries, like Plotly and Bokeh, support interactive zooming and panning out of the box.
   - **Implementation**: Plotly figures (including those created with `plotly.express`) support zooming and panning by default. For matplotlib, switch to an interactive backend in Jupyter (e.g. `%matplotlib widget`, or `%matplotlib notebook` in the classic Notebook).
   - **Pros**: Gives the ability to explore detailed sections without losing context of the whole dataset.
   - **Cons**: May still be slow to render initially if the dataset is extremely large.

### 3. **Filtering**
   - **Concept**: Allow users to filter data based on certain criteria before displaying it. For example, you might want to display only authors with more than a certain number of bestsellers.
   - **Implementation**: Add dropdowns or sliders to filter data dynamically. This can be done with Plotly's built-in dropdown and slider controls, `ipywidgets`, Dash, or Streamlit.
   - **Pros**: Helps in focusing on relevant data and reduces clutter.
   - **Cons**: Requires some interactive control, so it might not be as suitable for static presentations.

### 4. **Aggregation and Binning**
   - **Concept**: Aggregate or bin data into summarized categories. For example, group authors by letter or count them into ranges.
   - **Implementation**: Use pandas’ `groupby` function to aggregate data before plotting or use histograms and heatmaps to display aggregated data.
   - **Pros**: Reduces data size while preserving key trends and patterns.
   - **Cons**: You lose granular detail, and it might not be suitable for situations requiring precise information.

### 5. **Sampling**
   - **Concept**: Select a representative subset of data points instead of plotting the entire dataset.
   - **Implementation**: Use pandas’ `.sample()` method to extract a random sample of your data.
   - **Pros**: Reduces rendering time and complexity while still representing the general pattern.
   - **Cons**: Important outliers might be missed if not carefully sampled.

### 6. **Chunk Processing**
   - **Concept**: Process and visualize the data in chunks. You only load and visualize a manageable portion of the data at a time.
   - **Implementation**: Load data from disk in smaller chunks using libraries like Dask or by processing data in SQL queries with `LIMIT`.
   - **Pros**: Handles very large datasets that don’t fit into memory.
   - **Cons**: Increases complexity in handling data and might miss overall trends if not implemented carefully.

### 7. **Hierarchical Visualization (Drill-down Approach)**
   - **Concept**: Start with a high-level aggregated view (e.g., country-level data) and then allow users to drill down to more detailed levels (e.g., city, street).
   - **Implementation**: Tools like Plotly, Tableau, or PowerBI support hierarchical visualization natively.
   - **Pros**: Provides a multi-layered approach to explore data at different granularities.
   - **Cons**: Requires some setup and configuration; less effective in purely static environments.

### 8. **Using Interactive Visualization Libraries**
   - **Concept**: Use advanced visualization libraries that handle big data more efficiently with interactivity, such as:
     - **Plotly**: Great for interactive, zoomable, and filterable plots.
     - **Bokeh**: Supports interactive visualization with more control over details.
     - **Dash**: Build interactive web applications with custom filtering.
     - **Datashader**: Specifically designed for rendering large datasets quickly, capable of visualizing millions of points.
   - **Pros**: Rich interactivity and flexibility.
   - **Cons**: Requires familiarity with the library and sometimes more complex code.

### 9. **Heatmaps or Density Plots**
   - **Concept**: Use heatmaps or density plots to visualize large amounts of data points, where color intensity represents data density.
   - **Implementation**: Use `sns.heatmap()` in Seaborn or `px.density_heatmap()` in Plotly.
   - **Pros**: Effective for showing patterns and trends in large datasets.
   - **Cons**: Aggregates data into bins, so you lose individual data point precision.

### 10. **Scatter Plot with Alpha Blending**
   - **Concept**: Reduce the opacity (alpha) of points in a scatter plot so overlapping areas appear darker, allowing you to spot high-density regions.
   - **Implementation**: Set `alpha` parameter in matplotlib/Seaborn or `opacity` in Plotly.
   - **Pros**: Shows density of data points while still allowing all data to be plotted.
   - **Cons**: Can become hard to read if the dataset is extremely dense.

### 11. **Lazy Loading**
   - **Concept**: Load data incrementally as the user interacts with the visualization. Data is fetched and rendered only as needed.
   - **Implementation**: Create a web-based dashboard with libraries like Dash or Streamlit, which support data fetching in parts.
   - **Pros**: Efficiently manages large datasets without overwhelming memory or CPU resources.
   - **Cons**: Requires interactive frameworks and can be more complex to implement.

### 12. **Cluster Sampling and Visualization**
   - **Concept**: Use clustering algorithms (e.g., KMeans) to group similar data points and visualize these clusters instead of individual points.
   - **Implementation**: Use scikit-learn for clustering, then visualize with your preferred library.
   - **Pros**: Reduces the dataset to manageable clusters while showing overall trends.
   - **Cons**: Requires additional data preprocessing.

### 13. **Parallel Processing for Rendering**
   - **Concept**: Use parallel processing to split the workload of plotting across multiple CPU cores.
   - **Implementation**: Utilize libraries like Dask or `multiprocessing` to process data in parallel.
   - **Pros**: Handles very large datasets more efficiently.
   - **Cons**: Increased complexity and requires multi-core support.

### 14. **Streaming Visualization**
   - **Concept**: Stream data incrementally into your visualization so that it updates in real-time as new data arrives.
   - **Implementation**: Use frameworks like Dash, Bokeh with streaming support, or Plotly’s `extendData`.
   - **Pros**: Ideal for real-time data visualization.
   - **Cons**: Requires an environment that supports real-time data feeds.

### 15. **Data Reduction Techniques**
   - **Concept**: Use data reduction methods like Principal Component Analysis (PCA) to reduce the dimensionality or complexity of the data.
   - **Implementation**: Use scikit-learn’s PCA or t-SNE for dimensionality reduction, then visualize.
   - **Pros**: Retains key information while reducing data size.
   - **Cons**: Can be complex to interpret reduced dimensions.

### Summary Table

| Technique                  | Suitable For                   | Pros                                   | Cons                                   |
|----------------------------|--------------------------------|---------------------------------------|---------------------------------------|
| Pagination                 | Very large datasets            | Easy to navigate small chunks         | Not suitable for overall trends       |
| Zooming & Panning          | All data sizes                 | Interactive and detailed              | Requires user interaction             |
| Filtering                  | Data with many dimensions      | Focus on specific data                | Interaction required                  |
| Aggregation/Binning        | High-volume data points        | Simplifies data, shows trends         | Loss of detail                        |
| Sampling                   | Extremely large datasets       | Reduces rendering time                | May miss important points             |
| Chunk Processing           | Out-of-memory datasets         | Handles large data                    | Increased complexity                  |
| Hierarchical Visualization | Multi-level data               | Different granular views              | Configuration required                |
| Interactive Libraries      | Real-time & large datasets     | High interactivity                    | Learning curve                        |
| Heatmaps/Density Plots     | Large datasets                 | Shows density patterns                | Loss of point-level detail            |
| Alpha Blending             | Overlapping data               | Reveals density naturally             | Can be unclear in dense regions       |
| Lazy Loading               | Large datasets                 | Efficient data handling               | Requires interactive setup            |
| Clustering                 | Data with natural groups       | Shows trends with reduced data        | Needs preprocessing                   |
| Parallel Processing        | Extremely large data           | Faster processing                     | More complex setup                    |
| Streaming Visualization    | Real-time data                 | Up-to-date insights                   | Complex to implement                  |
| Data Reduction             | High-dimensional data          | Simplifies data visualization         | Can lose interpretability             |

### Choosing the Right Approach
- For **overview** insights: Aggregation, binning, and density plots work best.
- For **detailed analysis**: Zooming/panning, pagination, and filtering are ideal.
- For **interactive dashboards**: Use libraries like Plotly, Dash, or Bokeh combined with filtering and lazy loading.
