In [None]:
# Sample dataset, written column-by-column and zipped into one tuple per
# product: (ID, Product, Category, Price, Quantity).
data = list(zip(
    (1, 2, 3, 4, 5),
    ("Product A", "Product B", "Product C", "Product D", "Product E"),
    ("Category 1", "Category 1", "Category 2", "Category 2", "Category 3"),
    (100, 200, 150, 300, 250),
    (50, 80, 60, 90, 70),
))

# Build the PySpark DataFrame that the chart cells below read from.
# NOTE(review): assumes a SparkSession named `spark` already exists in the kernel — confirm.
df = spark.createDataFrame(data, schema=["ID", "Product", "Category", "Price", "Quantity"])
df.show()


In [None]:
Bar Chart:

In [None]:
import matplotlib.pyplot as plt

# Sum Price per product in Spark, then pull the small aggregate into pandas.
sales_by_product = df.groupBy("Product").agg({"Price": "sum"}).toPandas()

# One bar per product; the bar height is that product's summed price.
plt.bar(x=sales_by_product["Product"], height=sales_by_product["sum(Price)"])
plt.title("Bar Chart: Total Sales by Product")
plt.ylabel("Total Sales")
plt.xlabel("Product")
plt.xticks(rotation=45)  # tilt the product names
plt.show()


In [None]:
Pie Chart:

In [None]:
import matplotlib.pyplot as plt

# Sum Price per category in Spark and convert the aggregate to pandas.
sales_by_category = df.groupBy("Category").agg({"Price": "sum"}).toPandas()

# One wedge per category, annotated with its percentage share.
plt.pie(
    x=sales_by_category["sum(Price)"],
    labels=sales_by_category["Category"],
    autopct="%1.1f%%",
)
plt.title("Pie Chart: Total Sales by Category")
plt.show()


In [None]:
Box Plot:

In [None]:
import seaborn as sns

# Box plot of Price grouped by Category, drawn from a pandas copy of the
# Spark DataFrame.
sns.boxplot(x="Category", y="Price", data=df.toPandas())
plt.title("Box Plot: Price by Category")
plt.ylabel("Price")
plt.xlabel("Category")
plt.show()


In [None]:
Scatter Plot:

In [None]:
import matplotlib.pyplot as plt

# Plotting a scatter plot of price versus quantity.
# Both columns are collected in ONE Spark job so that each price stays paired
# with the quantity from the same row (two independent collects run two jobs
# and rely on both returning rows in the same order).
price_quantity_rows = df.select("Price", "Quantity").collect()
price_values = [row["Price"] for row in price_quantity_rows]
quantity_values = [row["Quantity"] for row in price_quantity_rows]

plt.scatter(price_values, quantity_values)
plt.xlabel("Price")
plt.ylabel("Quantity")
plt.title("Scatter Plot: Price vs Quantity")
plt.show()


In [None]:
Line Plot:

In [None]:
import matplotlib.pyplot as plt

# Sorting the DataFrame by ID
sorted_df = df.sort("ID")

# Line plot of price by ID.
# Collect ID and Price together so the sort runs once and every ID is paired
# with the price from its own row.
id_price_rows = sorted_df.select("ID", "Price").collect()
id_values = [row["ID"] for row in id_price_rows]
sales_values = [row["Price"] for row in id_price_rows]

plt.plot(id_values, sales_values)
plt.xlabel("ID")
plt.ylabel("Total Sales")
plt.title("Line Plot: Total Sales by ID")
plt.show()


In [None]:
# Area chart

import matplotlib.pyplot as plt

# Calculating the total sales for each product
sales_by_product = df.groupBy("Product").sum("Price").toPandas()

# Plotting an area chart of total sales by product.
# stackplot draws one filled layer per y-series and takes one label per layer.
# There is a single series here, so it gets a single label; passing the five
# product names would caption the whole area with only the first of them.
plt.stackplot(
    sales_by_product["Product"],
    sales_by_product["sum(Price)"],
    labels=["Total Sales"],
)
plt.xlabel("Product")
plt.ylabel("Total Sales")
plt.title("Area Chart: Total Sales by Product")
plt.legend(loc="upper left")
plt.xticks(rotation=45)
plt.show()


In [None]:
Histogram:

In [None]:
import matplotlib.pyplot as plt

# Gather every Price value on the driver as a plain Python list.
price_values = [row["Price"] for row in df.select("Price").collect()]

# Histogram of the prices, split into 10 bins.
plt.hist(x=price_values, bins=10)
plt.title("Histogram: Price Distribution")
plt.ylabel("Frequency")
plt.xlabel("Price")
plt.show()


In [None]:
Heatmap:

In [None]:
import seaborn as sns

# Calculating the correlation matrix.
# Only numeric columns can be correlated: the frame also holds the string
# columns Product and Category, and recent pandas versions raise when .corr()
# meets non-numeric data, so those columns are dropped explicitly first.
correlation_matrix = df.toPandas().select_dtypes(include="number").corr()

# Plotting a heatmap of the correlation matrix (cell values annotated)
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Heatmap: Correlation Matrix")
plt.show()


In [None]:
Violin Plot:

In [None]:
import seaborn as sns

# Violin plot of Price grouped by Category, drawn from a pandas copy of the
# Spark DataFrame.
sns.violinplot(x="Category", y="Price", data=df.toPandas())
plt.title("Violin Plot: Price by Category")
plt.ylabel("Price")
plt.xlabel("Category")
plt.show()


In [None]:
Radar Chart:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean Price and mean Quantity per category, computed in Spark.
avg_price_quantity = df.groupBy("Category").avg("Price", "Quantity").toPandas()

# Series that feed the radar chart.
categories = avg_price_quantity["Category"]
price_values = avg_price_quantity["avg(Price)"]
quantity_values = avg_price_quantity["avg(Quantity)"]

# One spoke per category, evenly spaced around the circle. The first angle and
# the first value of each series are repeated at the end so each outline closes.
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
angles = angles + angles[:1]
price_loop = price_values.tolist() + price_values.tolist()[:1]
quantity_loop = quantity_values.tolist() + quantity_values.tolist()[:1]

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True})
# Outlines first, then translucent fills.
ax.plot(angles, price_loop, label="Average Price")
ax.plot(angles, quantity_loop, label="Average Quantity")
ax.fill(angles, price_loop, alpha=0.25)
ax.fill(angles, quantity_loop, alpha=0.25)
# Tick every spoke except the duplicated closing angle.
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
plt.title("Radar Chart: Average Price and Quantity by Category")
plt.legend(loc="upper right")
plt.show()


In [None]:
Stacked Bar Chart:

In [None]:
import matplotlib.pyplot as plt

# Sum Price for every (Category, Product) pair in Spark.
sales_by_category_product = (
    df.groupBy("Category", "Product")
    .sum("Price")
    .toPandas()
)

# Reshape to one row per category and one column per product so pandas can
# stack the product segments inside each category bar.
pivot_df = sales_by_category_product.pivot(
    index="Category",
    columns="Product",
    values="sum(Price)",
)

# Stacked bar chart drawn through the pandas plotting accessor.
pivot_df.plot.bar(stacked=True)
plt.title("Stacked Bar Chart: Total Sales by Category and Product")
plt.ylabel("Total Sales")
plt.xlabel("Category")
plt.xticks(rotation=45)
plt.show()


In [None]:
Scatter Matrix Plot:

In [None]:
import pandas as pd
import seaborn as sns

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.select("Price", "Quantity").toPandas()

# Plotting a scatter matrix plot.
# pairplot builds a grid of subplots, so the heading is set at figure level
# with suptitle; plt.title would only title whichever subplot is current.
# y > 1 lifts the heading clear of the top row.
sns.pairplot(pandas_df)
plt.suptitle("Scatter Matrix Plot: Price and Quantity", y=1.02)
plt.show()


In [None]:
Word Cloud:

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Collect the product names and join them into one space-separated string,
# which is the text the word cloud is generated from.
product_names = [row["Product"] for row in df.select("Product").collect()]
all_products = " ".join(product_names)

# Render the word cloud image from that text.
wordcloud = WordCloud(
    background_color="white",
    width=800,
    height=400,
).generate(all_products)

# Show the image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word Cloud: Product Names")
plt.axis("off")
plt.show()


In [None]:
Treemap:

In [None]:
import squarify
import matplotlib.pyplot as plt

# Sum Price per product in Spark, then order the aggregate largest-first.
sales_by_product = df.groupBy("Product").agg({"Price": "sum"}).toPandas()
sales_by_product = sales_by_product.sort_values("sum(Price)", ascending=False)

# Treemap: rectangle area follows each product's summed price.
squarify.plot(
    sizes=sales_by_product["sum(Price)"],
    label=sales_by_product["Product"],
    alpha=0.8,
)
plt.title("Treemap: Total Sales by Product")
plt.axis("off")
plt.show()


In [None]:
3D Scatter Plot:

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Extracting the X, Y, and Z coordinates.
# All three columns are collected in a single Spark job so every point's
# coordinates come from the same row (instead of three jobs whose results
# are matched up by position).
point_rows = df.select("ID", "Price", "Quantity").collect()
x_values = [row["ID"] for row in point_rows]
y_values = [row["Price"] for row in point_rows]
z_values = [row["Quantity"] for row in point_rows]

# Creating a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_values, y_values, z_values)
ax.set_xlabel("ID")
ax.set_ylabel("Price")
ax.set_zlabel("Quantity")
ax.set_title("3D Scatter Plot: Price vs Quantity vs ID")
plt.show()


In [None]:
Parallel Coordinates Plot:

In [None]:
import pandas as pd
from pandas.plotting import parallel_coordinates

# Bring the class column (Product) and the two numeric features into pandas.
pandas_df = df.select("Product", "Price", "Quantity").toPandas()

# One line per product across the Price and Quantity axes, coloured from the
# "Set1" colormap.
plt.figure(figsize=(8, 5))
parallel_coordinates(
    frame=pandas_df,
    class_column="Product",
    colormap=plt.get_cmap("Set1"),
)
plt.title("Parallel Coordinates Plot: Product, Price, Quantity")
plt.ylabel("Values")
plt.xlabel("Features")
plt.xticks(rotation=45)
plt.legend(loc="upper right")
plt.show()


In [None]:
Bubble Chart:

In [None]:
import matplotlib.pyplot as plt

# Extracting the X, Y, and size values.
# One collect keeps each bubble's position and size tied to the same row and
# runs a single Spark job instead of three.
bubble_rows = df.select("ID", "Price", "Quantity").collect()
x_values = [row["ID"] for row in bubble_rows]
y_values = [row["Price"] for row in bubble_rows]
size_values = [row["Quantity"] for row in bubble_rows]

# Plotting a bubble chart (marker area is driven by Quantity)
plt.scatter(x_values, y_values, s=size_values, alpha=0.5)
plt.xlabel("ID")
plt.ylabel("Price")
plt.title("Bubble Chart: Price vs ID with Quantity as Size")
plt.show()


In [None]:
Polar Chart:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Aggregating the total sales by category
sales_by_category = df.groupBy("Category").sum("Price").toPandas()

# Creating data for the polar chart.
# Plain lists are used on purpose: `+` on a pandas Series is index-aligned
# arithmetic, not concatenation, so closing the loop on a Series would yield
# NaNs and a length that no longer matches `angles`.
categories = sales_by_category["Category"].tolist()
sales_values = sales_by_category["sum(Price)"].tolist()

# Plotting a polar chart: one evenly spaced angle per category, with the first
# point repeated at the end so the outline closes.
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
sales_values = sales_values + sales_values[:1]
angles = angles + angles[:1]
plt.polar(angles, sales_values)
# Label every spoke except the duplicated closing angle.
plt.xticks(angles[:-1], categories)
plt.title("Polar Chart: Total Sales by Category")
plt.show()


In [None]:
Donut Chart:

In [None]:
import matplotlib.pyplot as plt

# Sum Price per category in Spark and convert the aggregate to pandas.
sales_by_category = df.groupBy("Category").agg({"Price": "sum"}).toPandas()

# A pie whose wedges are only 0.4 wide, which leaves the hollow centre that
# makes it a donut.
plt.pie(
    x=sales_by_category["sum(Price)"],
    labels=sales_by_category["Category"],
    autopct="%1.1f%%",
    wedgeprops={"width": 0.4},
)
plt.title("Donut Chart: Total Sales by Category")
plt.show()


In [None]:
Streamgraph:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Sum Price per category in Spark, then order the rows by category name.
pandas_df = df.groupBy("Category").sum("Price").toPandas()
pandas_df = pandas_df.sort_values("Category")

# Everything except the Category column is a value series; transposing gives
# stackplot one row per series, and the column names become the legend labels.
value_columns = pandas_df.drop(columns="Category")
plt.stackplot(pandas_df["Category"], value_columns.T, labels=value_columns.columns)
plt.title("Streamgraph: Total Sales by Category")
plt.ylabel("Total Sales")
plt.xlabel("Category")
plt.legend(loc="upper left")
plt.xticks(rotation=45)
plt.show()


In [None]:
Visually Interactive Charts

In [None]:
Interactive Line Chart:

In [None]:
import plotly.express as px

# Only ID and Price are needed, so just those two columns go to pandas.
pandas_df = df.select("ID", "Price").toPandas()

# Plotly Express line chart of Price against ID.
fig = px.line(
    data_frame=pandas_df,
    x="ID",
    y="Price",
    title="Interactive Line Chart: Price by ID",
)
fig.show()


In [None]:
Interactive Scatter Plot with Tooltips:

In [None]:
import plotly.express as px

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Price vs Quantity scatter; Product is added to each point's hover tooltip.
fig = px.scatter(
    data_frame=pandas_df,
    x="Price",
    y="Quantity",
    hover_data=["Product"],
    title="Interactive Scatter Plot with Tooltips",
)
fig.show()


In [None]:
Interactive Bar Chart with Dropdown:

In [None]:
import plotly.express as px

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Price per product, coloured by category. No dropdown control is configured
# in this cell; the interactivity is Plotly's built-in legend and hover.
fig = px.bar(
    data_frame=pandas_df,
    x="Product",
    y="Price",
    color="Category",
    title="Interactive Bar Chart with Dropdown",
)
# Order the x categories by their total bar height, largest first.
fig.update_layout(xaxis_categoryorder="total descending")
fig.show()


In [None]:
Interactive Sunburst Chart:

In [None]:
import plotly.express as px

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Two-level sunburst: categories on the inner ring, their products on the
# outer ring, sector size taken from Price.
fig = px.sunburst(
    data_frame=pandas_df,
    path=["Category", "Product"],
    values="Price",
    title="Interactive Sunburst Chart: Price by Category and Product",
)
fig.show()


In [None]:
Interactive Heatmap with Slider:

In [None]:
import plotly.express as px

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Category x Product grid of prices; combinations absent from the data are NaN.
price_grid = pandas_df.pivot(index="Category", columns="Product", values="Price")

# Creating an interactive heatmap.
# The axis labels must come from the pivoted grid itself: the raw Product and
# Category columns have one entry per input row, so their length and order do
# not match the grid's columns and rows.
fig = px.imshow(
    price_grid.values,
    x=price_grid.columns.tolist(),
    y=price_grid.index.tolist(),
    title="Interactive Heatmap with Slider: Price by Category and Product",
)
fig.update_layout(xaxis={'categoryorder':'total descending'}, yaxis={'categoryorder':'total ascending'})
fig.update_xaxes(side="top")
# px.imshow colours the heatmap through the layout's shared coloraxis, so the
# colorbar size and position are set there rather than on the trace.
fig.update_coloraxes(colorbar=dict(len=0.4, y=0.8))
fig.show()


In [None]:
Interactive Pie Chart:

In [None]:
import plotly.express as px

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Pie chart keyed on Category only; no `values` column is passed.
fig = px.pie(
    data_frame=pandas_df,
    names="Category",
    title="Interactive Pie Chart: Distribution by Category",
)
fig.show()


In [None]:
Choropleth Map:

In [None]:
import plotly.express as px

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Creating a choropleth map.
# The map needs a "Country" column for its locations. The sample dataset built
# in this notebook (ID, Product, Category, Price, Quantity) has none, so the
# chart is skipped with a message instead of raising on the missing column.
# NOTE(review): no locationmode is passed, so when a Country column is supplied
# Plotly's default location format applies — confirm the values match it.
if "Country" in pandas_df.columns:
    fig = px.choropleth(pandas_df, locations="Country", color="Price", title="Choropleth Map: Price by Country")
    fig.show()
else:
    print("Skipping choropleth map: the dataset has no 'Country' column to use as locations.")


In [None]:
Box Plot:

In [None]:
import plotly.express as px

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Plotly Express box plot: one box of Price values per Category.
fig = px.box(
    data_frame=pandas_df,
    x="Category",
    y="Price",
    title="Box Plot: Price Distribution by Category",
)
fig.show()


In [None]:
Tree Diagram:

In [None]:
import plotly.graph_objects as go

# Creating a tree diagram (a flat Plotly treemap: every product sits directly
# under the root, sized by its price).
# Product and Price are collected together in one Spark job so each label is
# paired with the value from its own row, and the product list is not
# collected a second time just to count it.
product_price_rows = df.select("Product", "Price").collect()
fig = go.Figure(go.Treemap(
    labels=[row["Product"] for row in product_price_rows],
    parents=[''] * len(product_price_rows),  # '' marks a top-level node
    values=[row["Price"] for row in product_price_rows],
))
fig.update_layout(title="Tree Diagram: Product Hierarchy")
fig.show()


In [None]:
Visually Appealing Charts with Dynamic Options

In [None]:
import squarify

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Total Price per category, kept as ordinary columns (Category, Price).
sales_by_category = pandas_df.groupby("Category", as_index=False)["Price"].sum()

# Treemap: rectangle area follows each category's total price.
plt.figure(figsize=(8, 6))
squarify.plot(
    sizes=sales_by_category["Price"],
    label=sales_by_category["Category"],
    alpha=0.8,
)
plt.axis("off")
plt.title("Treemap: Total Sales by Category")
plt.show()


In [None]:
Network Graph:

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Creating a network graph: one edge between each product and its category.
G = nx.from_pandas_edgelist(pandas_df, source="Product", target="Category")
plt.figure(figsize=(10, 8))
# spring_layout starts from random positions; a fixed seed makes the drawing
# reproducible across runs (the animated network graph cell also uses seed=42).
pos = nx.spring_layout(G, k=0.3, seed=42)
nx.draw_networkx(G, pos, with_labels=True, node_color="skyblue", node_size=800, edge_color="gray", linewidths=0.5, font_size=10)
plt.title("Network Graph: Product-Category Relationships")
plt.show()


In [None]:
Animated Bar Chart:

In [None]:
import matplotlib.animation as animation

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Sorting the data by price in descending order
sorted_df = pandas_df.sort_values("Price", ascending=False)

# Creating an animated bar chart
fig, ax = plt.subplots(figsize=(8, 6))

def animate(i):
    """Redraw the chart showing only the first `i` rows of `sorted_df`.

    Frame 0 draws no bars; each later frame reveals one more product, in
    descending price order. Labels are set on `ax` directly so the callback
    keeps targeting this figure even if another figure becomes pyplot's
    current one while the animation is running.
    """
    ax.clear()  # clear() also drops labels and title, so they are re-applied below
    ax.bar(sorted_df["Product"].iloc[:i], sorted_df["Price"].iloc[:i], color="skyblue")
    ax.set_xlabel("Product")
    ax.set_ylabel("Price")
    ax.set_title("Animated Bar Chart: Top Products by Price")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# len + 1 frames: from "no bars" up to "all bars". The result is kept in `ani`
# so the animation object is not garbage-collected before it is shown.
ani = animation.FuncAnimation(fig, animate, frames=len(sorted_df)+1, interval=200, blit=False)
plt.show()


In [None]:
3D Scatter Plot:

In [None]:
import plotly.graph_objects as go

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# Marker styling: fixed size, colour driven by Quantity on the Viridis scale.
marker_style = {
    "size": 8,
    "color": pandas_df["Quantity"],
    "colorscale": "Viridis",
    "opacity": 0.8,
}

# One 3D marker per row: Product on x, Category on y, Price on z.
points_trace = go.Scatter3d(
    x=pandas_df["Product"],
    y=pandas_df["Category"],
    z=pandas_df["Price"],
    mode="markers",
    marker=marker_style,
)
fig = go.Figure(data=[points_trace])

# Axis titles for the 3D scene.
fig.update_layout(scene={
    "xaxis_title": "Product",
    "yaxis_title": "Category",
    "zaxis_title": "Price",
})

fig.show()


In [None]:
Interactive Word Cloud:

In [None]:
from wordcloud import WordCloud

# Full dataset as a pandas frame.
pandas_df = df.toPandas()

# How many times each product name occurs.
word_frequencies = pandas_df["Product"].value_counts()

# Word cloud rendered from those frequencies with the "Blues" colormap.
cloud_builder = WordCloud(colormap="Blues", width=800, height=400)
wordcloud = cloud_builder.generate_from_frequencies(word_frequencies)

# Show the rendered image without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Interactive Word Cloud: Product Frequencies")
plt.axis("off")
plt.show()


In [None]:
Animated Scatter Plot:

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.select("Price", "Quantity").toPandas()
prices = pandas_df["Price"].tolist()
quantities = pandas_df["Quantity"].tolist()


def _scatter_upto(count):
    """Return one marker trace holding the first `count` (price, quantity) points.

    cmin/cmax are pinned to the full price range so a given price keeps the
    same colour in every frame, and a single trace means a single colorbar.
    """
    return go.Scatter(
        x=prices[:count],
        y=quantities[:count],
        mode='markers',
        marker=dict(
            size=10,
            color=prices[:count],
            colorscale='Viridis',
            cmin=min(prices, default=0),
            cmax=max(prices, default=1),
            showscale=True
        ),
        showlegend=False
    )


def _padded_range(values):
    """Return [low, high] around `values` with 10% padding, or None if empty."""
    if not values:
        return None
    low, high = min(values), max(values)
    pad = 0.1 * ((high - low) or 1)
    return [low - pad, high + pad]


# Creating an animated scatter plot.
# The "animate" buttons below only do something when the figure has frames, so
# one frame is built per step, each revealing one more point. The initial view
# shows every point; pressing Play replays the build-up from the first point.
fig = go.Figure(
    data=[_scatter_upto(len(prices))],
    frames=[
        go.Frame(data=[_scatter_upto(count)], name=str(count))
        for count in range(1, len(prices) + 1)
    ],
)

# Axis ranges are fixed up front so every frame is drawn on the same scale.
fig.update_layout(
    title_text='Animated Scatter Plot: Price vs Quantity',
    xaxis=dict(title="Price", range=_padded_range(prices)),
    yaxis=dict(title="Quantity", range=_padded_range(quantities)),
)

fig.update_layout(
    updatemenus=[dict(
        type="buttons",
        buttons=[
            dict(label="Play",
                 method="animate",
                 args=[None, {"frame": {"duration": 500, "redraw": True},
                              "fromcurrent": True}]),
            dict(label="Pause",
                 method="animate",
                 args=[[None], {"frame": {"duration": 0, "redraw": False},
                                "mode": "immediate",
                                "fromcurrent": True}])
        ],
        active=0,
        showactive=False,
        x=0.1,
        y=0,
        xanchor="left",
        yanchor="bottom"
    )]
)

fig.show()


In [None]:
Sunburst Chart with Animation:

In [None]:
import plotly.graph_objects as go

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()


def _sunburst_for(rows):
    """Build a two-level sunburst (category -> product) from `rows`.

    Each category is added as an explicit top-level node (parent '') whose
    value is the sum of its products in `rows`. With branchvalues='total' a
    parent's value has to cover its children, and the products' parents must
    exist as nodes for the hierarchy to be well defined.
    """
    category_totals = rows.groupby("Category", sort=False)["Price"].sum()
    return go.Sunburst(
        labels=category_totals.index.tolist() + rows["Product"].tolist(),
        parents=[""] * len(category_totals) + rows["Category"].tolist(),
        values=category_totals.tolist() + rows["Price"].tolist(),
        branchvalues='total',
    )


# Creating a sunburst chart with animation.
# Step i shows the first i + 1 products. The steps are animation frames, not
# stacked traces (which would all draw on top of each other). The initial view
# is the complete chart; Play replays the build-up from the first product.
steps = [_sunburst_for(pandas_df.iloc[: i + 1]) for i in range(len(pandas_df))]
fig = go.Figure(
    data=steps[-1:],
    frames=[go.Frame(data=[step], name=str(i)) for i, step in enumerate(steps)],
)

fig.update_layout(title="Animated Sunburst Chart: Price by Category and Product")

fig.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(
                    # None = play every frame in order
                    args=[None, {"frame": {"duration": 500, "redraw": True},
                                 "fromcurrent": True}],
                    label="Play",
                    method="animate"
                ),
                dict(
                    # [None] with immediate mode interrupts the running animation
                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                   "mode": "immediate"}],
                    label="Pause",
                    method="animate"
                )
            ]),
            type="buttons",
            direction="left",
            x=0.1,
            y=1.1
        )
    ]
)

fig.show()


In [None]:
Animated Network Graph:

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import random
import time

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Creating an animated network graph: one edge per (category, product) row.
G = nx.Graph()
G.add_edges_from(zip(pandas_df["Category"], pandas_df["Product"]))

# Fixed seed so the node positions are the same on every run.
pos = nx.spring_layout(G, k=0.15, seed=42)

plt.figure(figsize=(12, 8))

# A private generator seeded ONCE, outside the loop: reseeding on every
# iteration would produce the same colours for every frame, so nothing would
# appear to change. The run as a whole stays reproducible.
color_rng = random.Random(42)

for _ in range(30):
    # Clear at the start of the frame, not the end, so the last frame is still
    # on the figure when plt.show() is reached.
    plt.clf()
    nx.draw(G, pos, with_labels=True, node_size=800, node_color=[color_rng.random() for _ in range(len(G.nodes))])
    plt.title("Animated Network Graph: Product-Category Relationships")
    plt.pause(0.2)

plt.show()


In [None]:
Animated Radial Bar Chart:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np

# Converting the PySpark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Aggregating data by category. Only Price is summed: a blanket .sum() would
# also aggregate ID and Quantity and try to "sum" the Product strings.
category_df = pandas_df.groupby("Category", as_index=False)["Price"].sum()

# Sorting the data by price in descending order
sorted_df = category_df.sort_values("Price", ascending=False)

# Creating an animated radial bar chart.
# Polar axes make the bars radial: each category gets an evenly spaced angle
# and its bar extends outward from the centre by the category's total price.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
angles = np.linspace(0, 2 * np.pi, len(sorted_df), endpoint=False)
# 80% of each angular slot, leaving a gap between neighbouring bars.
bar_width = 0.8 * 2 * np.pi / max(len(sorted_df), 1)
bars = ax.bar(angles, sorted_df["Price"], width=bar_width, color="skyblue")
ax.set_xticks(angles)
ax.set_xticklabels(sorted_df["Category"])

def update(frame):
    """Scale every bar to its full height divided by (frame + 1)."""
    for bar, full_height in zip(bars, sorted_df["Price"]):
        bar.set_height(full_height / (frame + 1))

# Kept in `ani` so the animation object is not garbage-collected before display.
ani = animation.FuncAnimation(fig, update, frames=sorted_df.shape[0], interval=500, blit=False)
plt.xlabel("Category")
plt.ylabel("Price")
plt.title("Animated Radial Bar Chart: Price by Category")
plt.show()
