<a href="https://colab.research.google.com/github/pintukumargithub/Weak-2-ML/blob/main/Multivariate_Gaussian_Distribution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Univariate Gaussian Distribution of each feature (one at a time) from the Iris dataset

In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from scipy.stats import norm

# Load the Iris dataset from CSV
df = pd.read_csv('Iris.csv')
print(df.columns)

# Select two features and the first 20 rows
feature_names = ['SepalLengthCm', 'SepalWidthCm']
X = df[feature_names].iloc[:20].values

# Fit and plot an independent 1D Gaussian for each feature in turn
for i, feature_label in enumerate(feature_names):
    feature_data = X[:, i]
    mu = np.mean(feature_data)
    # np.std uses ddof=0 by default, i.e. the population standard deviation
    sigma = np.std(feature_data)

    print("\nFor", feature_label, ":", feature_data)
    print("Mean:", mu)
    print("Standard Deviation:", sigma)

    # Gaussian curve sampled over mu +/- 3 sigma
    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 200)
    y = norm.pdf(x, mu, sigma)

    print("\nType of x", type(x), x.shape)
    print("Type of y", type(y), y.shape)

    # Density at each observed value; computed once and shared by the
    # vertical lines and the markers below
    point_density = norm.pdf(feature_data, mu, sigma)

    # Create figure
    fig = go.Figure()

    # Add Gaussian curve
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        mode='lines',
        name=f'{feature_label} Gaussian',
        line=dict(color='blue', width=2)
    ))

    # Add vertical lines for every data point as ONE trace: each segment is
    # (val, 0) -> (val, pdf(val)) followed by a NaN row, which plotly renders
    # as a break in the line. This avoids creating one trace per data point.
    gap = np.full(feature_data.shape, np.nan)
    stem_x = np.column_stack((feature_data, feature_data, gap)).ravel()
    stem_y = np.column_stack(
        (np.zeros_like(point_density), point_density, gap)
    ).ravel()
    fig.add_trace(go.Scatter(
        x=stem_x,
        y=stem_y,
        mode='lines',
        line=dict(color='red', width=1),
        showlegend=False
    ))

    # Add red dot markers on the curve
    fig.add_trace(go.Scatter(
        x=feature_data,
        y=point_density,
        mode='markers',
        marker=dict(color='red', size=6),
        name='Data Points'
    ))

    # Layout for each plot
    fig.update_layout(
        title=f'1D Gaussian Fit for {feature_label}',
        xaxis_title=feature_label,
        yaxis_title='Probability Density',
        template='plotly_white',
        height=400
    )

    # Show the plot
    fig.show()


Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

For SepalLengthCm : [5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1]
Mean: 5.034999999999999
Standard Deviation: 0.4162631379308046

Type of x <class 'numpy.ndarray'> (200,)
Type of y <class 'numpy.ndarray'> (200,)



For SepalWidthCm : [3.5 3.  3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 3.7 3.4 3.  3.  4.  4.4 3.9 3.5
 3.8 3.8]
Mean: 3.4799999999999995
Standard Deviation: 0.3969886648255842

Type of x <class 'numpy.ndarray'> (200,)
Type of y <class 'numpy.ndarray'> (200,)


# Multivariate Gaussian Distribution of 2 features from Iris dataset

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from scipy.stats import norm, multivariate_normal


# Load the Iris dataset from CSV
df = pd.read_csv('Iris.csv')
print(df.columns)

# Two features, first 20 rows
X = df[['SepalLengthCm', 'SepalWidthCm']].iloc[:20].values
feature_1 = X[:, 0]
feature_2 = X[:, 1]

# Fit a 2D Gaussian: per-column mean and the covariance of the columns
# (rowvar=False treats each column of X as a variable)
mean = np.mean(X, axis=0)
cov = np.cov(X, rowvar=False)
rv = multivariate_normal(mean, cov)

print("\nFor Multivariate distribution ")
print("SepalLengthCm", feature_1)
print("SepalWidthCm", feature_2)
print("Mean:", mean)
print("Covariance matrix:\n", cov)

# Evaluation grid: data range padded by 0.5 on each side, 100 x 100 points
x = np.linspace(feature_1.min() - 0.5, feature_1.max() + 0.5, 100)
y = np.linspace(feature_2.min() - 0.5, feature_2.max() + 0.5, 100)
X_grid, Y_grid = np.meshgrid(x, y)
# dstack pairs the grids into an array whose last axis is (x, y)
Z = rv.pdf(np.dstack((X_grid, Y_grid)))

print("\nType of X_grid", type(X_grid), X_grid.shape)
print("Type of Y_grid", type(Y_grid), Y_grid.shape)
print("Type of Z", type(Z), Z.shape)

# Surface
surface = go.Surface(
    x=X_grid,
    y=Y_grid,
    z=Z,
    colorscale='Viridis',
    opacity=0.85,
    name='Gaussian Surface'
)

# Real data points in 3D, placed at their own density so they sit on the surface
scatter = go.Scatter3d(
    x=feature_1,
    y=feature_2,
    z=rv.pdf(X),
    mode='markers',
    marker=dict(size=5, color='red'),
    name='Data Points'
)

layout = go.Layout(
    title='3D Multivariate Gaussian Fit on Iris CSV Data',
    scene=dict(
        xaxis_title='SepalLengthCm',
        yaxis_title='SepalWidthCm',
        zaxis_title='Probability Density',
    ),
    height=600,
    margin=dict(l=0, r=0, t=50, b=0)
)

fig_3d = go.Figure(data=[surface, scatter], layout=layout)
fig_3d.show()

# Multivariate Gaussian Distribution of 3 features from Iris dataset

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from scipy.stats import multivariate_normal

def _padded_axis(values, pad=0.5, num=100):
    """Return `num` evenly spaced points spanning the data range widened by `pad` on both ends."""
    return np.linspace(values.min() - pad, values.max() + pad, num)


# --- Data: three measurement columns, first 20 rows of the CSV ---
df = pd.read_csv('Iris.csv')
print(df.columns)

X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']].iloc[:20].values
# Each row of X.T is one column of X
feature_1, feature_2, feature_3 = X.T

# --- Fit: column-wise mean and covariance define the 3D Gaussian ---
mean = X.mean(axis=0)
cov = np.cov(X, rowvar=False)
rv = multivariate_normal(mean, cov)

# === 1. Observed points, coloured by their density under the fit ===
pdf_values = rv.pdf(X)

print("\nFor 3D Multivariate Gaussian distribution:")
print("SepalLengthCm:", feature_1)
print("SepalWidthCm:", feature_2)
print("PetalLengthCm:", feature_3)
print("Mean:", mean)
print("Covariance matrix:\n", cov)
print("\nType of feature_1", type(feature_1), feature_1.shape)
print("Type of feature_2", type(feature_2), feature_2.shape)
print("Type of feature_3", type(feature_3), feature_3.shape)
print("Type of pdf_values", type(pdf_values), pdf_values.shape)

marker_style = {
    'size': 6,
    'color': pdf_values,
    'colorscale': 'Viridis',
    'colorbar': {'title': 'PDF'},
    'opacity': 0.9,
}
scatter = go.Scatter3d(
    x=feature_1, y=feature_2, z=feature_3,
    mode='markers',
    marker=marker_style,
    name='Data Points',
)

scatter_axes = {
    'xaxis_title': 'SepalLengthCm',
    'yaxis_title': 'SepalWidthCm',
    'zaxis_title': 'PetalLengthCm',
}
layout_scatter = go.Layout(
    title='3D Scatter Colored by Gaussian PDF (Sepal & Petal Length/Width)',
    scene=scatter_axes,
    height=700,
)

fig_scatter = go.Figure(data=[scatter], layout=layout_scatter)
fig_scatter.show()

# === 2. 2D slice of the density with PetalLengthCm pinned to its mean ===
fixed_petal_length = mean[2]

# Grid over the two sepal features
x = _padded_axis(feature_1)
y = _padded_axis(feature_2)
X_grid, Y_grid = np.meshgrid(x, y)

# One row per grid node: (SepalLength, SepalWidth, fixed PetalLength)
grid_points = np.stack(
    [
        X_grid.ravel(),
        Y_grid.ravel(),
        np.full_like(X_grid, fixed_petal_length).ravel(),
    ],
    axis=1,
)

# Evaluate on the flat list of nodes, then fold back to the grid's shape
Z = rv.pdf(grid_points).reshape(X_grid.shape)

print("\nType of X_grid", type(X_grid), X_grid.shape)
print("Type of Y_grid", type(Y_grid), Y_grid.shape)
print("Type of Z", type(Z), Z.shape)

surface = go.Surface(
    x=X_grid, y=Y_grid, z=Z,
    colorscale='Viridis',
    showscale=True,
    colorbar={'title': 'PDF'},
    name='Gaussian Surface',
)

surface_axes = {
    'xaxis_title': 'SepalLengthCm',
    'yaxis_title': 'SepalWidthCm',
    'zaxis_title': 'PDF Value',
}
layout_surface = go.Layout(
    title=f'Gaussian PDF Surface at PetalLengthCm = {fixed_petal_length:.2f}',
    scene=surface_axes,
    height=700,
)

fig_surface = go.Figure(data=[surface], layout=layout_surface)
fig_surface.show()


# Multivariate Gaussian Distribution of 3 features from Iris dataset (3D isosurface view)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from scipy.stats import multivariate_normal

# Load data: three features, first 20 rows
df = pd.read_csv('Iris.csv')
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']].iloc[:20].values

# Compute mean and covariance (rowvar=False: each column of X is a variable)
mean = np.mean(X, axis=0)
cov = np.cov(X, rowvar=False)
rv = multivariate_normal(mean, cov)

# Generate a 3D grid over the data range, padded by 0.5 on every side
x = np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 30)
y = np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 30)
z = np.linspace(X[:, 2].min() - 0.5, X[:, 2].max() + 0.5, 30)

X_grid, Y_grid, Z_grid = np.meshgrid(x, y, z)
# One (x, y, z) row per grid node; all three grids are flattened the same way
grid_points = np.column_stack((X_grid.ravel(), Y_grid.ravel(), Z_grid.ravel()))

# Compute PDF values for all grid points, folded back to the grid's shape
pdf_values = rv.pdf(grid_points).reshape(X_grid.shape)

print("\nFor 3D Multivariate Gaussian distribution:")
print("Data shape:", X.shape)
print("Mean:", mean)
print("Covariance matrix:\n", cov)
# Print the type as well as the shape so the "Type of" labels are accurate
print("\nType of X_grid", type(X_grid), X_grid.shape)
print("Type of Y_grid", type(Y_grid), Y_grid.shape)
print("Type of Z_grid", type(Z_grid), Z_grid.shape)
print("Type of pdf_values", type(pdf_values), pdf_values.shape)

# Create 3D isosurface plot: three level sets between the 5th percentile of
# the gridded density and its maximum
isosurface = go.Isosurface(
    x=X_grid.ravel(),
    y=Y_grid.ravel(),
    z=Z_grid.ravel(),
    value=pdf_values.ravel(),
    isomin=np.percentile(pdf_values, 5),
    isomax=pdf_values.max(),
    surface_count=3,
    colorscale='Viridis', # 'Viridis' is a perceptually uniform colormap, going from dark purple (low) to yellow-green (high).
    caps=dict(x_show=False, y_show=False, z_show=False),
    colorbar=dict(title='PDF'),
)

layout = go.Layout(
    title='3D Multivariate Gaussian Isosurface (Full Distribution)',
    scene=dict(
        xaxis_title='SepalLengthCm',
        yaxis_title='SepalWidthCm',
        zaxis_title='PetalLengthCm',
    ),
    height=700
)

fig = go.Figure(data=[isosurface], layout=layout)
fig.show()


# Comparison of two Multidimensional Gaussian Distributions with dummy parameters

In [None]:
import numpy as np
import plotly.graph_objs as go
from scipy.stats import multivariate_normal

def _gaussian_isosurface(grid_x, grid_y, grid_z, pdf, colorscale, name):
    """Build a 3-level isosurface trace for a density sampled on a 3D grid.

    grid_x, grid_y, grid_z and pdf are same-shaped arrays; they are flattened
    identically so each density value stays paired with its coordinates.
    Levels run from the 5th percentile of `pdf` up to its maximum.
    """
    return go.Isosurface(
        x=grid_x.ravel(),
        y=grid_y.ravel(),
        z=grid_z.ravel(),
        value=pdf.ravel(),
        isomin=np.percentile(pdf, 5),
        isomax=pdf.max(),
        surface_count=3,
        colorscale=colorscale,
        opacity=0.6,
        caps=dict(x_show=False, y_show=False, z_show=False),
        name=name
    )


# -------------------------------
# Mean and Covariance for two Gaussians
mean1 = np.array([5.0, 3.5, 1.5])
cov1 = np.array([
    [0.5, 0.0, 0.0],
    [0.0, 0.5, 0.0],
    [0.0, 0.0, 0.5]
])

mean2 = np.array([5.0, 3.5, 1.5])
# A covariance matrix must be symmetric: the 0.1 covariance between the first
# two variables has to appear both above and below the diagonal.
cov2 = np.array([
    [0.8, 0.1, 0.0],
    [0.1, 0.8, 0.0],
    [0.0, 0.0, 0.8]
])

# Guard against hand-edited parameters that are not valid covariance matrices
assert np.allclose(cov1, cov1.T), "cov1 must be symmetric"
assert np.allclose(cov2, cov2.T), "cov2 must be symmetric"
# -------------------------------
print("\nmean1 :", mean1)
print("cov1 :\n", cov1)

print("\nmean2 :", mean2)
print("cov2 :\n", cov2)

# Common grid shared by both distributions.
# NOTE(review): z starts at 1, only 0.5 below the shared mean of 1.5, so the
# lower part of each blob is likely cut off by the grid; widen if needed.
x = np.linspace(3, 8, 40)
y = np.linspace(2, 5, 40)
z = np.linspace(1, 4, 40)
X, Y, Z = np.meshgrid(x, y, z)
grid_points = np.column_stack((X.ravel(), Y.ravel(), Z.ravel()))

# Evaluate PDFs on the flat list of grid nodes, then fold back to grid shape
rv1 = multivariate_normal(mean1, cov1)
pdf1 = rv1.pdf(grid_points).reshape(X.shape)

rv2 = multivariate_normal(mean2, cov2)
pdf2 = rv2.pdf(grid_points).reshape(X.shape)

# Isosurface for Gaussian 1
isosurface1 = _gaussian_isosurface(X, Y, Z, pdf1, 'Viridis', 'Gaussian 1')

# Isosurface for Gaussian 2
isosurface2 = _gaussian_isosurface(X, Y, Z, pdf2, 'Plasma', 'Gaussian 2')

# Layout and figure
layout = go.Layout(
    title='Comparison of Two 3D Gaussian Distributions',
    scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z',
    ),
    height=700
)

fig = go.Figure(data=[isosurface1, isosurface2], layout=layout)
fig.show()
