# Data Exploration & Visualization

## Importing the Dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris

# Keep the raw scikit-learn Bunch under its own name instead of overwriting
# it: besides the feature matrix it carries `target` and `target_names`,
# which the plotting cells further down read through `iris`.
iris = load_iris()

# Feature-only DataFrame, one column per measurement named after
# `feature_names`.
iris_data = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first ten rows as the cell's rich output.
iris_data.head(10)

: 

## Quantitative Checks

In [2]:
# Horizontal rule printed between the individual reports.
rule = "_______________________________________________________________"

# Dimensions of the frame as (rows, columns).
frame_shape = iris_data.shape
print("Dataset Shape:", frame_shape)
print(rule)

# Column labels.
column_index = iris_data.columns
print("\nColumn Names:", column_index)
print(rule)

# dtype of every column.
column_dtypes = iris_data.dtypes
print("\nData Types:\n", column_dtypes)
print(rule)

# Descriptive statistics per column as produced by `describe()`.
summary_table = iris_data.describe()
print("\nBasic Statistical Summary:\n", summary_table)
print(rule)

# Number of missing values in each column.
missing_per_column = iris_data.isna().sum()
print(missing_per_column)

Dataset Shape: (150, 4)
_______________________________________________________________

Column Names: Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
_______________________________________________________________

Data Types:
 sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
dtype: object
_______________________________________________________________

Basic Statistical Summary:
        sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.4000

## Summary Statistics Techniques

In [4]:
# Horizontal rule printed between the individual reports.
rule = "_______________________________________________________________"

# Frequency table: how often each value (rounded to one decimal) occurs in
# every column; values absent from a column are reported as 0 rather than NaN.
value_frequencies = iris_data.round(1).apply(pd.Series.value_counts)
print("Frequency Distributions:")
print(value_frequencies.fillna(0))
print(rule)

# Mode: `mode()` can return several rows when there are ties, so only the
# first row is reported.
first_modes = iris_data.mode().iloc[0]
print("Mode of Each Column:")
print(first_modes)
print(rule)

# Percentiles (25th, 50th, 75th), converted to the 0-1 fractions that
# `quantile` expects.
percentiles = [25, 50, 75]
quantile_levels = np.array(percentiles) / 100
print("Percentile Computations:")
print(iris_data.quantile(q=quantile_levels))
print(rule)

# Central tendency: mean and median per column.
column_means = iris_data.mean()
column_medians = iris_data.median()
print("Mean Values:")
print(column_means)
print("\nMedian Values:")
print(column_medians)
print(rule)

# Spread, part 1: range (max - min) per column.
value_range = iris_data.max() - iris_data.min()
print("Range of Each Column:")
print(value_range)
print(rule)

# Spread, part 2: variance per column.
column_variances = iris_data.var()
print("Variance of Each Column:")
print(column_variances)

Frequency Distributions:
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0.1                0.0               0.0                0.0               5.0
0.2                0.0               0.0                0.0              29.0
0.3                0.0               0.0                0.0               7.0
0.4                0.0               0.0                0.0               7.0
0.5                0.0               0.0                0.0               1.0
..                 ...               ...                ...               ...
7.3                1.0               0.0                0.0               0.0
7.4                1.0               0.0                0.0               0.0
7.6                1.0               0.0                0.0               0.0
7.7                4.0               0.0                0.0               0.0
7.9                1.0               0.0                0.0               0.0

[74 rows x 4 columns]
________________

# Visualization

## Pie Chart

In [None]:
# Attach a species label to every row. The Bunch is loaded inside this cell
# so the labels never depend on a leftover `iris` variable in the kernel, and
# the guard makes re-running the cell a no-op for the DataFrame.
if "species" not in iris_data.columns:
    iris_bunch = load_iris()
    # `target` holds one integer class code per sample; indexing
    # `target_names` with it maps each row to its species name.
    iris_data["species"] = iris_bunch.target_names[iris_bunch.target]

# Count occurrences of each species
species_counts = iris_data["species"].value_counts()

# Create a pie chart on an explicit Axes so the figure is self-contained
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    species_counts,
    labels=species_counts.index,
    autopct="%1.1f%%",  # show each slice's share with one decimal
    colors=["lightblue", "lightgreen", "lightcoral"],
)
ax.set_title("Distribution of Iris Species")
plt.show()

## Bar Chart

In [None]:
# Mean of every numeric feature. `numeric_only=True` skips the "species"
# label column when it is present and, unlike dropping that column by name,
# does not fail when it has not been added yet.
mean_values = iris_data.mean(numeric_only=True)

# Create a bar chart
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(mean_values.index, mean_values.values, color=['blue', 'green', 'red', 'purple'])

# Add labels and title
ax.set_xlabel("Features")
ax.set_ylabel("Mean Value")
ax.set_title("Mean Values of Iris Dataset Features")
ax.tick_params(axis="x", labelrotation=45)  # Rotate labels for better readability

# Show the plot
plt.show()

## Histograms

In [None]:
# Draw one histogram per feature: 20 bins each, black bar outlines,
# on a figure created with figsize (10, 6).
iris_data.hist(bins=20, edgecolor='black', figsize=(10, 6))

# Put a shared title on the figure that `hist` just created.
plt.gcf().suptitle("Histograms of Iris Dataset Features", fontsize=16)

# Render the figure.
plt.show()

## Scatter Plots

In [None]:
# Attach a species label to every row. The Bunch is loaded inside this cell
# so the labels never depend on a leftover `iris` variable in the kernel, and
# the guard makes re-running the cell a no-op for the DataFrame.
if "species" not in iris_data.columns:
    iris_bunch = load_iris()
    # `target` holds one integer class code per sample; indexing
    # `target_names` with it maps each row to its species name.
    iris_data["species"] = iris_bunch.target_names[iris_bunch.target]

# Select two features to plot (e.g., sepal length vs petal length)
x_feature = "sepal length (cm)"
y_feature = "petal length (cm)"

# Create scatter plot: one `scatter` call per species so each group gets its
# own color and legend entry. `sort=False` keeps the groups in order of first
# appearance in the frame.
fig, ax = plt.subplots(figsize=(8, 6))
for species, subset in iris_data.groupby("species", sort=False):
    ax.scatter(subset[x_feature], subset[y_feature], label=species)

# Add labels and title
ax.set_xlabel(x_feature)
ax.set_ylabel(y_feature)
ax.set_title(f"Scatter Plot of {x_feature} vs {y_feature}")
ax.legend()
plt.show()

## Box Plot

In [None]:
# Attach a species label to every row. The Bunch is loaded inside this cell
# so the labels never depend on a leftover `iris` variable in the kernel, and
# the guard makes re-running the cell a no-op for the DataFrame.
if "species" not in iris_data.columns:
    iris_bunch = load_iris()
    # `target` holds one integer class code per sample; indexing
    # `target_names` with it maps each row to its species name.
    iris_data["species"] = iris_bunch.target_names[iris_bunch.target]

# Create a box plot of sepal length grouped by species (only this one
# feature is plotted, not every numeric column).
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=iris_data, x="species", y="sepal length (cm)", ax=ax)
ax.set_title("Box Plot of Sepal Length by Species")
ax.set_xlabel("Species")
ax.set_ylabel("Sepal Length (cm)")
plt.show()
