In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Download dataset id 53 (Iris) from the UCI ML repository
iris = fetch_ucirepo(id=53)

# Pull the feature table and the target table out of the fetched payload
X = iris.data.features
y = iris.data.targets

# Join features and targets column-wise so they can be analysed as one table
data = pd.concat(objs=[X, y], axis="columns")

# Report the dataset-level metadata followed by the per-variable details
print("\nDataset Metadata:", iris.metadata, sep="\n")
print("\nVariable Information:", iris.variables, sep="\n")

# Preview the top of the combined table
print("\nFirst few rows of the dataset:", data.head(), sep="\n")

# Explore the structure of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
data.info()

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(data.isnull().sum())

# Clean the dataset by filling missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns: the frame also
# holds the non-numeric label column, and pandas >= 2.0 raises TypeError when
# asked to average it. Missing values in non-numeric columns are left as-is.
data_cleaned = data.fillna(data.mean(numeric_only=True))

# Verify that missing values are handled
print("\nMissing Values After Cleaning:")
print(data_cleaned.isnull().sum())

# Basic Data Analysis: summary statistics (describe() defaults to the
# numeric columns when the frame has mixed dtypes)
print("\nBasic Statistics of Numerical Columns:")
print(data_cleaned.describe())

# Average one numerical column within each level of a categorical column
categorical_column = 'class'        # column whose values define the groups
numerical_column = 'sepal length'   # column that is averaged per group

# Aggregate only when both selected columns exist in the cleaned frame
if all(col in data_cleaned.columns for col in (categorical_column, numerical_column)):
    grouped_data = (
        data_cleaned
        .groupby(categorical_column)[numerical_column]
        .mean()
    )
    print(
        f"\nMean of {numerical_column} grouped by {categorical_column}:",
        grouped_data,
        sep="\n",
    )

# Visualizations
# Correlation heatmap.
# The correlation matrix is computed on the numeric columns only: the cleaned
# frame also carries the non-numeric label column, and pandas >= 2.0 raises
# when DataFrame.corr() meets non-numeric data. select_dtypes() is used
# instead of corr(numeric_only=True) so this also runs on older pandas.
plt.figure(figsize=(10, 6))
sns.heatmap(
    data_cleaned.select_dtypes(include='number').corr(),
    annot=True,        # write each coefficient inside its cell
    cmap='coolwarm',
)
plt.title("Correlation Heatmap")
plt.show()

# Histogram with a KDE overlay for the selected numerical column,
# drawn only when that column is present in the cleaned frame
if numerical_column in data_cleaned.columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(
        data_cleaned.loc[:, numerical_column],
        kde=True,
    )
    plt.title(f"Distribution of {numerical_column}")
    plt.show()

# Line chart (demonstration of a trend over "time")
# The dataset has no time axis, so a running row counter is stored in a
# new 'time' column and used as the x-axis
data_cleaned['time'] = range(len(data_cleaned.index))

plt.figure(figsize=(10, 6))
plt.plot(
    data_cleaned['time'],
    data_cleaned['sepal length'],
    label='Sepal Length',
    color='blue',
    linewidth=2,
)
# Axis labels, title, grid and legend
plt.xlabel("Time", fontsize=12)
plt.ylabel("Sepal Length", fontsize=12)
plt.title("Simulated Time-Series of Sepal Length", fontsize=14)
plt.grid(alpha=0.3)
plt.legend(fontsize=10)
plt.show()

# Bar chart: the numerical column compared across the categories
plt.figure(figsize=(8, 5))
sns.barplot(
    data=data_cleaned,
    x=categorical_column,
    y=numerical_column,
    palette='viridis',
    edgecolor='black',
)
# Title and axis labels are built from the selected column names
plt.xlabel(categorical_column.capitalize(), fontsize=12)
plt.ylabel(f"Average {numerical_column.capitalize()}", fontsize=12)
plt.title(f"Average {numerical_column} per {categorical_column}", fontsize=14)
# Tick label sizes and a light horizontal grid
plt.yticks(fontsize=10)
plt.xticks(fontsize=10)
plt.grid(axis='y', alpha=0.3)
plt.show()

# Styled histogram of the numerical column: 15 bins, KDE overlay
plt.figure(figsize=(8, 5))
sns.histplot(
    data_cleaned.loc[:, numerical_column],
    kde=True,
    color='green',
    bins=15,
    edgecolor='black',
)
# Axis labels, title and a faint grid
plt.xlabel(numerical_column.capitalize(), fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title(f"Distribution of {numerical_column}", fontsize=14)
plt.grid(alpha=0.3)
plt.show()

# Scatter plot: sepal length against petal length, coloured by category
plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data_cleaned,
    x='sepal length',
    y='petal length',
    hue=categorical_column,
    palette='deep',
    s=100,
    edgecolor='black',
)
# Axis labels and title
plt.xlabel("Sepal Length", fontsize=12)
plt.ylabel("Petal Length", fontsize=12)
plt.title("Sepal Length vs. Petal Length", fontsize=14)
# Legend is titled after the categorical column; faint grid behind the points
plt.legend(
    title=categorical_column.capitalize(),
    fontsize=10,
    title_fontsize=12,
)
plt.grid(alpha=0.3)
plt.show()

# Closing summary of the observations made above
print("\nPatterns and Findings:")
print("1. Basic statistics reveal the central tendency and spread of numerical columns.")
# The grouping remark is printed only when both selected columns exist
if all(col in data_cleaned.columns for col in (categorical_column, numerical_column)):
    print(
        f"2. Grouping by {categorical_column} shows how {numerical_column} varies across different classes.",
        "   For example, the mean sepal length differs significantly between the classes.",
        sep="\n",
    )
print("3. Correlation heatmap shows relationships between numerical features.")

ModuleNotFoundError: No module named 'matplotlib'