Analyzing Data with Pandas and Visualizing Results with Matplotlib

This notebook demonstrates how to analyze a dataset using pandas and visualize insights using matplotlib and seaborn. We’ll use the Iris dataset in CSV format.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Switch seaborn to its 'whitegrid' style before any figure is drawn
sns.set_style(style='whitegrid')

Task 1: Load and Explore the Dataset
Load the Iris dataset from a CSV file and inspect its structure.

In [None]:
# Load the Iris data. Only the file read is guarded: a missing file is reported
# with a readable message and then re-raised, so "Run All" stops here instead
# of carrying on with an undefined (or stale) `df`. Any other failure is left
# to propagate with its full traceback rather than being reduced to one line.
try:
    df = pd.read_csv('iris.csv')
except FileNotFoundError:
    print("Error: Dataset file not found.")
    raise
print("Dataset loaded successfully.")

# First few rows
display(df.head())

# Data types
print("\nData types:")
print(df.dtypes)

# Missing values
print("\nMissing values per column:")
print(df.isnull().sum())

Task 2: Basic Data Analysis
We compute descriptive statistics for the dataset and compare the mean of each measurement across species.

In [None]:
# Descriptive statistics
display(df.describe())

# Mean values per species. numeric_only=True restricts the aggregation to the
# numeric columns, so the cell keeps working on pandas >= 2.0 even if the frame
# holds another non-numeric column (which would otherwise raise a TypeError).
display(df.groupby('species').mean(numeric_only=True))

Task 3: Data Visualizations
We create four plots — a line chart, a bar chart, a histogram, and a scatter plot — to understand the dataset better.

In [None]:
# Line chart: sepal length of each sample, plotted against the row index
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df.index, df['sepal length (cm)'], color='blue', label='Sepal Length')
ax.set(
    title='Sepal Length Across Samples',
    xlabel='Sample Index',
    ylabel='Sepal Length (cm)',
)
ax.legend()
plt.show()

In [None]:
# Bar chart: one bar per species showing its mean petal length
mean_petal = df.groupby('species')['petal length (cm)'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(mean_petal.index, mean_petal.values, color=['green', 'blue', 'red'])
ax.set(
    title='Average Petal Length by Species',
    xlabel='Species',
    ylabel='Average Length (cm)',
)
plt.show()

In [None]:
# Histogram: distribution of sepal length over 15 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['sepal length (cm)'], bins=15, color='purple', edgecolor='black')
ax.set(
    title='Distribution of Sepal Lengths',
    xlabel='Sepal Length (cm)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Scatter plot: sepal length against petal length, one color per species
fig, ax = plt.subplots(figsize=(10, 6))
colors = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
ax.scatter(
    df['sepal length (cm)'],
    df['petal length (cm)'],
    c=df['species'].map(colors),
    alpha=0.7,
)
ax.set(
    title='Sepal vs. Petal Length',
    xlabel='Sepal Length (cm)',
    ylabel='Petal Length (cm)',
)

# The single scatter call carries no per-species labels, so the legend is
# built from proxy markers: one filled circle per entry in `colors`.
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=shade, markersize=10)
    for shade in colors.values()
]
ax.legend(handles, colors.keys(), title='Species')
plt.show()