In [None]:
import pandas as pd

In [None]:
# Task 1. Load the dataset from 'Iris.csv' into `df`.
try:
    df = pd.read_csv('Iris.csv')  # Replace with your dataset path
except FileNotFoundError:
    print("The file was not found. Please check the file path.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper,
    # whereas a raised exception stops the run at this cell with a traceback.
    # Every later cell reads `df`, so continuing without it cannot succeed.
    raise
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")


In [None]:
# Show the leading rows returned by DataFrame.head().
head_rows = df.head()
print("\nFirst few rows of the dataset:", head_rows, sep="\n")


In [None]:
# List the dtype pandas assigned to each column.
column_types = df.dtypes
print("\nData Types:", column_types, sep="\n")

In [None]:
# Count null entries per column (isna is the same check as isnull).
null_counts = df.isna().sum()
print("\nMissing Values:", null_counts, sep="\n")


In [None]:
# Report per-column null counts, mean-impute the numeric columns, report again.
nulls_before = df.isna().sum()
print("Missing values before filling:")
print(nulls_before)

# Each numeric column's nulls are replaced by that column's own mean;
# non-numeric columns are left untouched.
numeric_cols = df.select_dtypes(include='number').columns
numeric_part = df[numeric_cols]
df[numeric_cols] = numeric_part.fillna(numeric_part.mean())

nulls_after = df.isna().sum()
print("\nMissing values after filling:")
print(nulls_after)


In [None]:
# Cleaning step, kept as a demonstration of missing-value handling.
# Per the original note, this dataset is not expected to contain missing
# values; any that do exist in numeric columns are replaced by the mean
# of their column.
numeric_cols = df.select_dtypes(include='number').columns
numeric_values = df[numeric_cols]
per_column_mean = numeric_values.mean()
df[numeric_cols] = numeric_values.fillna(per_column_mean)


In [None]:
# Re-count nulls per column to confirm the fill step took effect.
remaining_nulls = df.isna().sum()
print("\nMissing values after filling:", remaining_nulls, sep="\n")


In [None]:
# Task 2

In [None]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print("Basic statistical summary:", summary_stats, sep="\n")


In [None]:
# Mean of every numeric column within each 'Species' group.
species_averages = df.groupby('Species').mean(numeric_only=True)
print("\nAverage measurements per species:", species_averages, sep="\n")


In [None]:
## Observations:
- Iris-setosa tends to have the smallest petal length and width.
- Iris-virginica generally has the largest values in most measurements.
- The features are distinct enough that we can potentially separate species based on these values.


In [None]:
# Task 3. Data visualizations
import matplotlib.pyplot as plt

# Line chart: petal length plotted against the DataFrame index,
# drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df.index, df['PetalLengthCm'], label='Petal Length (cm)', color='green')
ax.set_title("Petal Length Trend Over Index")
ax.set_xlabel("Index")
ax.set_ylabel("Petal Length (cm)")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Bar chart: mean petal length per species, one bar per group.
species_means = df.groupby('Species')['PetalLengthCm'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
# rot=45 tilts the x tick labels; one colour is listed per bar.
species_means.plot(
    kind='bar',
    ax=ax,
    rot=45,
    color=['skyblue', 'lightgreen', 'salmon'],
)
ax.set_title("Average Petal Length by Species")
ax.set_xlabel("Species")
ax.set_ylabel("Petal Length (cm)")
ax.grid(axis='y')
plt.show()


In [None]:
# Histogram: sepal width split into 15 bins.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['SepalWidthCm'], bins=15, color='purple', edgecolor='black')
ax.set_title("Distribution of Sepal Width")
ax.set_xlabel("Sepal Width (cm)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()


In [None]:
# Scatter plot: sepal length on x against petal length on y.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['SepalLengthCm'], df['PetalLengthCm'], c='blue', alpha=0.7)
ax.set_title("Sepal Length vs Petal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Petal Length (cm)")
ax.grid(True)
plt.show()


In [None]:
## Plot Insights:
- Petal length shows clear trends by species.
- Sepal width appears approximately normally distributed (the histogram is roughly bell-shaped).
- There is a positive correlation between Sepal Length and Petal Length.


In [None]:
import seaborn as sns

In [None]:
# Bar chart comparing average petal length for each species (seaborn version),
# saved to 'iris_bar_chart.png'.
# Use the CamelCase column names when present, otherwise the snake_case ones.
petal_length_col = 'PetalLengthCm' if 'PetalLengthCm' in df.columns else 'petal_length'
species_col = 'Species' if 'Species' in df.columns else 'species'

# Draw on a figure created here (default size) rather than whichever figure
# happens to be current.
fig, ax = plt.subplots()

# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue. dodge=False keeps one full-width bar per
# species instead of narrow, offset bars.
sns.barplot(
    x=species_col,
    y=petal_length_col,
    hue=species_col,
    data=df,
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The hue mapping may add a legend that only repeats the x tick labels.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

ax.set_title('Average Petal Length by Species', fontsize=16)
ax.set_xlabel('Species', fontsize=12)
ax.set_ylabel('Petal Length (cm)', fontsize=12)
fig.tight_layout()
fig.savefig('iris_bar_chart.png')
plt.show()

In [None]:
# Histogram with KDE overlay: distribution of sepal width,
# saved to 'iris_histogram.png'.
# Use the CamelCase column name when present, otherwise the snake_case one.
sepal_width_col = 'SepalWidthCm' if 'SepalWidthCm' in df.columns else 'sepal_width'

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x=sepal_width_col, kde=True, bins=20, color='skyblue', ax=ax)
ax.set_title('Distribution of Sepal Width', fontsize=16)
ax.set_xlabel('Sepal Width (cm)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
fig.tight_layout()
fig.savefig('iris_histogram.png')
plt.show()