# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset, by showing relevant visualizations that help understand the problem you are modelling.

Please make sure to write down your conclusions in the final notebook and to remove these intructions.

# Imports

In [12]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 500)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'matplotlib'

# Data

In [6]:
df = pd.read_csv("../data/abalone.csv")
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# EDA

In [7]:
## 1. Dataset Overview


In [None]:
# Basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names and types:")
print(df.dtypes)
print(f"\nDataset info:")
df.info()


In [None]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")


In [None]:
# Statistical summary
df.describe()


## 2. Target Variable Analysis (Rings)


In [None]:
# Distribution of Rings (age indicator)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram
axes[0].hist(df['Rings'], bins=30, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Number of Rings')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Rings (Age)')
axes[0].axvline(df['Rings'].mean(), color='red', linestyle='--', label=f'Mean: {df["Rings"].mean():.2f}')
axes[0].axvline(df['Rings'].median(), color='green', linestyle='--', label=f'Median: {df["Rings"].median():.2f}')
axes[0].legend()

# Box plot
axes[1].boxplot(df['Rings'], vert=True)
axes[1].set_ylabel('Number of Rings')
axes[1].set_title('Box Plot of Rings')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Target variable (Rings) statistics:")
print(f"Mean: {df['Rings'].mean():.2f}")
print(f"Median: {df['Rings'].median():.2f}")
print(f"Std: {df['Rings'].std():.2f}")
print(f"Min: {df['Rings'].min()}")
print(f"Max: {df['Rings'].max()}")


## 3. Categorical Feature Analysis


In [None]:
# Sex distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Count plot
sex_counts = df['Sex'].value_counts()
axes[0].bar(sex_counts.index, sex_counts.values, alpha=0.7, edgecolor='black')
axes[0].set_xlabel('Sex')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Sex')
for i, v in enumerate(sex_counts.values):
    axes[0].text(i, v + 50, str(v), ha='center', va='bottom')

# Box plot: Rings by Sex
df.boxplot(column='Rings', by='Sex', ax=axes[1])
axes[1].set_xlabel('Sex')
axes[1].set_ylabel('Number of Rings')
axes[1].set_title('Rings Distribution by Sex')
plt.suptitle('')

plt.tight_layout()
plt.show()

print("Sex categories:")
print(df['Sex'].value_counts())
print("\nAverage rings by sex:")
print(df.groupby('Sex')['Rings'].mean().sort_values(ascending=False))


## 4. Numerical Features Distribution


In [None]:
# Distribution of all numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    axes[idx].hist(df[col], bins=30, edgecolor='black', alpha=0.7)
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Frequency')
    axes[idx].set_title(f'Distribution of {col}')
    axes[idx].axvline(df[col].mean(), color='red', linestyle='--', linewidth=2, label='Mean')
    axes[idx].legend()

plt.tight_layout()
plt.show()


## 5. Correlation Analysis


# Correlation heatmap
plt.figure(figsize=(12, 10))
correlation_matrix = df[numerical_cols].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Numerical Features', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

# Correlation with target variable
print("Correlation with Rings (target):")
print(correlation_matrix['Rings'].sort_values(ascending=False))


## 6. Feature Relationships with Target


m

In [None]:
# Scatter plots: Features vs Rings
features_to_plot = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shell weight']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for idx, feature in enumerate(features_to_plot):
    axes[idx].scatter(df[feature], df['Rings'], alpha=0.5, s=10)
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Rings')
    axes[idx].set_title(f'{feature} vs Rings')
    
    # Add trend line
    z = np.polyfit(df[feature], df['Rings'], 1)
    p = np.poly1d(z)
    axes[idx].plot(df[feature].sort_values(), p(df[feature].sort_values()), 
                   "r--", alpha=0.8, linewidth=2)
    
    # Add correlation
    corr = df[feature].corr(df['Rings'])
    axes[idx].text(0.05, 0.95, f'Corr: {corr:.3f}', 
                   transform=axes[idx].transAxes, 
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
                   verticalalignment='top')

# Hide the last subplot if not needed
if len(features_to_plot) < 6:
    axes[5].set_visible(False)

plt.tight_layout()
plt.show()


## 7. Outlier Detection


In [None]:
# Box plots for outlier detection
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    axes[idx].boxplot(df[col], vert=True)
    axes[idx].set_ylabel(col)
    axes[idx].set_title(f'Box Plot: {col}')
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Identify outliers using IQR method
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

print("Outlier detection using IQR method:")
print("-" * 60)
for col in numerical_cols:
    n_outliers, lower, upper = detect_outliers_iqr(df, col)
    print(f"{col:20s}: {n_outliers:4d} outliers (range: [{lower:.3f}, {upper:.3f}])")


## 8. Pairplot for Key Features


In [None]:
# Pairplot for selected features
selected_features = ['Length', 'Diameter', 'Whole weight', 'Shell weight', 'Rings', 'Sex']
sns.pairplot(df[selected_features], hue='Sex', diag_kind='kde', corner=True, 
             plot_kws={'alpha': 0.6, 's': 20}, height=2.5)
plt.suptitle('Pairplot of Key Features by Sex', y=1.01, fontsize=16)
plt.tight_layout()
plt.show()


## 9. Key Insights and Conclusions

Based on the exploratory data analysis above, here are the key findings:

### Dataset Characteristics:
- The dataset contains physical measurements of abalone specimens
- Target variable is the number of rings, which indicates age
- Features include physical dimensions (Length, Diameter, Height) and weights (Whole, Shucked, Viscera, Shell)
- Sex has three categories: M (Male), F (Female), and I (Infant)

### Key Observations:
1. **Missing Values**: [To be filled after running the analysis]

2. **Target Variable (Rings)**:
   - Distribution appears to be [describe after running]
   - Most common age ranges are [describe after running]

3. **Correlations**:
   - Strong correlations exist between physical dimensions and weights
   - Shell weight shows [strong/moderate/weak] correlation with age
   - [Other notable correlations]

4. **Sex Differences**:
   - Infants tend to have [fewer/more] rings on average
   - [Other sex-based patterns]

5. **Feature Relationships**:
   - Linear relationships exist between [features]
   - Non-linear patterns observed in [features]

### Modeling Implications:
- **Useful features**: Shell weight, Diameter, and other weight measurements show promise
- **Feature engineering**: May benefit from creating interaction terms or ratios
- **Multicollinearity**: High correlation between some features may require feature selection
- **Outliers**: [Assessment of outlier impact]
- **Model choice**: Regression models appropriate for continuous target (Rings)
