<a href="https://colab.research.google.com/github/mdpw/msc-ai-cw/blob/main/full_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [None]:
# 1. Read the garment defect CSV into a DataFrame
# Point file_path at the CSV's real location if it is not in the working directory.
file_path = "garment_defect_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 2. Basic information: dimensions, column dtypes / non-null counts, sample rows
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())

In [None]:




# 3. Summary statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print("\nDescriptive Statistics:")
print(summary_stats)

# 4. Count of null entries in each column
null_counts = df.isna().sum()
print("\nMissing Values:")
print(null_counts)

# 5. Rows per IsDefective class, shown as a table and as a bar chart
target_counts = df['IsDefective'].value_counts()
print("\nTarget Distribution:")
print(target_counts)

count_ax = sns.countplot(data=df, x='IsDefective')
count_ax.set_title('IsDefective Distribution')
plt.show()

# 6. Pairwise correlations between the numeric columns
# numeric_df and corr_matrix stay module-level: later steps read numeric_df.
numeric_df = df.select_dtypes(include=np.number)
corr_matrix = numeric_df.corr()

print("\nCorrelation Matrix:")
print(corr_matrix)

# Draw the same matrix as an annotated heatmap (cell values to two decimals)
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# 7. Histogram grid with one panel per numeric column
numeric_cols = numeric_df.columns
hist_options = {'bins': 20, 'figsize': (15, 12), 'edgecolor': 'black'}
df.loc[:, numeric_cols].hist(**hist_options)
plt.suptitle('Feature Distributions', fontsize=16)
plt.show()

# 8. Pairplot (optional - can be slow for large datasets)
# The hue column is appended explicitly below, so remove it from the numeric
# candidates first. Otherwise a numeric IsDefective that falls within the first
# five numeric columns would be selected twice, handing seaborn a frame with
# duplicate column names and an ambiguous hue column.
pairplot_features = [col for col in numeric_cols if col != 'IsDefective'][:5]
sns.pairplot(df[pairplot_features + ['IsDefective']], hue='IsDefective')
plt.show()

# 9. Outlier detection (Boxplots): one figure per numeric feature, split by class
for col in numeric_cols:
    # IsDefective is the grouping variable; plotting it against itself
    # produces a degenerate figure, so skip it when it is numeric.
    if col == 'IsDefective':
        continue
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x='IsDefective', y=col)
    plt.title(f'Boxplot of {col} by IsDefective')
    plt.show()

# 10. Convert DataFrame to PyTorch tensors for model input
# Separate features and labels
feature_df = df.drop(columns=['IsDefective'])

# torch.tensor() cannot convert an object-dtype array, which is what a frame
# containing any text/categorical column turns into. Keep only the numeric
# feature columns and report the ones left out so they can be encoded
# explicitly if they are needed. With an all-numeric dataset nothing is skipped.
model_features = feature_df.select_dtypes(include=[np.number])
skipped_cols = [c for c in feature_df.columns if c not in model_features.columns]
if skipped_cols:
    print("\nSkipped non-numeric feature columns:", skipped_cols)

X = model_features.to_numpy()
y = df['IsDefective'].to_numpy()

# Convert to tensors (float32 for both features and labels)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

print("\nPyTorch Tensor Shapes:")
print("X_tensor:", X_tensor.shape)
print("y_tensor:", y_tensor.shape)
