# Data Exploration: Validation Test

**Created:** 2026-01-16 17:24

## Objective

Explore and analyze the dataset to understand its structure, quality, and characteristics.

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Notebook-wide settings: silence warnings, widen pandas output, unify plot look.
warnings.filterwarnings(action='ignore')

# pandas: print every column, but cap printed rows at 100
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Plotting: seaborn grid theme plus a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports complete")

## 2. Load Data

In [None]:
# Read the CSV into `df`; on a missing file, report it and leave `df` as None.
# TODO: Update path to your data file
file_path = 'data/dataset.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Later cells dereference `df`, so they will fail until the path is fixed.
    print(f"✗ File not found: {file_path}")
    df = None
else:
    # shape is (rows, columns), matching len(df) and len(df.columns)
    print(f"✓ Loaded {df.shape[0]:,} rows and {df.shape[1]} columns")

## 3. Dataset Overview

In [None]:
# Preview the top of the table (5 rows, the head() default); as the cell's
# last expression, the result is rendered as the cell output.
df.head(n=5)

In [None]:
# Overview: dimensions, deep memory footprint in MB, and a tally of column dtypes
print("Dataset Information:", "-" * 50, sep="\n")
print(f"Shape: {df.shape}")

# deep=True also counts the contents of object columns, not just the pointers
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory Usage: {memory_mb:.2f} MB")

print("\nColumn Types:", df.dtypes.value_counts(), sep="\n")

## 4. Data Quality Checks

In [None]:
# Per-column null counts and their share of all rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# One row per column, highest null share first
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Percentage': missing_pct}
).sort_values(by='Percentage', ascending=False)

# Report only the columns that actually contain nulls
print("Missing Values:")
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

In [None]:
# Check for duplicates
# duplicated() flags each row that repeats an earlier row across all columns,
# so the sum is the number of redundant rows (first occurrences are not counted).
duplicates = df.duplicated().sum()

# Guard the share calculation: a header-only file gives len(df) == 0, and the
# resulting 0/0 would otherwise surface as "nan%" (or a ZeroDivisionError).
duplicate_pct = duplicates / len(df) * 100 if len(df) else 0.0
print(f"Duplicate rows: {duplicates:,} ({duplicate_pct:.2f}%)")

## 5. Statistical Summary

In [None]:
# Numerical columns summary
# describe() is called with its defaults; as the cell's last expression, the
# resulting summary table is rendered as the cell output.
# NOTE(review): with default arguments pandas summarizes only numeric columns
# when the frame has any — confirm this matches the dataset being explored.
df.describe()

In [None]:
# Profile the object-dtype columns: distinct-value count for each, plus the
# most frequent values when the column has fewer than 20 distinct values.
categorical_cols = df.select_dtypes(include=['object']).columns

if len(categorical_cols):
    print("Categorical Columns:")

for col in categorical_cols:
    values = df[col]
    unique_count = values.nunique()
    print(f"\n{col}:\n  Unique values: {unique_count}")

    # High-cardinality columns get the count only
    if unique_count >= 20:
        continue

    # At most the 10 most frequent values are listed
    top_counts = values.value_counts().head(10)
    print(f"  Value counts:\n{top_counts}")

## 6. Visualizations

In [None]:
# Histogram grid: one panel per numeric column, at most four panels per row
numerical_cols = df.select_dtypes(include=[np.number]).columns
n_numeric = len(numerical_cols)

if n_numeric > 0:
    n_cols = min(n_numeric, 4)
    n_rows = -(-n_numeric // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(16, 4 * n_rows),
    )
    # subplots() hands back a bare Axes for a 1x1 grid and an array otherwise
    axes = [axes] if n_numeric == 1 else axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        df[col].hist(bins=30, ax=ax, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Blank out the unused panels at the end of the grid
    for spare_ax in axes[n_numeric:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Relies on `numerical_cols` computed in the histogram cell; skipped unless
# there are at least two numeric columns to compare.
if len(numerical_cols) >= 2:
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,  # anchor the diverging palette at zero correlation
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(12, 8))
    correlation = df[numerical_cols].corr()
    sns.heatmap(correlation, **heatmap_options)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

## 7. Key Insights

### Observations
- TODO: Add your observations

### Data Quality Issues
- TODO: Note any data quality problems

### Next Steps
- TODO: List next steps for analysis