# Data Exploration Template

This notebook provides a template for exploring new datasets.

**Dataset:** [Replace with your dataset name]
**Date:** [Replace with current date]
**Author:** [Your name]

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plotting
%matplotlib inline
plt.style.use('default')
sns.set_palette("husl")

# Display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

## 2. Load Data

In [None]:
# Load your dataset here
# Example: df = pd.read_csv('../data/raw/your_dataset.csv')

# Placeholder data: a seeded synthetic table that stands in for a real dataset
np.random.seed(42)
n_samples = 1000

# The columns are generated one by one, in this order, so the seeded random
# stream yields the same values on every run.
sample_columns = {}
sample_columns['id'] = range(1, n_samples + 1)
sample_columns['age'] = np.random.randint(18, 65, n_samples)
sample_columns['income'] = np.random.normal(50000, 15000, n_samples)
sample_columns['score'] = np.random.uniform(0, 100, n_samples)
sample_columns['category'] = np.random.choice(['A', 'B', 'C', 'D'], n_samples)
sample_columns['is_active'] = np.random.choice([True, False], n_samples, p=[0.7, 0.3])
df = pd.DataFrame(sample_columns)

# Blank out 'income' on a random 5% of the rows to simulate missing data
n_missing = int(0.05 * n_samples)
missing_indices = np.random.choice(df.index, size=n_missing, replace=False)
df.loc[missing_indices, 'income'] = np.nan

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

## 3. Initial Data Exploration

In [None]:
# Structural overview of the frame: dimensions, column names, dtypes, memory use
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

overview_sections = (
    ("\n=== DATA TYPES ===", df.dtypes),
    ("\n=== MEMORY USAGE ===", df.memory_usage(deep=True)),  # deep=True measures object contents too
)
for section_heading, section_report in overview_sections:
    print(section_heading)
    print(section_report)

In [None]:
# Peek at both ends of the frame (head/tail default to 5 rows each)
row_previews = (
    ("=== FIRST 5 ROWS ===", df.head()),
    ("\n=== LAST 5 ROWS ===", df.tail()),
)
for preview_heading, preview_frame in row_previews:
    print(preview_heading)
    display(preview_frame)

In [None]:
# Descriptive statistics for every column; include='all' covers non-numeric columns as well
print("=== STATISTICAL SUMMARY ===")
summary_stats = df.describe(include='all')
display(summary_stats)

## 4. Data Quality Assessment

In [None]:
# Per-column count and share of missing entries, listing only the affected columns
print("=== MISSING VALUES ===")
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)

missing_df = (
    pd.DataFrame({
        'Missing Count': missing_data,
        'Missing Percentage': missing_percent
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

# Both columns always exist, so the table is empty exactly when no column has gaps
if missing_df.empty:
    print("No missing values found!")
else:
    display(missing_df)

In [None]:
# Fully duplicated rows: the mask flags every repeat after the first occurrence
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("=== DUPLICATE ROWS ===")
print(f"Number of duplicate rows: {duplicates}")

if duplicates > 0:
    print("\nDuplicate rows:")
    display(df[duplicate_mask])

In [None]:
# Cardinality of every column; the distinct values themselves are listed for
# text (object) columns and for any column with fewer than 10 distinct values.
print("=== UNIQUE VALUES PER COLUMN ===")
for col in df.columns:
    unique_count = df[col].nunique()
    print(f"{col}: {unique_count} unique values")

    # Show unique values for categorical columns
    if df[col].dtype == 'object' or unique_count < 10:
        # nunique() does not count NaN, so drop it here as well to keep the
        # list consistent with the count. NaN also cannot be ordered against
        # strings, which would make the sort below raise TypeError.
        distinct_values = df[col].dropna().unique().tolist()
        try:
            distinct_values.sort()
        except TypeError:
            # Mixed types in one column (e.g. str and int): order by text form instead
            distinct_values.sort(key=str)
        print(f"  Values: {distinct_values}")
    print()

## 5. Data Visualization

In [None]:
# Histograms for up to the first four numeric columns on a fixed 2x2 grid
numerical_cols = df.select_dtypes(include=[np.number]).columns

if len(numerical_cols) == 0:
    print("No numerical columns found for distribution plots.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.ravel()

    # zip stops at the shorter input, so at most one histogram is drawn per panel
    for panel, col in zip(axes, numerical_cols[:4]):
        df[col].hist(bins=30, ax=panel, alpha=0.7)
        panel.set_title(f'Distribution of {col}')
        panel.set_xlabel(col)
        panel.set_ylabel('Frequency')

    # Hide the panels left over when there are fewer than four numeric columns
    for unused_panel in axes[len(numerical_cols):]:
        unused_panel.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation heatmap; only meaningful with two or more numeric columns
if len(numerical_cols) <= 1:
    print("Need at least 2 numerical columns for correlation analysis.")
else:
    plt.figure(figsize=(10, 8))
    correlation_matrix = df[numerical_cols].corr()

    # center=0 pins the midpoint of the diverging colormap to zero correlation
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
    )
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

In [None]:
# Bar charts of value counts for up to the first three categorical columns.
# 'category' is included alongside 'object' and 'bool': columns with the pandas
# categorical dtype are not numeric either, so they would otherwise be skipped
# by every section of this notebook.
categorical_cols = df.select_dtypes(include=['object', 'bool', 'category']).columns

if len(categorical_cols) > 0:
    n_panels = min(len(categorical_cols), 3)
    # squeeze=False always returns a 2-D array of Axes, even for a single
    # panel, so no special case is needed for the one-column situation.
    fig, axes = plt.subplots(1, n_panels, figsize=(15, 5), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols[:n_panels]):
        value_counts = df[col].value_counts()
        value_counts.plot(kind='bar', ax=ax, alpha=0.7)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("No categorical columns found for visualization.")

## 6. Outlier Detection

In [None]:
# Outlier screening: box plots for up to four numeric columns, followed by an
# IQR-based count for every numeric column.
if len(numerical_cols) == 0:
    print("No numerical columns found for outlier detection.")
else:
    n_panels = min(len(numerical_cols), 4)
    fig, axes = plt.subplots(1, n_panels, figsize=(16, 4))
    if len(numerical_cols) == 1:
        # A 1x1 grid yields a bare Axes object; wrap it so it can be iterated
        axes = [axes]

    for panel, col in zip(axes, numerical_cols[:4]):
        df.boxplot(column=col, ax=panel)
        panel.set_title(f'Box Plot: {col}')

    plt.tight_layout()
    plt.show()

    # Statistical outlier detection using IQR
    print("=== OUTLIER DETECTION (IQR Method) ===")
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        # Fences sit 1.5 * IQR outside the quartiles
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        below_fence = df[col] < lower_bound
        above_fence = df[col] > upper_bound
        outliers = df[below_fence | above_fence]
        outlier_count = len(outliers)
        # The share is taken over all rows of the frame
        outlier_percentage = (outlier_count / len(df)) * 100

        print(f"{col}: {outlier_count} outliers ({outlier_percentage:.2f}%)")
        if outlier_count > 0:
            print(f"  Range: [{lower_bound:.2f}, {upper_bound:.2f}]")
            if outlier_count > 10:
                # Long lists are cut to the ten smallest values
                print(f"  Outlier values: {sorted(outliers[col].values)[:10]}...")
            else:
                print(f"  Outlier values: {sorted(outliers[col].values)}")
        print()

## 7. Summary and Next Steps

In [None]:
# Closing report: headline figures, a fixed checklist, and data-driven recommendations
total_missing = df.isnull().sum().sum()
duplicate_row_count = df.duplicated().sum()

print("=== DATASET SUMMARY ===")
print(f"Dataset shape: {df.shape}")
print(f"Numerical columns: {len(numerical_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")
print(f"Missing values: {total_missing}")
print(f"Duplicate rows: {duplicate_row_count}")

print("\n=== NEXT STEPS ===")
next_steps = (
    "1. Handle missing values if any",
    "2. Remove or investigate outliers",
    "3. Feature engineering if needed",
    "4. Prepare data for modeling",
    "5. Save cleaned data to '../data/processed/'",
)
for step in next_steps:
    print(step)

print("\n=== RECOMMENDED ACTIONS ===")
# Each recommendation is printed only when its condition holds for this dataset
recommendations = (
    (total_missing > 0, "• Address missing values"),
    (duplicate_row_count > 0, "• Remove duplicate rows"),
    (len(numerical_cols) > 0, "• Consider scaling numerical features"),
    (len(categorical_cols) > 0, "• Encode categorical variables if needed"),
)
for applies, recommendation in recommendations:
    if applies:
        print(recommendation)

print("\nExploration complete! 🎉")

## 8. Save Processed Data (Optional)

In [None]:
# Optional export step. The save logic below is deliberately left commented
# out, so this cell writes nothing to disk unless you enable it; as written,
# the only statement that runs is the final print.
# Uncomment the lines below to save your processed data

# # Create a copy for processing
# df_processed = df.copy()

# # Apply any cleaning steps here
# # Example: df_processed = df_processed.dropna()
# # Example: df_processed = df_processed.drop_duplicates()

# # Save to processed data folder
# import os
# os.makedirs('../data/processed', exist_ok=True)
# df_processed.to_csv('../data/processed/cleaned_dataset.csv', index=False)
# print("Processed data saved to '../data/processed/cleaned_dataset.csv'")

print("Data processing template ready for your customization!")