# Load and Inspect the Data

In [8]:
# notebooks/EDA.ipynb
# Import necessary libraries
import sys
sys.path.append('../src')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import custom modules
from data_loader import DataLoader
from eda import EDA

ModuleNotFoundError: No module named 'src'

In [None]:
# Load the data
data_loader = DataLoader('../data/raw/data.csv')
df = data_loader.load_data()

In [None]:
# Display the first few rows of the dataframe
df.head()

In [None]:
# Initialize the EDA class
eda = EDA(df)

In [None]:
# Overview of the data
overview = eda.overview()
print("Data Overview:\n", overview)

In [None]:
# Check data types
data_types = data_loader.check_data_types()
print("Data Types:\n", data_types)

# Check missing values
missing_values = data_loader.check_missing_values()
print("Missing Values:\n", missing_values)

In [None]:
# Summary Statistics
summary_stats = eda.summary_statistics()
print("Summary Statistics:\n", summary_stats)

In [None]:
# Distribution of Numerical Features
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    eda.plot_distribution(feature)

In [None]:
# Distribution of Categorical Features
categorical_features = df.select_dtypes(include=['object']).columns
for feature in categorical_features:
    eda.plot_categorical_distribution(feature)

In [None]:
# Correlation Analysis
eda.plot_correlation_matrix()

In [None]:
# Identifying Missing Values
missing_values = eda.check_missing_values()
print("Missing Values:\n", missing_values)

In [None]:
# Outlier Detection
for feature in numerical_features:
    eda.detect_outliers(feature)

In [None]:
# Clean missing values
df_cleaned = data_loader.clean_missing_values(strategy='mean')

# Verify cleaning
missing_values_after_cleaning = data_loader.check_missing_values()
print("Missing Values After Cleaning:\n", missing_values_after_cleaning)

In [None]:
# Perform EDA
eda = EDA(df_cleaned)

# Plot the distribution of a numerical feature (e.g., Amount)
eda.plot_distribution('Amount')

# Plot the correlation matrix
eda.plot_correlation_matrix()