### How to use it:

**Install Libraries:**

```bash
pip install pandas numpy matplotlib seaborn
```
**Save the Code:** Save the code as a Python file (e.g., `eda_script.py`).

**Run the Script:**

```bash
python eda_script.py
```

(Before running, make sure you replace `'your_file.csv'` in the script with the actual path to your CSV file.)

**Interpret the Output:** The script prints summary information to the console and generates plots that you can analyze to understand your data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _build_axes_grid(n_plots, cell_width, cell_height, max_cols=3):
    """Create a figure holding a grid of at least ``n_plots`` axes.

    Args:
        n_plots (int): Number of plots that must fit in the grid (>= 1).
        cell_width (float): Width in inches allotted to each grid column.
        cell_height (float): Height in inches allotted to each grid row.
        max_cols (int): Maximum number of grid columns. Defaults to 3.

    Returns:
        tuple: ``(fig, axes)`` where ``axes`` is a flat 1-D array of Axes.
    """
    n_cols = min(max_cols, n_plots)
    n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False guarantees a 2-D array even for a 1x1 grid; without it
    # plt.subplots returns a bare Axes object that has no .flatten().
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(n_cols * cell_width, n_rows * cell_height),
        squeeze=False,
    )
    return fig, axes.flatten()


def explore_eda(csv_file_path, sample_size=5, plot_numerical_distributions=True, plot_categorical_counts=True):
    """
    Performs Exploratory Data Analysis (EDA) on a CSV file.

    Prints an overview (shape, dtypes, missing values, duplicate rows, a
    random sample, descriptive statistics) to the console and optionally
    shows distribution, count and correlation plots.

    Args:
        csv_file_path (str): Path to the CSV file.
        sample_size (int): Number of rows to display in the sample. Capped at
            the number of rows in the file. Defaults to 5.
        plot_numerical_distributions (bool): Whether to plot histograms of numerical features. Defaults to True.
        plot_categorical_counts (bool): Whether to plot bar plots of categorical feature counts. Defaults to True.

    Returns:
        None. If the file cannot be loaded, an error message is printed and
        the function returns early.
    """

    # 1. Load the data (keep the try body limited to the call that can fail)
    try:
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file_path}")
        return
    except pd.errors.EmptyDataError:
        print(f"Error: The CSV file at {csv_file_path} is empty.")
        return
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    print(f"Successfully loaded data from: {csv_file_path}\n")

    # 2. Basic Information

    print("----- DATASET OVERVIEW -----")
    print(f"Shape: {df.shape}")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")
    print("\n")

    print("----- DATA TYPES -----")
    print(df.dtypes)
    print("\n")

    print("----- MISSING VALUES -----")
    print(df.isnull().sum())
    print("\n")

    print("----- DUPLICATED VALUES -----")
    print(f"Number of duplicate rows: {df.duplicated().sum()}")
    print("\n")

    print("----- SAMPLE OF DATA -----")
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the number of available rows.
    print(df.sample(min(sample_size, len(df))))
    print("\n")

    # 3. Descriptive Statistics

    print("----- DESCRIPTIVE STATISTICS (Numerical) -----")
    print(df.describe())
    print("\n")

    # Include object (string) columns in the descriptive statistics
    print("----- DESCRIPTIVE STATISTICS (Categorical) -----")
    # describe(include=['object']) raises ValueError when the frame has no
    # object columns (e.g. an all-numeric CSV), so check first.
    if df.select_dtypes(include=['object']).shape[1] > 0:
        print(df.describe(include=['object']))
    else:
        print("No object (string) columns to describe.")
    print("\n")

    # 4. Feature Exploration and Visualization

    numerical_features = df.select_dtypes(include=np.number).columns.tolist()
    categorical_features = df.select_dtypes(exclude=np.number).columns.tolist()

    print("Numerical Features:", numerical_features)
    print("Categorical Features:", categorical_features)
    print("\n")

    if plot_numerical_distributions:
        print("----- Numerical Feature Distributions -----")
        if numerical_features:
            fig, axes = _build_axes_grid(len(numerical_features), 6, 4)
            for feature, ax in zip(numerical_features, axes):
                sns.histplot(data=df, x=feature, ax=ax, kde=True)  # kde overlays a density estimate
                ax.set_title(f"Distribution of {feature}")

            # Remove the grid cells that were not needed
            for ax in axes[len(numerical_features):]:
                fig.delaxes(ax)

            plt.tight_layout()  # Adjust layout to prevent overlapping titles
            plt.show()
        else:
            print("No numerical features to plot distributions for.")
        print("\n")

    if plot_categorical_counts:
        print("----- Categorical Feature Counts -----")
        if categorical_features:
            fig, axes = _build_axes_grid(len(categorical_features), 8, 6)
            for feature, ax in zip(categorical_features, axes):
                counts = df[feature].value_counts()
                sns.barplot(x=counts.index, y=counts.values, ax=ax)
                ax.set_title(f"Count of {feature}")
                ax.tick_params(axis='x', rotation=45)  # rotate labels for readability

            # Remove the grid cells that were not needed
            for ax in axes[len(categorical_features):]:
                fig.delaxes(ax)

            plt.tight_layout()
            plt.show()
        else:
            print("No categorical features to plot counts for.")
        print("\n")

    # 5.  Correlation Analysis (for numerical features)

    if len(numerical_features) > 1:  # Correlation needs at least two numerical features
        print("----- CORRELATION MATRIX -----")
        correlation_matrix = df[numerical_features].corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
        plt.title("Correlation Matrix of Numerical Features")
        plt.show()
        print("\n")
    else:
        print("Not enough numerical features to calculate correlation matrix.")

    # 6.  Further Analysis Ideas (add as needed)

    # - Bivariate Analysis (scatter plots, box plots, etc.)
    # - Outlier Detection and Handling
    # - Feature Engineering
    # - Grouped analysis (e.g., average sales by region)
    # - Time series analysis (if data includes dates/times)

    print("----- EDA Completed -----")

# Script entry point: runs the EDA when this file is executed directly.
if __name__ == "__main__":
    # Placeholder path -- point this at your own CSV file before running.
    csv_path = 'your_file.csv'
    explore_eda(csv_path)