# Project Name 

**Project Type -** EDA

**Contribution -** Individual

**Rakshit Pandey**

## Project Summary

**Write the summary here within 500-600 words**.


## Github Link

**Provide your Github Link here.**



## Problem Statement

**Write Problem Statement Here.**


## Business Context

**Write Business Context Here**

### Define Your Business Objective ?

**Answer Here**


## Dataset Description

**Dataset** :

**Data overview** :

## Let's Begin !

### Project Setup -- constants 

In [1]:
# Location of the raw dataset. The loader picks its reader from the file
# extension, so the path/URL must end in one of the supported extensions.
data_url = '' # (support raw files only) - extension - {csv, xlsx, json, parquet}
# NOTE(review): the two thresholds below are not referenced by the utility
# functions visible in this notebook (they use the literals 0.5 and 1.5) —
# confirm whether they are meant to be passed in.
skew_threshold = 0.5
outlier_threshold = 1.5

In [2]:
# NOTE(review): presumably a mapping used to rename columns; it is not read
# anywhere in the visible notebook — confirm intended use.
column_names = {}


### Prerequisite

In [3]:
# Library Installation

!pip install pandas numpy scipy scikit-learn matplotlib seaborn wordcloud --quiet

### Imports

In [102]:
# Importing necessary libraries

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations, including mathematical functions like mean, median, sqrt, etc.

import matplotlib.pyplot as plt  # For creating a wide range of visualizations, such as bar plots, histograms, scatter plots, etc.
import seaborn as sns  # For advanced statistical visualizations, including pairplots, violin plots, and heatmaps

from pathlib import Path




In [5]:
# Optional: Setting styles for plots
sns.set_style("darkgrid")

### Utilities

#### Visualization - Utility

In [74]:
# Chart visualization helpers
#
# Registry mapping a short chart-type key to the callable that draws it.
plot_functions = dict(
    scatter=sns.scatterplot,
    line=sns.lineplot,
    bar=sns.barplot,
    box=sns.boxplot,
    hist=sns.histplot,
    pie=plt.pie,
    count=sns.countplot,
    heatmap=sns.heatmap,
)


def visualize_chart(chart_objs, nrows=1, ncols=1, **kwargs):
    """
    Draw one or more charts on a single figure, one subplot per chart.

    Parameters:
    - chart_objs (list): List of dictionaries, one per subplot. Recognised keys:
        'plot_function' (callable, required), 'title' (str, required),
        'xlabel', 'ylabel', 'x', 'y', 'data', and 'kwargs' (dict of extra
        keyword arguments for this chart only). Each chart must supply at
        least one of 'x' or 'data'.
    - nrows (int): Number of rows for the subplot grid. Default is 1.
    - ncols (int): Number of columns for the subplot grid. Default is 1.
    - **kwargs: Keyword arguments passed to every plotting call; a chart's own
      'kwargs' take precedence over these.

    Returns:
    - fig (matplotlib.figure.Figure): The current figure after plotting.

    Raises:
    - ValueError: If any chart supplies neither 'x' nor 'data'.
    """
    # Validate up front so a bad entry does not leave a half-drawn figure behind.
    for chart in chart_objs:
        if chart.get('x') is None and chart.get('data') is None:
            raise ValueError("Please provide either value of data or x")

    width = 18 if ncols == 1 else ncols * 5.43
    height = 6 if nrows == 1 else nrows * 4

    plt.figure(figsize=(width, height))

    bar_like = (plot_functions['bar'], plot_functions['count'])
    heatmap = plot_functions['heatmap']

    for position, chart in enumerate(chart_objs, 1):
        plot_function = chart['plot_function']
        title = chart['title']
        xlabel = chart.get('xlabel')
        ylabel = chart.get('ylabel')
        x = chart.get('x')
        y = chart.get('y')
        data = chart.get('data')
        chart_kwargs = chart.get('kwargs') or {}

        # sns.heatmap has no `x` parameter (it takes the matrix as `data`), so
        # a matrix supplied under 'x' is forwarded as `data` instead.
        if plot_function is heatmap and data is None:
            data, x = x, None

        plt.subplot(nrows, ncols, position)

        # Start from the shared kwargs and forward only the inputs that were
        # actually provided; an explicit `x=None` would otherwise leak into
        # functions that do not accept it.
        plot_args = dict(kwargs)
        if data is not None:
            plot_args['data'] = data
        if x is not None:
            plot_args['x'] = x
        if y is not None:
            plot_args['y'] = y

        # Chart-specific options win over the shared ones.
        plot_args.update(chart_kwargs)

        ax = plot_function(**plot_args)

        plt.title(title, fontsize=16, pad=20)

        if xlabel is not None:
            plt.xlabel(xlabel)

        if ylabel is not None:
            plt.ylabel(ylabel)

        # Annotate each bar with its value for bar-style charts.
        if plot_function in bar_like:
            for container in ax.containers:
                ax.bar_label(container, fmt='%.2f')

    # Adjust layout for better readability
    plt.tight_layout()

    return plt.gcf()


#### DataFrame - Utility

In [7]:
def get_file_extension_from_path(input_path):
    """
    Return the lower-cased file extension (without the dot) of a path or URL.

    Args:
        input_path (str | Path): Local path or URL pointing at a file.

    Returns:
        str: The extension in lower case, or '' when there is none.
    """
    from urllib.parse import urlparse  # local import keeps this helper self-contained

    raw_location = str(input_path)
    parsed = urlparse(raw_location)

    # Only treat the input as a URL when it has both a scheme and a host; in
    # that case the query string and fragment are not part of the file name
    # (e.g. ".../data.csv?raw=true" must still resolve to "csv").
    is_url = bool(parsed.scheme and parsed.netloc)
    location = parsed.path if is_url else raw_location

    return Path(location).suffix.lstrip('.').lower()

In [8]:

def create_dataframe(data_url: str) -> pd.DataFrame:
    """
    Read raw data from a URL or file path into a DataFrame.

    The pandas reader is selected from the file extension of ``data_url``.

    Args:
        data_url (str): URL or file path of the raw data (e.g., GitHub raw file).
            Must end in one of: csv, xlsx, json, parquet.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the data.

    Raises:
        ValueError: If the extension of ``data_url`` is not supported.
        RuntimeError: If the selected pandas reader fails; the original
            exception is attached as the cause.
    """
    # Map file extensions to pandas readers
    read_functions = {
        "csv": pd.read_csv,
        "xlsx": pd.read_excel,
        "json": pd.read_json,
        "parquet": pd.read_parquet,
    }

    data_type = get_file_extension_from_path(data_url)

    reader = read_functions.get(data_type)
    if reader is None:
        supported = ", ".join(sorted(read_functions))
        raise ValueError(
            f"Unsupported data type: {data_type!r} (from {data_url!r}). "
            f"Supported types: {supported}"
        )

    try:
        return reader(data_url)
    except Exception as e:
        # Chain the cause so the underlying reader error stays in the traceback.
        raise RuntimeError(f"Failed to create DataFrame: {e}") from e


In [9]:
def analyze_dataset(df, exclude_columns=None):
    """
    Collect basic structural and statistical facts about a dataset.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        exclude_columns (list): Columns to leave out of the summary statistics (optional).
            Names that are not present are ignored.

    Returns:
        dict: Keys are
            'rows', 'columns'   - shape of the DataFrame,
            'missing_values'    - dict with 'total', 'percentage' and per-column 'details',
            'duplicate_rows'    - number of duplicated rows,
            'data_types'        - DataFrame with 'Column' / 'DataType',
            'statistics'        - transposed ``describe()`` of the numeric columns, rounded to 2 decimals
                                  (empty when there are no numeric columns),
            'observations'      - per-column note comparing mean and median.
    """
    results = {}

    # General dataset info
    results['rows'], results['columns'] = df.shape

    # Missing values
    missing_count = df.isnull().sum()
    total_missing = missing_count.sum()
    total_cells = results['rows'] * results['columns']
    results['missing_values'] = {
        'total': total_missing,
        # Guard against an empty frame, which would otherwise divide by zero.
        'percentage': (total_missing / total_cells) * 100 if total_cells else 0.0,
        'details': missing_count[missing_count > 0].to_dict(),
    }

    # Duplicate rows
    results['duplicate_rows'] = df.duplicated().sum()

    # Data types
    results['data_types'] = df.dtypes.reset_index()
    results['data_types'].columns = ['Column', 'DataType']

    # Exclude columns from analysis (if provided)
    numeric_df = df.select_dtypes(include=['number'])
    if exclude_columns:
        numeric_df = numeric_df.drop(columns=exclude_columns, errors='ignore')

    # Summary statistics. describe() raises on a frame without columns, so a
    # dataset with no numeric columns gets an empty statistics table instead.
    if numeric_df.shape[1]:
        statistics = numeric_df.describe().T
    else:
        statistics = pd.DataFrame(
            columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'],
            dtype=float,
        )
    results['statistics'] = statistics.round(2)  # Format statistics to 2 decimal points

    # Additional observations for each column (based on the rounded statistics)
    observations = {}
    for col, stats in results['statistics'].iterrows():
        mean = stats['mean']
        median = stats['50%']
        if pd.isna(mean) or pd.isna(median):
            # NaN compares False both ways and would be mislabelled as symmetric.
            observations[col] = "Not enough data to assess skewness (mean/median undefined)."
        elif mean < median:
            observations[col] = "Data is left-skewed (mean < median)."
        elif mean > median:
            observations[col] = "Data is right-skewed (mean > median)."
        else:
            observations[col] = "Data is symmetric (mean ≈ median)."

    results['observations'] = observations

    return results


#### Correlation Utility

In [10]:
def visualize_correlation_matrix(df, columns=None, correlation_matrix=None):
    """
    Render a correlation matrix as an annotated heatmap.

    Args:
        df (pd.DataFrame): Source data; only used when no matrix is supplied.
        columns (list, optional): Columns to correlate. When empty/None, all
                                  numeric columns of ``df`` are used.
        correlation_matrix (pd.DataFrame, optional): Precomputed matrix to draw.
                                                     When given, ``df`` and ``columns`` are not used.

    Returns:
        fig: Whatever ``visualize_chart`` returns for the single heatmap chart.
    """
    matrix = correlation_matrix
    if matrix is None:
        # No matrix supplied: derive it from the requested (or all numeric) columns.
        selected = columns if columns else df.select_dtypes(include=['number']).columns
        matrix = df[selected].corr(numeric_only=True)

    heatmap_options = {
        'annot': True,       # print the coefficient inside each cell
        'cmap': 'viridis',
        'fmt': '.2f',
        'linewidths': 0.7,   # thin separator between cells
        'vmin': -1,          # fix the colour scale to the full correlation range
        'vmax': 1,
    }

    heatmap_spec = {
        'plot_function': plot_functions['heatmap'],
        'title': 'Relationship between Variables: Correlation Matrix',
        'xlabel': 'Features',
        'ylabel': 'Features',
        'x': matrix,
        'kwargs': heatmap_options,
    }

    return visualize_chart([heatmap_spec], nrows=1, ncols=1)


In [11]:
def analyze_correlation_matrix(df, columns=None, threshold=None):
    """
    Compute, optionally filter, and plot the correlation matrix of a DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame to analyze.
        columns (list, optional): Columns to include. Non-numeric (and boolean) columns are
            dropped with a printed warning. If None, all numeric columns are used.
        threshold (float, optional): Keep only coefficients whose absolute value is at least
            this threshold; the rest are replaced by 0. If None, no filtering is applied.

    Returns:
        dict: 'correlation_matrix' (full matrix) and 'filtered_correlation_matrix'.

    Raises:
        ValueError: If no numeric columns remain.

    Side effects:
        Draws the full correlation matrix as a heatmap and calls ``plt.show()``.
    """
    if columns is not None:
        # Use pandas' dtype predicates rather than a fixed list of dtype names so
        # that int32/float32/nullable numeric columns are recognised as numeric.
        def _is_numeric(col):
            dtype = df[col].dtype
            return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

        non_numeric_columns = [col for col in columns if not _is_numeric(col)]

        if non_numeric_columns:
            print(f"Warning: The following non-numeric columns were excluded from correlation analysis: {', '.join(map(str, non_numeric_columns))}")

        columns = [col for col in columns if col not in non_numeric_columns]
    else:
        columns = df.select_dtypes(include='number').columns.tolist()

    if not columns:
        raise ValueError("No numeric columns available for correlation analysis.")

    # Compute the correlation matrix for the selected columns
    correlation_matrix = df[columns].corr()

    # Filter the correlation matrix based on the threshold
    if threshold is not None:
        filtered_matrix = correlation_matrix[correlation_matrix.abs() >= threshold].fillna(0)
    else:
        filtered_matrix = correlation_matrix  # No filtering applied if threshold is None

    visualize_correlation_matrix(df, correlation_matrix=correlation_matrix)
    plt.xticks(rotation=0)
    plt.show()

    # Return both the full and filtered correlation matrices
    return {
        "correlation_matrix": correlation_matrix,
        "filtered_correlation_matrix": filtered_matrix
    }


#### Skewness Utility

In [12]:
def calculate_skewness(df, columns=None):
    """
    Calculate skewness for specific columns or all numeric columns in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        columns (list): Column names for which skewness is calculated. Non-numeric
                        (and boolean) columns are skipped with a printed warning.
                        If None, all numeric columns are used.

    Returns:
        dict: A dictionary with column names as keys and skewness values as values.
    """
    if columns is None:
        # If no column names are provided, calculate skewness for all numeric columns
        numeric_columns = list(df.select_dtypes(include=['number']).columns)
    else:
        # Split the requested columns into numeric and non-numeric ones.
        numeric_columns = []
        non_numeric_columns = []
        for col in columns:
            dtype = df[col].dtype
            if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
                numeric_columns.append(col)
            else:
                non_numeric_columns.append(col)

        if non_numeric_columns:
            print(f"Warning: The following non-numeric columns were excluded from skewness analysis: {', '.join(map(str, non_numeric_columns))}")

    # Calculate skewness for the selected columns
    return {col: df[col].skew() for col in numeric_columns}


In [13]:
def visualize_skewness_with_chart(df, numeric_columns):
    """
    Plot one histogram (with KDE) per column, titled with the column's skewness.

    Args:
        df (pd.DataFrame): The DataFrame to analyze and visualize.
        numeric_columns (list): Names of the numeric columns to plot.

    Returns:
        fig: The figure returned by ``visualize_chart`` (three charts per row).
    """
    numeric_columns = list(numeric_columns)

    # Calculate skewness for each numeric column
    skewness = df[numeric_columns].skew().round(2)

    # One chart description per column
    chart_objs = [
        {
            'plot_function': plot_functions['hist'],
            'title': f'Skewness of {col}: {skewness[col]}',
            'xlabel': col,
            'ylabel': 'Frequency',
            'x': df[col],
            'kwargs': {'kde': True, 'color': 'purple', 'element': 'poly'},
        }
        for col in numeric_columns
    ]

    # Ceiling division gives exactly the rows needed; `len // 3 + 1` would add
    # a blank row whenever the column count is a multiple of 3.
    n_cols = 3
    n_rows = max(1, -(-len(numeric_columns) // n_cols))

    return visualize_chart(chart_objs, nrows=n_rows, ncols=n_cols)

# # Example: Visualize the skewness of all numeric columns in the DataFrame
# fig = visualize_skewness_with_chart(df, df.select_dtypes(include=['number']).columns)
# plt.show()




In [14]:
def analyze_skewness(df, columns=None):
    """
    Categorize numeric columns by skewness and plot their distributions.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        columns (list, optional): Columns to include. Non-numeric (and boolean) columns are
                                   dropped with a printed warning. If None, all numeric
                                   columns are considered.

    Returns:
        dict: Keys 'high_skew' (|skew| > 1), 'moderate_skew' (0.5 < |skew| <= 1),
              'low_skew' (|skew| <= 0.5) holding lists of column names, plus
              'skewness_values' mapping every analysed column to its skewness.
              A column whose skewness is NaN appears only in 'skewness_values'.

    Raises:
        ValueError: If no numeric columns remain.

    Side effects:
        Draws the distribution histograms via ``visualize_skewness_with_chart``.
    """
    # Determine the columns to analyze (use all numeric columns if no specific columns are provided)
    if columns is None:
        columns = df.select_dtypes(include="number").columns.tolist()

    # Use pandas' dtype predicates rather than a fixed list of dtype names so
    # that int32/float32/nullable numeric columns are not wrongly excluded.
    def _is_numeric(col):
        dtype = df[col].dtype
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    non_numeric_columns = [col for col in columns if not _is_numeric(col)]

    if non_numeric_columns:
        print(f"Warning: The following non-numeric columns were excluded from skewness analysis: {', '.join(map(str, non_numeric_columns))}")
        columns = [col for col in columns if col not in non_numeric_columns]

    # If there are no numeric columns to analyze, raise an error
    if not columns:
        raise ValueError("No numeric columns available in the dataset for skewness analysis.")

    # Calculate skewness for each numeric column
    skewness_values = df[columns].apply(lambda x: x.skew()).to_dict()

    # Categorize columns based on skewness values
    high_skew = [col for col, skew in skewness_values.items() if abs(skew) > 1]
    moderate_skew = [col for col, skew in skewness_values.items() if 0.5 < abs(skew) <= 1]
    low_skew = [col for col, skew in skewness_values.items() if abs(skew) <= 0.5]

    skewness_result = {
        "high_skew": high_skew,
        "moderate_skew": moderate_skew,
        "low_skew": low_skew,
        "skewness_values": skewness_values  # For detailed skewness values of each column
    }

    # Generate and visualize skewness distribution for the specified columns
    visualize_skewness_with_chart(df, columns)

    return skewness_result


#### DataFrame - Manipulation - Utility

##### Dtype - conversion _(changing column dtype)_

In [15]:
def transform_columns(df, columns, new_type):
    """
    Cast each listed column of ``df`` to ``new_type``, in place.

    A column that cannot be converted (or does not exist) is left untouched
    and the error is printed; the remaining columns are still processed.

    Args:
        df (pd.DataFrame): The DataFrame to transform (modified in place).
        columns (list): Names of the columns to convert.
        new_type (type): Target data type (e.g., int, float, str).

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in.
    """
    for name in columns:
        try:
            converted = df[name].astype(new_type)
            df[name] = converted
        except Exception as e:
            print(f"Error converting column '{name}' to {new_type}: {e}")
        else:
            print(f"Column '{name}' successfully converted to {new_type}.")
    return df

##### Column Imputation -- _(Handling missing values)_

In [16]:
def impute_column(col, method):
    """
    Impute missing values in a column based on the specified method.

    Args:
        col (pd.Series): The column to impute.
        method (str): The imputation method ('median', 'mean', or 'mode').

    Returns:
        pd.Series: A new Series with missing values filled. When the column has
        no non-null values there is nothing to derive a fill value from, so an
        unchanged copy is returned.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'median':
        return col.fillna(col.median())
    if method == 'mean':
        return col.fillna(col.mean())
    if method == 'mode':
        modes = col.mode()
        # mode() is empty for an all-null column; indexing it would raise.
        if modes.empty:
            return col.copy()
        return col.fillna(modes.iloc[0])
    raise ValueError(f"Unsupported imputation method: {method}")



In [17]:
def impute_missing(df, exclude_columns=None, skew_threshold=0.5):
    """
    Impute missing values column by column, in place.

    Numeric columns are filled with the median when their absolute skewness
    exceeds ``skew_threshold`` and with the mean otherwise. Every other column
    (object, category, datetime, boolean, ...) is filled with its mode.
    Columns without any non-null value are left untouched.

    Args:
        df (pd.DataFrame): The dataset to process (modified in place).
        exclude_columns (list): Columns to exclude from the imputation process.
        skew_threshold (float): Absolute skewness above which the median is
            preferred over the mean. Default is 0.5.

    Returns:
        pd.DataFrame: The same DataFrame object with missing values imputed.
    """
    # If no exclude_columns are provided, initialize as an empty list
    exclude_columns = exclude_columns or []

    for col in df.columns:
        if col in exclude_columns:
            continue

        missing = df[col].isnull()
        # Skip complete columns, and columns with nothing to impute from.
        if not missing.any() or missing.all():
            continue

        dtype = df[col].dtype

        # dtype predicates cover float32/int32/nullable numerics, which a
        # hard-coded list of dtype names would silently skip.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            skew = df[col].skew()
            # The median is robust to the long tail of a skewed distribution.
            method = 'median' if abs(skew) > skew_threshold else 'mean'
        else:
            method = 'mode'

        df[col] = impute_column(df[col], method)

    return df


#### Handling Outliers

In [18]:
def remove_outliers(df, col, lower, upper):
    """Return only the rows whose ``col`` value lies within [lower, upper]."""
    within_bounds = df[col].between(lower, upper)  # inclusive on both ends
    return df[within_bounds]

In [19]:
def flag_outliers(df, col, lower, upper):
    """Add a boolean ``<col>_outlier`` column marking values outside [lower, upper]; mutates and returns ``df``."""
    values = df[col]
    too_low = values.lt(lower)
    too_high = values.gt(upper)
    df[f"{col}_outlier"] = too_low | too_high
    return df

In [20]:
def transform_outliers(df, col, lower, upper):
    """Cap the values of ``col`` to the range [lower, upper]; mutates and returns ``df``."""
    capped = df[col].clip(lower, upper)
    df[col] = capped
    return df


In [21]:
def handle_outliers_in_data(df, columns=None, method='remove', threshold=1.5):
    """
    Detect and handle outliers in the DataFrame using the IQR method.

    Bounds for every column are computed from the original ``df``; the chosen
    handler is then applied, column after column, to a working copy.

    Args:
        df (pd.DataFrame): The DataFrame to check for outliers (not modified).
        columns (list, optional): Columns to check. If None or empty, all numeric columns are used.
            Names that are not in ``df`` are skipped with a printed warning.
        method (str): Method to handle outliers - 'remove', 'flag', or 'transform'.
        threshold (float): The IQR multiplier for detecting outliers.

    Returns:
        pd.DataFrame: DataFrame with outliers handled (removed, flagged, or transformed).
        pd.DataFrame: The rows of ``df`` flagged as outliers in at least one column,
            with duplicate rows dropped. Empty when no column was processed.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Map methods to functions
    method_actions = {
        'remove': remove_outliers,
        'flag': flag_outliers,
        'transform': transform_outliers,
    }

    # Reject an invalid method before doing any work
    if method not in method_actions:
        raise ValueError(f"Invalid method '{method}'. Choose from 'remove', 'flag', or 'transform'.")
    action = method_actions[method]

    # Select numeric columns if no specific columns are provided
    columns = columns or df.select_dtypes(include=["number"]).columns.tolist()

    # Work on a copy so the caller's DataFrame is left untouched
    df_result = df.copy()
    outliers_list = []

    for col in columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in the DataFrame. Skipping...")
            continue

        # Calculate IQR bounds
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Identify outliers
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(outliers)

        print(f"Outliers detected for '{col}': {len(outliers)}")

        df_result = action(df_result, col, lower_bound, upper_bound)

    # pd.concat raises on an empty list (no numeric columns, or every requested
    # column was skipped), so fall back to an empty frame with df's columns.
    if outliers_list:
        df_outliers = pd.concat(outliers_list).drop_duplicates()
    else:
        df_outliers = df.iloc[0:0]

    return df_result, df_outliers

#### Data Transformation - _(log, sqrt)_

In [22]:
def apply_transformation(df, column, transformation_type):
    """
    Return a transformed copy of one DataFrame column.

    Args:
        df (pd.DataFrame): The DataFrame holding the column.
        column (str): Name of the column to transform.
        transformation_type (str): One of 'log' (log1p), 'sqrt' (negatives are
                                   clipped to 0 first), 'square', or 'reciprocal'
                                   (zeros become NaN before dividing).

    Returns:
        pd.Series: The transformed values; ``df`` itself is not modified.

    Raises:
        ValueError: If the column is missing or the transformation is unknown.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    def _log(values):
        return np.log1p(values)

    def _sqrt(values):
        return np.sqrt(values.clip(lower=0))

    def _square(values):
        return np.square(values)

    def _reciprocal(values):
        return 1 / values.replace(0, np.nan)

    dispatch = {
        'log': _log,
        'sqrt': _sqrt,
        'square': _square,
        'reciprocal': _reciprocal,
    }

    if transformation_type not in dispatch:
        raise ValueError(f"Unsupported transformation type: '{transformation_type}'.")

    return dispatch[transformation_type](df[column])


In [23]:
def best_transformation_with_outliers(df, skew_categories, handle_outliers=False, outlier_threshold=1.5):
    """
    Transform skewed columns and optionally remove outliers afterwards.

    Columns listed under 'high_skew' get a log transformation and columns
    listed under 'moderate_skew' get a square-root transformation; any other
    key in ``skew_categories`` (e.g. 'low_skew') is ignored.

    Args:
        df (pd.DataFrame): The DataFrame containing numeric columns (not modified).
        skew_categories (dict): Dictionary with keys 'high_skew' and 'moderate_skew'
            mapping to lists of column names.
        handle_outliers (bool): Whether to run IQR-based outlier removal on the
            transformed columns after transformation.
        outlier_threshold (float): IQR multiplier used when ``handle_outliers`` is True.
            Default is 1.5.

    Returns:
        pd.DataFrame: The transformed DataFrame, when ``handle_outliers`` is False.
        tuple[pd.DataFrame, pd.DataFrame]: ``(cleaned_df, outliers_df)`` as returned by
            ``handle_outliers_in_data``, when ``handle_outliers`` is True.
    """
    transformed_df = df.copy()  # Create a copy to apply transformations

    # Transformation to apply for each skew category
    transformations = {
        'high_skew': 'log',
        'moderate_skew': 'sqrt'
    }

    transformed_col = []
    for skew_category, transformation_type in transformations.items():
        for col in skew_categories.get(skew_category, []):
            print(f"Applying {transformation_type} transformation to {col} due to {skew_category} skewness.")
            transformed_df[col] = apply_transformation(transformed_df, col, transformation_type)
            transformed_col.append(col)

    if not handle_outliers:
        return transformed_df

    # Made explicit: with outlier handling the result is a pair, which is
    # what callers that unpack two values rely on.
    cleaned_df, outliers_df = handle_outliers_in_data(
        transformed_df, transformed_col, threshold=outlier_threshold
    )
    return cleaned_df, outliers_df

#### Summary Generation -- _(Factory class to register and use summary related functions)_

In [24]:
# Define the Factory Class for Summaries
class SummaryFactory:
    """Registry of named summary steps that can be invoked by name."""

    def __init__(self):
        # step name -> callable producing that step's summary
        self.steps = {}

    def register_step(self, step_name, step_function):
        """
        Add (or replace) a summary step.

        Args:
            step_name (str): Key under which the step is stored.
            step_function (function): Callable invoked to build the summary.
        """
        self.steps.update({step_name: step_function})

    def generate_summary(self, step_name, *args, **kwargs):
        """
        Run a registered step and return whatever it produces.

        Args:
            step_name (str): Name of the step to run.
            *args: Positional arguments forwarded to the step function.
            **kwargs: Keyword arguments forwarded to the step function.

        Returns:
            The value returned by the step function.

        Raises:
            ValueError: If no step is registered under ``step_name``.
        """
        registry = self.steps
        if step_name in registry:
            return registry[step_name](*args, **kwargs)
        raise ValueError(f"Step '{step_name}' is not registered in the factory.")


In [25]:

# Step 1: Overview Function
def overview_step(result):
    """
    Build the plain-text overview section from dataset analysis results.

    Args:
        result (dict): Analysis results with the keys 'rows', 'columns',
            'missing_values', 'duplicate_rows', 'data_types' and 'statistics'.

    Returns:
        str: Multi-line overview (shape, missing values, duplicates, dtypes, statistics).
    """
    # Shape
    lines = [f"The dataset contains {result['rows']} rows and {result['columns']} columns.\n"]

    # Missing values
    missing = result['missing_values']
    per_column = missing['details']
    lines.append(f"There are {missing['total']} missing values across {len(per_column)} columns.")
    lines.append(f"Missing values account for {missing['percentage']:.2f}% of the dataset.")
    if per_column:
        lines.append("Columns with missing values and their counts:")
        lines.extend(f"  - {col}: {count} missing values" for col, count in per_column.items())
    lines.append("")  # spacer

    # Duplicate rows
    duplicate_total = result['duplicate_rows']
    lines.append(
        f"There are {duplicate_total} duplicate rows in the dataset."
        if duplicate_total > 0
        else "There are no duplicate rows in the dataset."
    )
    lines.append("")  # spacer

    # Data types, rendered as a table without the index column
    lines.append("Data Types:\n")
    lines.append(result['data_types'].to_string(index=False))
    lines.append("")  # spacer

    # Summary statistics, rendered as a table
    lines.append("Summary Statistics:\n")
    lines.append(result['statistics'].to_string())

    return "\n".join(lines)



In [26]:
# Step 2: Observations Function
def observations_step(result):
    """
    Format the per-column observations as a bulleted text block.

    Args:
        result (dict): Analysis results; the optional 'observations' entry maps
            a column name to its observation text.

    Returns:
        str: A header line followed by one bullet per observed column.
    """
    header = "Observations based on the dataset:\n"
    bullets = [
        f"  - {col}: {note}"
        for col, note in result.get('observations', {}).items()
    ]
    return "\n".join([header, *bullets])

In [27]:
def skewness_summary(skewness_object):
    """
    Generate a textual and tabular summary of skewness using the skewness object.

    Args:
        skewness_object (dict): Skewness analysis results. Recognised keys are
            'high_skew', 'moderate_skew', 'low_skew' (lists of column labels) and
            'skewness_values' (column label -> skewness). Missing or empty
            entries are omitted from the output.

    Returns:
        str: A formatted textual and tabular summary of the skewness analysis.
    """
    def _names(labels):
        # Column labels are not always strings (e.g. integer headers), and
        # str.join only accepts strings.
        return ', '.join(map(str, labels))

    category_headings = (
        ("high_skew", "- Highly skewed columns (suggesting log transformation): "),
        ("moderate_skew", "- Moderately skewed columns (suggesting square root transformation): "),
        ("low_skew", "- Columns with low skewness (no transformation needed): "),
    )

    summary = [
        heading + _names(skewness_object[key])
        for key, heading in category_headings
        if skewness_object.get(key)
    ]

    # Convert detailed skewness values into a table
    if skewness_object.get("skewness_values"):
        skewness_df = pd.DataFrame.from_dict(skewness_object["skewness_values"], orient="index", columns=["Skewness"])
        skewness_df.index.name = "Column"
        skewness_df.reset_index(inplace=True)
        summary.append("\nDetailed Skewness Values:\n")
        summary.append(skewness_df.to_string(index=False))

    return "\n".join(summary)


In [28]:
# Single shared factory used by the analysis cells to build text summaries.
summary_factory = SummaryFactory()

# Register steps: each name maps to the function that renders that section.
summary_factory.register_step("overview", overview_step)
summary_factory.register_step("observations", observations_step)
summary_factory.register_step("skewness", skewness_summary)

# print(summary_factory.generate_summary("overview", result)) # to generate summary

### Custom Utility

## Exploratory Data Analysis

### Data Preparation

#### Load the Netflix dataset (CSV file).

In [29]:
# Creating dataframe

df = create_dataframe(data_url)

#### First View

In [None]:
df.head()

In [None]:
df.tail()

In [32]:
# excluding non-numeric and columns that are not needed in descriptive summary function like df.describe(), correlation and skewness etc

exclude_columns = {'id'}

#### Inspect the structure of the dataset.

In [33]:
result = analyze_dataset(df, exclude_columns) 

#### Summary

In [None]:
print(summary_factory.generate_summary("overview", result))
print("\n" + "="*80 + "\n")
print(summary_factory.generate_summary("observations", result))

In [None]:
display(df.describe(include=['object']).T)

### Handling Missing Values

In [None]:
df.isnull().any()

**Handling Missing Values - Rating**

In [None]:
df.dtypes

### Correlation

In [None]:
# Keep the DataFrame's own column order; a set difference has no defined
# order, so the plots built from this list could change between runs.
filter_column = [col for col in df.columns if col not in exclude_columns]
filter_column

In [None]:
result = analyze_correlation_matrix(df, filter_column)

### Skewness

In [None]:
skewness = analyze_skewness(df, filter_column)

In [None]:
print(summary_factory.generate_summary("skewness", skewness))

### Outliers

In [None]:
# One boxplot per analysed column, laid out three per row.
chart_objs = [
    {
        'plot_function': plot_functions['box'],
        'title': f'Boxplot of {col}',  # Title for each individual column
        'xlabel': col,
        'ylabel': None,
        'x': df[col],
        'kwargs': {'color': 'purple'},
    }
    for col in filter_column
]

# Ceiling division gives exactly the rows needed; `len // 3 + 1` would add a
# blank row whenever the column count is a multiple of 3.
visualize_chart(chart_objs, nrows=max(1, -(-len(filter_column) // 3)), ncols=3)
plt.show()

In [None]:
df1, outlier_df = best_transformation_with_outliers(df, skewness, handle_outliers=True)

In [None]:
outlier_df.shape, df1.shape

## Analysis

## Outliers - observations

#### Inspect the structure of the dataset.

In [None]:
result = analyze_dataset(outlier_df, exclude_columns) 

#### Summary

In [None]:
print(summary_factory.generate_summary("overview", result))
print("\n" + "="*80 + "\n")
print(summary_factory.generate_summary("observations", result))

In [None]:
display(outlier_df.describe(include=['object']).T)

### Handling Missing Values

In [None]:
outlier_df.isnull().any()

**Handling Missing Values - Rating**

In [None]:
outlier_df.dtypes

### Correlation

In [None]:
# Keep the DataFrame's own column order; a set difference has no defined
# order, so the plots built from this list could change between runs.
filter_column = [col for col in outlier_df.columns if col not in exclude_columns]
filter_column

In [None]:
result = analyze_correlation_matrix(outlier_df, filter_column)

### Skewness

In [None]:
skewness = analyze_skewness(outlier_df, filter_column)

In [None]:
print(summary_factory.generate_summary("skewness", skewness))