## Import Requisite Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Ensure Directory

In [2]:
from eda_toolkit import ensure_directory

import os  # operating-system helpers used to build the directory paths

# One level up from the current working directory
base_path = os.path.join(os.pardir)

# Input data and generated output sit in sibling folders one level up
data_path = os.path.join(os.pardir, "data")
data_output = os.path.join(os.pardir, "data_output")

# Image folders, one per file format
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run ensure_directory on every path, in the same order as listed above
for directory in (data_path, data_output, image_path_png, image_path_svg):
    ensure_directory(directory)

Directory exists: ../data
Directory exists: ../data_output
Directory exists: ../images/png_images
Directory exists: ../images/svg_images


## UCI ML Repository

In [3]:
from ucimlrepo import fetch_ucirepo

# Pull dataset id=2 from the UCI ML Repository
adult = fetch_ucirepo(id=2)

# Features and targets (as pandas dataframes)
X, y = adult.data.features, adult.data.targets

# Join features and targets on their index into a single frame
df = X.join(y, how="inner")

# Persist the combined frame in the data folder
adult_income_csv = os.path.join(data_path, "adult_income.csv")
df.to_csv(adult_income_csv)

In [12]:
from eda_toolkit import generate_table1

# Options for this Table 1 run, gathered in one place
table1_options = {
    "value_counts": True,
    "max_categories": 3,
    "export_markdown": True,
    "decimal_places": 0,
    "markdown_path": "table1_summary.md",
}

table1 = generate_table1(df=df, **table1_options)

table1

Unnamed: 0,Variable,Type,Mean,SD,Median,Min,Max,Mode,Missing (n),Missing (%),Count,Proportion (%)
0,age,Continuous,39.0,14.0,37.0,17.0,90.0,36,0,0,48842,100
1,capital-gain,Continuous,1079.0,7452.0,0.0,0.0,99999.0,0,0,0,48842,100
2,capital-loss,Continuous,88.0,403.0,0.0,0.0,4356.0,0,0,0,48842,100
3,education-num,Continuous,10.0,3.0,10.0,1.0,16.0,9,0,0,48842,100
4,fnlwgt,Continuous,189664.0,105604.0,178144.0,12285.0,1490400.0,203488,0,0,48842,100
5,hours-per-week,Continuous,40.0,12.0,40.0,1.0,99.0,40,0,0,48842,100
6,workclass = Private,Categorical,,,,,,Private,963,2,33906,69
7,workclass = Self-emp-not-inc,Categorical,,,,,,Private,963,2,3862,8
8,workclass = Local-gov,Categorical,,,,,,Private,963,2,3136,6
9,education = HS-grad,Categorical,,,,,,HS-grad,0,0,15784,32


In [None]:
from eda_toolkit import generate_table1

# Columns to summarise, split by how generate_table1 is told to treat them
categorical_columns = ["sex", "race", "workclass"]
continuous_columns = ["hours-per-week", "age", "education-num"]

table1 = generate_table1(
    df=df,
    categorical_cols=categorical_columns,
    continuous_cols=continuous_columns,
    value_counts=False,
    max_categories=3,
    decimal_places=0,
    export_markdown=True,
    markdown_path="table1_summary.md",
)

print(table1)

 Variable       | Type        | Mean | SD | Median | Min | Max | Mode    | Missing (n) | Missing (%) | Count  | Proportion (%) 
----------------|-------------|------|----|--------|-----|-----|---------|-------------|-------------|--------|----------------
 hours-per-week | Continuous  | 40   | 12 | 40     | 1   | 99  | 40      | 0           | 0           | 48,842 | 100            
 age            | Continuous  | 39   | 14 | 37     | 17  | 90  | 36      | 0           | 0           | 48,842 | 100            
 education-num  | Continuous  | 10   | 3  | 10     | 1   | 16  | 9       | 0           | 0           | 48,842 | 100            
 sex            | Categorical |      |    |        |     |     | Male    | 0           | 0           | 48,842 | 100            
 race           | Categorical |      |    |        |     |     | White   | 0           | 0           | 48,842 | 100            
 workclass      | Categorical |      |    |        |     |     | Private | 963         | 2           | 4

## Generate Table 1

In [None]:
from eda_toolkit import generate_table1

# Build Table 1 with value_counts=True; no Markdown export is requested in this call.
# NOTE(review): whether this returns a DataFrame alone or a (DataFrame, Markdown)
# pair is not visible from this cell — confirm against the eda_toolkit docs.
df1 = generate_table1(
    df,
    value_counts=True,
)

# Markdown-only call: export_markdown=True combined with return_markdown_only=True
# (presumably yields just the Markdown string rather than a tuple — TODO confirm)
md_only = generate_table1(df, export_markdown=True, return_markdown_only=True)

In [None]:
# Display the result of the value_counts=True call to generate_table1
df1

In [None]:
# Export the value-counts Table 1 as Markdown to a file in the working directory
table1_markdown_file = "table1_summary.md"

generate_table1(
    df=df,
    value_counts=True,
    export_markdown=True,
    markdown_path=table1_markdown_file,
)

In [None]:
# Same value_counts=True summary, restricted via include_types="continuous"
# (presumably keeps only the continuous variables — confirm against eda_toolkit docs)
df2 = generate_table1(
    df,
    value_counts=True,
    include_types="continuous",
)

df2

## Add IDs

In [None]:
from eda_toolkit import add_ids

# Add a column of unique IDs with 9 digits and call it "census_id".
# seed=111 is passed so the generated IDs are presumably reproducible, and
# set_as_index=True presumably makes the new column the index — confirm in the
# eda_toolkit docs.
# NOTE(review): df is reassigned from its own value, so re-running this cell
# feeds the already-modified frame back into add_ids.
df = add_ids(
    df=df,
    id_colname="census_id",
    num_digits=9,
    seed=111,
    set_as_index=True,
)

df.head()

In [None]:
# Report whether every index label occurs exactly once
print(
    "The index is unique."
    if df.index.is_unique
    else "The index is not unique."
)

## Trailing Period Removal

In [None]:
from eda_toolkit import strip_trailing_period

# Create a sample dataframe with trailing periods in some values.
# The last two entries are strings ending in "." so the demo actually has
# something to strip; an all-float column contains no trailing periods.
# NOTE(review): assumes strip_trailing_period accepts mixed float/string
# columns — confirm against the eda_toolkit documentation.
data = {
    "values": [1.0, 2.0, 3.0, 4.0, "5.", "6."],
}
df_trail = pd.DataFrame(data)

# Remove trailing periods from the 'values' column
df_trail = strip_trailing_period(df=df_trail, column_name="values")
df_trail

## Standardized Dates

In [None]:
from eda_toolkit import parse_date_with_rule

# Example inputs: the same kind of date written day-first and month-first
date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]

# Run every string through parse_date_with_rule, preserving order
standardized_dates = list(map(parse_date_with_rule, date_strings))

print(standardized_dates)

In [None]:
# Toy records: mixed-format date strings plus two unrelated columns
data = dict(
    date_column=[
        "31/12/2021",
        "01/01/2022",
        "12/31/2021",
        "13/02/2022",
        "07/04/2022",
    ],
    name=["Alice", "Bob", "Charlie", "David", "Eve"],
    amount=[100.0, 150.5, 200.75, 250.25, 300.0],
)

# Build the frame, then derive the parsed-date column from date_column
df_fake = pd.DataFrame(data).assign(
    standardized_date=lambda frame: frame["date_column"].apply(
        parse_date_with_rule
    )
)

print(df_fake)

## DataFrame Analysis

In [None]:
from eda_toolkit import dataframe_profiler

# Profile the full census frame.
# background_color="brown" presumably styles the rendered output — confirm
# against the eda_toolkit docs.
dataframe_profiler(df=df, background_color="brown")

## Binning Numerical Columns

In [None]:
# Age bin edges: 0, 18, then every tenth year from 30 through 100, plus an
# open-ended top edge
bin_ages = [0, 18, *range(30, 101, 10), float("inf")]

In [None]:
# One label per age bin: two hand-written lower bands, generated decade bands
# from 30-39 through 90-99, and an open-ended top band
label_ages = [
    "< 18",
    "18-29",
    *(f"{start}-{start + 9}" for start in range(30, 100, 10)),
    "100 +",
]

In [None]:
# Bucket each age into its labelled band; right=False makes every bin
# closed on the left and open on the right, e.g. [18, 30)
df["age_group"] = pd.cut(x=df["age"], bins=bin_ages, right=False, labels=label_ages)

## Generating Summary Tables for Variable Combinations

In [None]:
from eda_toolkit import summarize_all_combinations

# Columns whose combinations will be summarised
unique_vars = [
    "age_group", "workclass", "education", "occupation",
    "race", "sex", "income",
]

# Build the summary tables for the listed variables; the workbook name and
# output folder are handed to the function
combination_results = summarize_all_combinations(
    df=df,
    variables=unique_vars,
    data_path=data_output,
    data_name="census_summary_tables.xlsx",
)
summary_tables, all_combinations = combination_results

# Show which variable combinations were produced
print(all_combinations)

## Saving DataFrames to Excel with Customized Formatting

In [None]:
from eda_toolkit import save_dataframes_to_excel

# Example usage
file_name = "df_census.xlsx"  # Name of the output Excel file
file_path = os.path.join(data_path, file_name)

# Filter DataFrame to ages 18 through 40, inclusive on both ends, so the rows
# match the "ages_18_to_40" sheet name. A strict `> 18` / `< 40` comparison
# would silently drop everyone aged exactly 18 or 40.
filtered_df = df[df["age"].between(18, 40)]

# Sheet name -> frame written to that sheet
df_dict = {
    "original_df": df,
    "ages_18_to_40": filtered_df,
}

save_dataframes_to_excel(
    file_path=file_path,
    df_dict=df_dict,
    decimal_places=0,
)

## Creating Contingency Tables

In [None]:
from eda_toolkit import contingency_table

# Columns to cross-tabulate
contingency_cols = ["age_group", "workclass", "race", "sex"]

# Example usage; the call is the cell's last expression so its result is shown
contingency_table(df=df, cols=contingency_cols, sort_by=1)

## Highlighting Specific Columns in a DataFrame

In [None]:
from eda_toolkit import highlight_columns

# Highlight two columns on a preview (df.head()) of the frame
columns_to_highlight = ["age", "education"]
highlighted_df = highlight_columns(
    df=df.head(),
    columns=columns_to_highlight,
    color="brown",
)

highlighted_df