## Import Requisite Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Ensure Directory

In [2]:
import os  # path handling for the project folders

from eda_toolkit import ensure_directory

# The notebook is run from a sub-folder, so every project folder is addressed
# relative to the parent directory.
base_path = os.pardir

# Input data and generated outputs sit side by side under the parent directory
data_path = os.path.join(base_path, "data")
data_output = os.path.join(base_path, "data_output")

# Image folders, one per export format
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Run ensure_directory on each folder, in the same order as listed above
for directory in (data_path, data_output, image_path_png, image_path_svg):
    ensure_directory(directory)

Directory exists: ../data
Directory exists: ../data_output
Directory exists: ../images/png_images
Directory exists: ../images/svg_images


## UCI ML Repository

In [3]:
from ucimlrepo import fetch_ucirepo

# Pull dataset id=2 from the UCI ML Repository
adult = fetch_ucirepo(id=2)

# Features and target(s) are exposed as separate pandas objects
X, y = adult.data.features, adult.data.targets

# Stitch the two back together on their shared index into one frame
df = X.join(y, how="inner")

# Persist a copy of the combined frame next to the other input data
adult_csv_path = os.path.join(data_path, "adult_income.csv")
df.to_csv(adult_csv_path)

In [4]:
# Display the combined frame (features joined with the income target).
# NOTE(review): in the output below, the last rows' income labels end with a
# period (e.g. "<=50K.") while the first rows do not ("<=50K") -- confirm
# whether the labels need normalising before grouping on income.
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [6]:
from eda_toolkit import generate_table1, table1_to_str

# Columns to summarise, split by how they are treated in the table
table1_categorical = ["sex", "race", "workclass"]
table1_continuous = ["hours-per-week", "age", "education-num"]

# Build the summary table and also export it as markdown
table1 = generate_table1(
    df=df,
    categorical_cols=table1_categorical,
    continuous_cols=table1_continuous,
    value_counts=True,
    max_categories=3,
    export_markdown=True,
    markdown_path="table1_summary.md",
)

# Render the table as padded plain text for the cell output
table1_text = table1_to_str(table1, padding=1)
print(table1_text)

 Variable           | Type        | Mean  | SD    | Median | Min | Max | Mode    | Missing (n) | Missing (%) | Count | Proportion (%) 
--------------------|-------------|-------|-------|--------|-----|-----|---------|-------------|-------------|-------|----------------
 hours-per-week     | Continuous  | 40.42 | 12.39 | 40.00  | 1   | 99  | 40      | 0           | 0.00        | 48842 | 100.00         
 age                | Continuous  | 38.64 | 13.71 | 37.00  | 17  | 90  | 36      | 0           | 0.00        | 48842 | 100.00         
 education-num      | Continuous  | 10.08 | 2.57  | 10.00  | 1   | 16  | 9       | 0           | 0.00        | 48842 | 100.00         
 sex = Male         | Categorical |       |       |        |     |     | Male    | 0           | 0.00        | 32650 | 66.85          
 sex = Female       | Categorical |       |       |        |     |     | Male    | 0           | 0.00        | 16192 | 33.15          
 race = White       | Categorical |       |       |    

In [None]:
# Import both helpers used below so this cell does not depend on an earlier
# cell having already imported `table1_to_str`.
from eda_toolkit import generate_table1, table1_to_str

# Summarise the selected categorical and continuous columns; the markdown
# export is written to "table1_summary.md".
table1 = generate_table1(
    df=df,
    categorical_cols=["sex", "race", "workclass"],
    continuous_cols=["hours-per-week", "age", "education-num"],
    value_counts=True,
    max_categories=3,
    export_markdown=True,
    markdown_path="table1_summary.md",
)

# Print the table as padded plain text
print(table1_to_str(table1, padding=1))

## Add Ids

In [None]:
from eda_toolkit import add_ids

# Settings for the generated identifier: a 9-digit "census_id" column,
# seeded, and used as the frame's index.
census_id_options = {
    "id_colname": "census_id",
    "num_digits": 9,
    "seed": 111,
    "set_as_index": True,
}

df = add_ids(df=df, **census_id_options)

# Preview the first rows with the new index
df.head()

In [None]:
# Report whether every index label in df occurs exactly once
print(
    "The index is unique."
    if df.index.is_unique
    else "The index is not unique."
)

## Trailing Period Removal

In [None]:
from eda_toolkit import strip_trailing_period

# Small demo frame with a single float column named "values"
df_trail = pd.DataFrame({"values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})

# Run the trailing-period cleanup on that column and show the result
df_trail = strip_trailing_period(
    df=df_trail,
    column_name="values",
)
df_trail

## Standardized Dates

In [None]:
from eda_toolkit import parse_date_with_rule

# Example inputs mixing day-first and month-first orderings
date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]

# Parse each string in order, collecting the results in a list
standardized_dates = list(map(parse_date_with_rule, date_strings))

print(standardized_dates)

In [None]:
# Toy records: a date string in mixed formats, a name and an amount per row
data = {
    "date_column": ["31/12/2021", "01/01/2022", "12/31/2021", "13/02/2022", "07/04/2022"],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
}

df_fake = pd.DataFrame(data)

# Parse every raw date string and store the result in a new column
raw_dates = df_fake["date_column"]
df_fake["standardized_date"] = raw_dates.apply(parse_date_with_rule)

print(df_fake)

## DataFrame Analysis

In [None]:
from eda_toolkit import dataframe_profiler

# Profile the census frame; "brown" is passed as the background colour
dataframe_profiler(
    df=df,
    background_color="brown",
)

## Binning Numerical Columns

In [None]:
# Age bin edges: 0, 18, then every decade from 30 through 100, with an
# open-ended final edge.
bin_ages = [0, 18, *range(30, 101, 10), float("inf")]

In [None]:
# One label per age bin: two hand-written labels, the decade ranges
# "30-39" through "90-99", and a final open-ended label.
label_ages = (
    ["< 18", "18-29"]
    + [f"{start}-{start + 9}" for start in range(30, 100, 10)]
    + ["100 +"]
)

In [None]:
# Bucket each age into its labelled bin. With right=False every bin includes
# its lower edge and excludes its upper edge.
age_values = df["age"]
df["age_group"] = pd.cut(age_values, bins=bin_ages, labels=label_ages, right=False)

## Generating Summary Tables for Variable Combinations

In [None]:
from eda_toolkit import summarize_all_combinations

# Variables whose combinations are summarised
unique_vars = ["age_group", "workclass", "education", "occupation", "race", "sex", "income"]

# Summarise every combination; results go to the named workbook under data_output
combination_results = summarize_all_combinations(
    df=df,
    data_path=data_output,
    variables=unique_vars,
    data_name="census_summary_tables.xlsx",
)
summary_tables, all_combinations = combination_results

# Show which variable combinations were generated
print(all_combinations)

## Saving DataFrames to Excel with Customized Formatting

In [None]:
from eda_toolkit import save_dataframes_to_excel

# Name and location of the output Excel file
file_name = "df_census.xlsx"
file_path = os.path.join(data_path, file_name)

# Keep respondents aged 18 through 40, both endpoints included, so the rows
# match the "ages_18_to_40" sheet name. The earlier strict `> 18` / `< 40`
# comparison silently dropped ages 18 and 40.
filtered_df = df[df["age"].between(18, 40)]

# Sheet name -> frame written to that sheet
df_dict = {
    "original_df": df,
    "ages_18_to_40": filtered_df,
}

save_dataframes_to_excel(
    file_path=file_path,
    df_dict=df_dict,
    decimal_places=0,
)

## Creating Contingency Tables

In [None]:
from eda_toolkit import contingency_table

# Columns to cross-tabulate
contingency_cols = ["age_group", "workclass", "race", "sex"]

# Build the table; it is the cell's last expression, so it is displayed
contingency_table(df=df, cols=contingency_cols, sort_by=1)

## Highlighting Specific Columns in a DataFrame

In [None]:
from eda_toolkit import highlight_columns

# Highlight two columns on a short preview of the census frame
preview_rows = df.head()
highlighted_df = highlight_columns(df=preview_rows, columns=["age", "education"], color="brown")

highlighted_df