## Import Requisite Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Ensure Directory

In [2]:
from eda_toolkit import ensure_directory

import os  # import operating system for dir
import sys

from sklearn.metrics import roc_auc_score

# The notebook sits one level below the project root ('notebooks'), so every
# location used here is resolved relative to the parent directory.
base_path = os.pardir

# Input data and generated data outputs
data_path = os.path.join(base_path, "data")
data_output = os.path.join(base_path, "data_output")

# Image outputs, split by file format
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Ensure each of the four directories exists before anything is written to it
for directory in (data_path, data_output, image_path_png, image_path_svg):
    ensure_directory(directory)

Created directory: ../data
Created directory: ../data_output
Created directory: ../images/png_images
Created directory: ../images/svg_images


## UCI ML Repository

In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id=2 from the UCI ML Repository
adult = fetch_ucirepo(id=2)

# Feature and target tables (pandas DataFrames)
X, y = adult.data.features, adult.data.targets

# Join features and targets on their shared index into one DataFrame
df = X.join(y, how="inner")

# Keep a CSV copy of the combined data in the data directory
df.to_csv(os.path.join(data_path, "adult_income.csv"))

## Add Ids

In [4]:
from eda_toolkit import add_ids

# Attach a 9-digit unique identifier named "census_id" to every row and use
# it as the index. The seed is fixed (presumably so the generated IDs are
# reproducible between runs — confirm against the add_ids documentation).
df = add_ids(df=df, id_colname="census_id", num_digits=9, seed=111, set_as_index=True)

df.head()

The DataFrame index is unique.


Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
census_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
582248222,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
561810758,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
598098459,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
776705221,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
479262902,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Report whether the freshly assigned index contains any duplicate IDs
print("The index is unique." if df.index.is_unique else "The index is not unique.")

The index is unique.


## Trailing Period Removal

In [6]:
from eda_toolkit import strip_trailing_period

# Single-column sample frame used to demonstrate strip_trailing_period.
# NOTE(review): these are plain floats, so no value actually carries a
# trailing period here — confirm whether inputs such as "1." were intended.
data = {"values": [float(n) for n in range(1, 7)]}
df_trail = pd.DataFrame(data)

# Run the cleaner over the 'values' column and display the result
df_trail = strip_trailing_period(df=df_trail, column_name="values")
df_trail

Unnamed: 0,values
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,6.0


## Standardized Dates

In [7]:
from eda_toolkit import parse_date_with_rule

# Date strings mixing day-first and month-first layouts
date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]

# Run every string through parse_date_with_rule, preserving order
standardized_dates = list(map(parse_date_with_rule, date_strings))

print(standardized_dates)

['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']


In [8]:
# Toy records whose date strings mix day-first and month-first layouts
data = {
    "date_column": [
        "31/12/2021", "01/01/2022", "12/31/2021", "13/02/2022", "07/04/2022",
    ],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
}

# Build the frame and append a column holding the parsed form of each date
df_fake = pd.DataFrame(data).assign(
    standardized_date=lambda frame: frame["date_column"].apply(parse_date_with_rule)
)

print(df_fake)

  date_column     name  amount standardized_date
0  31/12/2021    Alice  100.00        2021-12-31
1  01/01/2022      Bob  150.50        2022-01-01
2  12/31/2021  Charlie  200.75        2021-12-31
3  13/02/2022    David  250.25        2022-02-13
4  07/04/2022      Eve  300.00        2022-04-07


## DataFrame Analysis

In [9]:
from eda_toolkit import dataframe_columns

# Per-column overview of the census frame (dtype, null counts, unique values);
# left as the cell's last expression so the styled table is rendered.
dataframe_columns(
    df=df,
    background_color="brown",
)

Shape:  (48842, 15) 



Processing columns: 100%|██████████| 15/15 [00:00<00:00, 74.38it/s]


Total seconds of processing time: 0.351102






column,dtype,null_total,null_pct,unique_values_total,max_unique_value,max_unique_value_total,max_unique_value_pct
age,int64,0,0.0,74,36,1348,2.76
workclass,object,963,1.97,9,Private,33906,69.42
fnlwgt,int64,0,0.0,28523,203488,21,0.04
education,object,0,0.0,16,HS-grad,15784,32.32
education-num,int64,0,0.0,16,9,15784,32.32
marital-status,object,0,0.0,7,Married-civ-spouse,22379,45.82
occupation,object,966,1.98,15,Prof-specialty,6172,12.64
relationship,object,0,0.0,6,Husband,19716,40.37
race,object,0,0.0,5,White,41762,85.5
sex,object,0,0.0,2,Male,32650,66.85


## Binning Numerical Columns

In [10]:
# Age bin edges: 0 and 18, then every decade boundary from 30 through 100,
# and an open-ended upper edge so any age of 100 or more still lands in a bin.
bin_ages = [0, 18, *range(30, 101, 10), float("inf")]

In [11]:
# One label per age bin (ten in total): the two irregular leading groups,
# the decade groups "30-39" through "90-99", then the open-ended top group.
label_ages = (
    ["< 18", "18-29"]
    + [f"{start}-{start + 9}" for start in range(30, 100, 10)]
    + ["100 +"]
)

In [12]:
# Bucket every age into its labelled group. right=False makes each interval
# closed on the left and open on the right, e.g. [18, 30) -> "18-29".
df["age_group"] = pd.cut(x=df["age"], bins=bin_ages, labels=label_ages, right=False)

## Generating Summary Tables for Variable Combinations

In [13]:
from eda_toolkit import summarize_all_combinations

# Columns to be summarized in combination with one another
unique_vars = [
    "age_group", "workclass", "education", "occupation",
    "race", "sex", "income",
]

# Build one summary table per combination of the variables above and write
# them to a single Excel workbook in the data output directory
summary_tables, all_combinations = summarize_all_combinations(
    df=df,
    variables=unique_vars,
    data_path=data_output,
    data_name="census_summary_tables.xlsx",
)

# Show which variable combinations were generated
print(all_combinations)

Generating combinations: 100%|██████████| 120/120 [00:01<00:00, 76.56it/s]
Writing summary tables: 100%|██████████| 120/120 [00:41<00:00,  2.87it/s]
Finalizing Excel file: 100%|██████████| 1/1 [00:00<00:00, 13706.88it/s]

Data saved to ../data_output/census_summary_tables.xlsx
[('age_group', 'workclass'), ('age_group', 'education'), ('age_group', 'occupation'), ('age_group', 'race'), ('age_group', 'sex'), ('age_group', 'income'), ('workclass', 'education'), ('workclass', 'occupation'), ('workclass', 'race'), ('workclass', 'sex'), ('workclass', 'income'), ('education', 'occupation'), ('education', 'race'), ('education', 'sex'), ('education', 'income'), ('occupation', 'race'), ('occupation', 'sex'), ('occupation', 'income'), ('race', 'sex'), ('race', 'income'), ('sex', 'income'), ('age_group', 'workclass', 'education'), ('age_group', 'workclass', 'occupation'), ('age_group', 'workclass', 'race'), ('age_group', 'workclass', 'sex'), ('age_group', 'workclass', 'income'), ('age_group', 'education', 'occupation'), ('age_group', 'education', 'race'), ('age_group', 'education', 'sex'), ('age_group', 'education', 'income'), ('age_group', 'occupation', 'race'), ('age_group', 'occupation', 'sex'), ('age_group', 'oc




## Saving DataFrames to Excel with Customized Formatting

In [14]:
from eda_toolkit import save_dataframes_to_excel

# Destination workbook inside the data directory
file_name = "df_census.xlsx"
file_path = os.path.join(data_path, file_name)

# Keep respondents aged 18 through 40. Series.between is inclusive on both
# ends by default, so ages 18 and 40 themselves are retained, matching the
# "ages_18_to_40" label used for this subset below.
filtered_df = df[df["age"].between(18, 40)]

# Label -> DataFrame mapping handed to save_dataframes_to_excel
# (labels are presumably used as sheet names — confirm in the eda_toolkit docs)
df_dict = {
    "original_df": df,
    "ages_18_to_40": filtered_df,
}

save_dataframes_to_excel(
    file_path=file_path,
    df_dict=df_dict,
    decimal_places=0,
)

Saving DataFrames: 100%|██████████| 2/2 [00:08<00:00,  4.34s/it]


DataFrames saved to ../data/df_census.xlsx


## Creating Contingency Tables

In [15]:
from eda_toolkit import contingency_table

# Tabulate counts and percentages for each observed combination of the four
# demographic columns. With sort_by=1 the rendered table appears ordered by
# Total, largest first (see the output below).
contingency_table(df=df, cols=["age_group", "workclass", "race", "sex"], sort_by=1)

Unnamed: 0,age_group,workclass,race,sex,Total,Percentage
0,30-39,Private,White,Male,5856,11.99
1,18-29,Private,White,Male,5623,11.51
2,40-49,Private,White,Male,4267,8.74
3,18-29,Private,White,Female,3680,7.53
4,50-59,Private,White,Male,2565,5.25
...,...,...,...,...,...,...
467,60-69,Local-gov,Asian-Pac-Islander,Female,1,0.00
468,70-79,State-gov,Black,Female,1,0.00
469,70-79,State-gov,Asian-Pac-Islander,Female,1,0.00
470,50-59,Federal-gov,Other,Male,1,0.00


## Highlighting Specific Columns in a DataFrame

In [16]:
from eda_toolkit import highlight_columns

# Style the first five rows, marking the "age" and "education" columns brown
highlighted_df = highlight_columns(
    df=df.head(), columns=["age", "education"], color="brown"
)

# Last expression of the cell, so the styled frame is rendered
highlighted_df

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_group
census_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
582248222,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,30-39
561810758,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,50-59
598098459,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,30-39
776705221,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,50-59
479262902,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,18-29
