### Complete Dataset analysis

* The dataset to be analyzed is provided as a pandas DataFrame.
* It can be easily loaded from Object Storage using pandas methods in conjunction with `ocifs`.

* For each column you get: number of missing values, number of zeros, cardinality, `is_categorical` flag, and dtype.

In [1]:
import pandas as pd
from pandas.api.types import is_numeric_dtype

import os

In [2]:
# Dataset used for the example: a CSV file (orcl_attrition.csv) read directly
# over HTTPS from the Object Storage URL below.

URL = "https://objectstorage.us-ashburn-1.oraclecloud.com/n/bigdatadatasciencelarge/b/hosted-ds-datasets/o/synthetic%2Forcl_attrition.csv"

# Load the CSV into a pandas DataFrame; later cells analyze this frame.
data_orig_df = pd.read_csv(URL)

# Preview the first rows as the cell output.
data_orig_df.head()

Unnamed: 0,Age,Attrition,TravelForWork,SalaryLevel,JobFunction,CommuteLength,EducationalLevel,EducationField,Directs,EmployeeNumber,...,WeeklyWorkedHours,StockOptionLevel,YearsinIndustry,TrainingTimesLastYear,WorkLifeBalance,YearsOnJob,YearsAtCurrentLevel,YearsSinceLastPromotion,YearsWithCurrManager,name
0,42,Yes,infrequent,5054,Product Management,2,L2,Life Sciences,1,1,...,80,0,8,0,1,6,4,0,5,Tracy Moore
1,50,No,often,1278,Software Developer,9,L1,Life Sciences,1,2,...,80,1,10,3,3,10,7,1,7,Andrew Hoover
2,38,Yes,infrequent,6296,Software Developer,3,L2,Other,1,4,...,80,0,7,3,3,0,0,0,0,Julie Bell
3,34,No,often,6384,Software Developer,4,L4,Life Sciences,1,5,...,80,0,8,3,3,8,7,3,0,Thomas Adams
4,28,No,infrequent,2710,Software Developer,3,L1,Medical,1,7,...,80,1,6,3,3,2,2,2,2,Johnathan Burnett


In [3]:
# Quick overview of the loaded dataset: number of columns, the column names
# sorted alphabetically, and the number of records.
# Each message is terminated with an extra newline so that the three pieces
# of information are separated by an empty line in the output.
print(f"There are: {len(data_orig_df.columns)} columns in the dataset", end="\n\n")
print(
    "The list of column names, in alphabetical order:",
    sorted(data_orig_df.columns),
    end="\n\n",
)
print(f"There are {data_orig_df.shape[0]} records in the dataset", end="\n\n")

There are: 36 columns in the dataset

The list of column names, in alphabetical order: ['Age', 'Attrition', 'CommuteLength', 'Directs', 'EducationField', 'EducationalLevel', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobFunction', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'SalaryLevel', 'StockOptionLevel', 'TrainingTimesLastYear', 'TravelForWork', 'WeeklyWorkedHours', 'WorkLifeBalance', 'YearsAtCurrentLevel', 'YearsOnJob', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'YearsinIndustry', 'name']

There are 1470 records in the dataset



In [4]:
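# For each column we want to compute:
# - cardinality (number of distinct values)
# - current data type
# - number of missing values
# - number of zeros (numeric columns only)
# - whether it can be treated as categorical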

In [5]:
# parameters

# Default threshold, expressed as a fraction of the number of rows:
# a column with fewer than (n_rows * FRAC) distinct values is flagged as categorical.
FRAC = 0.1


def analyze_df(data_df, frac=None):
    """Build a per-column summary of a DataFrame.

    For every column of ``data_df`` the summary reports the number of missing
    values, the number of zeros (numeric columns only), the cardinality, a
    flag telling whether the column can be treated as categorical, and the
    current dtype.

    Parameters
    ----------
    data_df : pandas.DataFrame
        The dataset to analyze. It is not modified.
    frac : float, optional
        Fraction of the number of rows used as the cardinality threshold for
        the categorical flag. When omitted, the module-level ``FRAC`` is read
        at call time.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data_df``, in the same order, indexed
        0..n_cols-1, with the columns:

        * ``col_name``       - name of the analyzed column
        * ``missing_vals``   - number of missing (NaN/None/NaT) values
        * ``num_zeros``      - number of zeros as a string, or ``"-"`` for
          non-numeric columns
        * ``cardinality``    - number of distinct non-missing values
        * ``is_categorical`` - ``"Yes"`` if cardinality < n_rows * frac,
          otherwise ``"No"``
        * ``data_type``      - dtype of the column
    """
    if frac is None:
        frac = FRAC

    # isna() and isnull() are aliases of the same method: count missing values
    # only once, otherwise every count is doubled
    missing_val = data_df.isna().sum()

    # a column is considered categorical below this number of distinct values
    threshold = data_df.shape[0] * frac

    list_card = []
    list_cat = []
    list_dtypes = []
    list_num_zeros = []

    for col in data_df.columns:
        series = data_df[col]

        # cardinality: number of distinct values (missing values are not counted)
        n_distinct = series.nunique()
        list_card.append(n_distinct)

        # the categorical flag is decided only by the cardinality threshold
        list_cat.append("Yes" if n_distinct < threshold else "No")

        list_dtypes.append(series.dtype)

        if is_numeric_dtype(series):
            # values are rounded first, so floats that round to 0 (e.g. 0.2)
            # are counted as zeros too
            n_zeros = (series.round() == 0).sum()
            list_num_zeros.append(str(n_zeros))
        else:
            # counting zeros is meaningless for non-numeric columns
            list_num_zeros.append("-")

    # build the results DF; plain arrays/lists are used for every field so the
    # result gets a 0..n-1 index instead of the column names
    return pd.DataFrame(
        {
            "col_name": list(data_df.columns),
            "missing_vals": missing_val.to_numpy(),
            "num_zeros": list_num_zeros,
            "cardinality": list_card,
            "is_categorical": list_cat,
            "data_type": list_dtypes,
        }
    )

In [6]:
# Run the per-column analysis on the loaded dataset and keep the summary
# (one row per column of data_orig_df) for display in the next cell.
result_df = analyze_df(data_orig_df)

In [7]:
# Show the summary table as the cell output (one row per analyzed column).
result_df

Unnamed: 0,col_name,missing_vals,num_zeros,cardinality,is_categorical,data_type
0,Age,0,0,43,Yes,int64
1,Attrition,0,-,2,Yes,object
2,TravelForWork,0,-,3,Yes,object
3,SalaryLevel,0,0,886,No,int64
4,JobFunction,0,-,3,Yes,object
5,CommuteLength,0,0,29,Yes,int64
6,EducationalLevel,0,-,5,Yes,object
7,EducationField,0,-,6,Yes,object
8,Directs,0,0,1,Yes,int64
9,EmployeeNumber,0,0,1470,No,int64
