### Complete Dataset analysis

* the dataset to be analyzed is provided as a Pandas DataFrame
* it can be easily loaded with pandas methods, used in conjunction with ocifs when the data lives in Object Storage

* for each column you get: number of missing values, number of zeros, cardinality, is_categorical, dtype and some more information

In [1]:
import pandas as pd
from pandas.api.types import is_numeric_dtype

import os
import requests

### First dataset: Employee attrition, from Oracle

In [2]:
# Dataset used for this example: a CSV read over HTTPS from Oracle Object Storage
# (the object name is URL-encoded: "synthetic%2Forcl_attrition.csv" is "synthetic/orcl_attrition.csv")

URL = "https://objectstorage.us-ashburn-1.oraclecloud.com/n/bigdatadatasciencelarge/b/hosted-ds-datasets/o/synthetic%2Forcl_attrition.csv"

# read_csv accepts the URL directly; this needs network access every time the cell is run
data_orig_df = pd.read_csv(URL)

# last expression of the cell: displays the first rows as a preview
data_orig_df.head()

Unnamed: 0,Age,Attrition,TravelForWork,SalaryLevel,JobFunction,CommuteLength,EducationalLevel,EducationField,Directs,EmployeeNumber,...,WeeklyWorkedHours,StockOptionLevel,YearsinIndustry,TrainingTimesLastYear,WorkLifeBalance,YearsOnJob,YearsAtCurrentLevel,YearsSinceLastPromotion,YearsWithCurrManager,name
0,42,Yes,infrequent,5054,Product Management,2,L2,Life Sciences,1,1,...,80,0,8,0,1,6,4,0,5,Tracy Moore
1,50,No,often,1278,Software Developer,9,L1,Life Sciences,1,2,...,80,1,10,3,3,10,7,1,7,Andrew Hoover
2,38,Yes,infrequent,6296,Software Developer,3,L2,Other,1,4,...,80,0,7,3,3,0,0,0,0,Julie Bell
3,34,No,often,6384,Software Developer,4,L4,Life Sciences,1,5,...,80,0,8,3,3,8,7,3,0,Thomas Adams
4,28,No,infrequent,2710,Software Developer,3,L1,Medical,1,7,...,80,1,6,3,3,2,2,2,2,Johnathan Burnett


In [3]:
def get_general_info(data_df):
    """Print a short overview of a DataFrame.

    Three facts are written to stdout, each followed by an empty line:
    the number of columns, the column names in sorted order and the
    number of records (rows).

    Parameters
    ----------
    data_df : pandas.DataFrame
        The dataset to describe.

    Returns
    -------
    None
    """
    n_records, n_cols = data_df.shape

    # sort a copy of the names; the DataFrame itself is not modified
    names = list(data_df.columns)
    names.sort()

    # end="\n\n" terminates the line and adds the separating empty line
    print(f"There are: {n_cols} columns in the dataset", end="\n\n")
    print("The list of column names, in alphabetical order:", names, end="\n\n")
    print(f"There are {n_records} records in the dataset", end="\n\n")

In [4]:
# For each column of a DataFrame we report:
# - the number of missing values
# - the number of zeros (numeric columns only)
# - the cardinality (number of distinct values)
# - whether it can be treated as categorical
# - the current datatype

# parameters

# A column is flagged as categorical when its number of distinct values is
# below this fraction of the number of rows
FRAC = 0.1


def analyze_df(data_df, frac=None):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        The dataset to profile.
    frac : float, optional
        Fraction of the row count used as the categorical threshold: a column
        is reported as categorical when it has fewer distinct values than
        ``rows * frac``. When omitted, the module-level ``FRAC`` is read at
        call time.

    Returns
    -------
    pandas.DataFrame
        One row per input column, in input order and with a fresh 0..n-1
        index. Columns:

        - ``col_name``: the column label
        - ``missing_vals``: number of missing (NA) values
        - ``num_zeros``: number of values equal to zero, as a string,
          or ``"-"`` for non-numeric columns
        - ``cardinality``: number of distinct non-missing values
        - ``is_categorical``: ``"Yes"`` or ``"No"``
        - ``data_type``: the column dtype
    """
    if frac is None:
        frac = FRAC

    # a column with fewer distinct values than this is considered categorical
    threshold = data_df.shape[0] * frac

    list_names = []
    list_missing = []
    list_num_zeros = []
    list_card = []
    list_cat = []
    list_dtypes = []

    # items() walks the columns by position, so duplicated column labels still
    # yield one Series each (data_df[label] would return a DataFrame for them)
    for col_name, series in data_df.items():
        list_names.append(col_name)

        # it is ok to use isna, isnull is an alias of isna
        list_missing.append(int(series.isna().sum()))

        # count the number of distinct values (missing values are not counted)
        n_distinct = series.nunique()
        list_card.append(n_distinct)
        list_cat.append("Yes" if n_distinct < threshold else "No")

        list_dtypes.append(series.dtype)

        if is_numeric_dtype(series):
            # Exact comparison: a float 0.0 already compares equal to 0.
            # Rounding first would also count every value in [-0.5, 0.5]
            # (0.4, -0.3, ...) as a zero.
            list_num_zeros.append(str((series == 0).sum()))
        else:
            list_num_zeros.append("-")

    # plain lists only, so the result gets a default RangeIndex
    # and the column labels do not end up as the index
    return pd.DataFrame(
        {
            "col_name": list_names,
            "missing_vals": list_missing,
            "num_zeros": list_num_zeros,
            "cardinality": list_card,
            "is_categorical": list_cat,
            "data_type": list_dtypes,
        }
    )

### Analyze the first dataset

In [5]:
# print the number of columns, the sorted column names and the number of records
get_general_info(data_orig_df)

There are: 36 columns in the dataset

The list of column names, in alphabetical order: ['Age', 'Attrition', 'CommuteLength', 'Directs', 'EducationField', 'EducationalLevel', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobFunction', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'SalaryLevel', 'StockOptionLevel', 'TrainingTimesLastYear', 'TravelForWork', 'WeeklyWorkedHours', 'WorkLifeBalance', 'YearsAtCurrentLevel', 'YearsOnJob', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'YearsinIndustry', 'name']

There are 1470 records in the dataset



In [6]:
# run the per-column analysis; analyze_df returns a DataFrame and, being the
# last expression of the cell, it is displayed as the cell output
analyze_df(data_orig_df)

Unnamed: 0,col_name,missing_vals,num_zeros,cardinality,is_categorical,data_type
0,Age,0,0,43,Yes,int64
1,Attrition,0,-,2,Yes,object
2,TravelForWork,0,-,3,Yes,object
3,SalaryLevel,0,0,886,No,int64
4,JobFunction,0,-,3,Yes,object
5,CommuteLength,0,0,29,Yes,int64
6,EducationalLevel,0,-,5,Yes,object
7,EducationField,0,-,6,Yes,object
8,Directs,0,0,1,Yes,int64
9,EmployeeNumber,0,0,1470,No,int64


### Another example: UCI Adult Dataset

In [7]:
def load_dataset(path, urls, timeout=60):
    """Download every URL in ``urls`` into the directory ``path``.

    Each file is saved under the last path component of its URL; an existing
    file with the same name is overwritten.

    Parameters
    ----------
    path : str
        Target directory. It is created, together with any missing parent
        directories, when it does not exist yet.
    urls : iterable of str
        The URLs to download.
    timeout : float, optional
        Seconds to wait for the server before giving up; passed to
        ``requests.get``. Without a timeout a stalled connection would
        block the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        When a server answers with an error status (4xx/5xx), so that an
        error page is never saved as if it were the dataset.
    """
    # makedirs + exist_ok works for nested paths and for an already existing directory
    os.makedirs(path, exist_ok=True)

    for url in urls:
        response = requests.get(url, timeout=timeout)
        # fail loudly instead of writing the body of an error response to disk
        response.raise_for_status()

        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(response.content)

In [8]:
# Files to fetch from the UCI repository: adult.data, adult.names and adult.test
urls = ["https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]

# download the files into the local 'data' directory (needs network access)
load_dataset('data', urls)

In [9]:
# Column names to assign to the fields of adult.data (passed to read_csv via `names`)
columns = ["age", "workClass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship",
           "race", "sex", "capital-gain", "capital-loss",
           "hours-per-week", "native-country", "income"]

# The separator is a regular expression (a comma with any surrounding spaces),
# which only the python parser engine supports. Requesting that engine
# explicitly is what pandas falls back to anyway, but avoids the ParserWarning.
# Fields equal to "?" are read as missing values.
train_data = pd.read_csv('data/adult.data', names=columns,
                         sep=' *, *', na_values="?", engine="python")

  return func(*args, **kwargs)


In [10]:
# print the number of columns, the sorted column names and the number of records
get_general_info(train_data)

There are: 15 columns in the dataset

The list of column names, in alphabetical order: ['age', 'capital-gain', 'capital-loss', 'education', 'education-num', 'fnlwgt', 'hours-per-week', 'income', 'marital-status', 'native-country', 'occupation', 'race', 'relationship', 'sex', 'workClass']

There are 32561 records in the dataset



In [11]:
# run the per-column analysis on the Adult data; the returned DataFrame is
# displayed as the cell output
analyze_df(train_data)

Unnamed: 0,col_name,missing_vals,num_zeros,cardinality,is_categorical,data_type
0,age,0,0,73,Yes,int64
1,workClass,1836,-,8,Yes,object
2,fnlwgt,0,0,21648,No,int64
3,education,0,-,16,Yes,object
4,education-num,0,0,16,Yes,int64
5,marital-status,0,-,7,Yes,object
6,occupation,1843,-,14,Yes,object
7,relationship,0,-,6,Yes,object
8,race,0,-,5,Yes,object
9,sex,0,-,2,Yes,object
