In [12]:
#import basic libraries for cleaning the data
import pandas as pd
import numpy as np

def clean_an_adult_data_dataset(dataset_file_name: str) -> None:
    """Clean one raw Adult dataset file and save the result as a CSV.

    The file is read as header-less comma-separated data using the 15 Adult
    column names. Surrounding whitespace is stripped from every text column
    (``salary`` included), ``?`` placeholders become NaN, a per-column
    missing-value report is printed, rows containing any NaN are dropped and
    the remaining rows are written to ``<dataset_file_name>.csv`` without
    the index.

    Args:
        dataset_file_name: Path of the raw file to clean. The cleaned copy
            is written next to it with ``.csv`` appended to the name.
    """
    print("---------------------")
    print(f"Cleaning {dataset_file_name} dataset...")

    # Loading adult data (the raw file has no header row, so names are supplied)
    adult_dataset_cols = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "salary",
    ]
    adult_data = pd.read_csv(dataset_file_name, names=adult_dataset_cols)

    # 1. Remove surrounding spaces from string columns.
    #    This must run BEFORE salary is turned into a categorical: a
    #    categorical column is neither object- nor string-typed, so it would
    #    be skipped here and keep labels such as " <=50K".
    for col in adult_data.columns:
        column = adult_data[col]
        if (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            adult_data[col] = column.str.strip()

    # 2. Replace ? by NaN values (only matches now that the values are stripped)
    adult_data = adult_data.replace({'?': np.nan})

    # 3. Convert salary into a categorical, using the already-cleaned labels
    adult_data["salary"] = adult_data["salary"].astype("category")

    # Finding NAs in all columns
    num_rows_adult_data = adult_data.shape[0]
    print(f"In a total of {num_rows_adult_data}")
    for col, quantity_of_na in adult_data.isna().sum().items():
        # Print only columns with NAs
        if quantity_of_na:
            percent_of_na = round((quantity_of_na * 100) / num_rows_adult_data, 2)
            print(f"{col} has {quantity_of_na} NAs which is {percent_of_na}% of its rows.")

    # Removing all NA rows since it's a very small percentage of a large dataset
    adult_data_without_NAs = adult_data.dropna()
    num_rows_adult_data = adult_data_without_NAs.shape[0]
    print(f"There are {num_rows_adult_data} rows left after removing NAs")

    # Creating CSV out of dataframe; the f-string also accepts path-like names
    adult_data_without_NAs.to_csv(f"{dataset_file_name}.csv", index=False)
    print("---------------------")
# Run the cleaning step on each raw Adult file, in the same order as before.
for raw_file_name in ("adult.data", "adult.test"):
    clean_an_adult_data_dataset(raw_file_name)


---------------------
Cleaning adult.data dataset...
In a total of 32561
workclass has 1836 NAs which is 5.64% of its rows.
occupation has 1843 NAs which is 5.66% of its rows.
native-country has 583 NAs which is 1.79% of its rows.
There are 30162 rows left after removing NAs
---------------------
---------------------
Cleaning adult.test dataset...
In a total of 16282
workclass has 964 NAs which is 5.92% of its rows.
fnlwgt has 1 NAs which is 0.01% of its rows.
education has 1 NAs which is 0.01% of its rows.
education-num has 1 NAs which is 0.01% of its rows.
marital-status has 1 NAs which is 0.01% of its rows.
occupation has 967 NAs which is 5.94% of its rows.
relationship has 1 NAs which is 0.01% of its rows.
race has 1 NAs which is 0.01% of its rows.
sex has 1 NAs which is 0.01% of its rows.
capital-gain has 1 NAs which is 0.01% of its rows.
capital-loss has 1 NAs which is 0.01% of its rows.
hours-per-week has 1 NAs which is 0.01% of its rows.
native-country has 275 NAs which is 1.6