In [1]:
import pandas as pd
import numpy as np

### Data Loading

In [2]:
adult_data = pd.read_csv("/home/mike/datasets/archive (1)/adult.data.csv")

In [3]:
adult_data.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


#### Display column names

In [4]:
adult_data.columns

Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')

#### Rename Columns

In [5]:
adult_data.rename(columns = {"39":"Age", ' Not-in-family':'Relationship', " White":"Race", " Male":"Gender", " Never-married":"Marital-status", 
                             " United-States":"Native-country", " 77516":"fnlwgt", " State-gov":"workclass", 
                             " Bachelors":"Education", " 13":"Education-num", " Adm-clerical":"Ocupation", " 2174":"Capital-gain", " 0":"capital-loss", 
                            " 40":"Hours-per-week", " <=50K":"Label"},
                  inplace = True)

In [6]:
adult_data.columns

Index(['Age', 'workclass', 'fnlwgt', 'Education', 'Education-num',
       'Marital-status', 'Ocupation', 'Relationship', 'Race', 'Gender',
       'Capital-gain', 'capital-loss', 'Hours-per-week', 'Native-country',
       'Label'],
      dtype='object')

#### Get data type info

In [7]:
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32560 non-null  int64 
 1   workclass       32560 non-null  object
 2   fnlwgt          32560 non-null  int64 
 3   Education       32560 non-null  object
 4   Education-num   32560 non-null  int64 
 5   Marital-status  32560 non-null  object
 6   Ocupation       32560 non-null  object
 7   Relationship    32560 non-null  object
 8   Race            32560 non-null  object
 9   Gender          32560 non-null  object
 10  Capital-gain    32560 non-null  int64 
 11  capital-loss    32560 non-null  int64 
 12  Hours-per-week  32560 non-null  int64 
 13  Native-country  32560 non-null  object
 14  Label           32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


#### Convert Label column data values

In [8]:
def convert_label(label):
    if label == "<=50K":
        label = "below"
    else:
        label = "above"
    return label

In [9]:
adult_data.Label = adult_data.Label.apply(convert_label)

#### Convert Categorical data type

In [10]:
categorical_types = ["workclass","Education", "Marital-status", "Ocupation", "Race", "Gender", "Relationship", "Native-country", "Label"]

In [11]:
adult_data[categorical_types].head()

Unnamed: 0,workclass,Education,Marital-status,Ocupation,Race,Gender,Relationship,Native-country,Label
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,Husband,United-States,above
1,Private,HS-grad,Divorced,Handlers-cleaners,White,Male,Not-in-family,United-States,above
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Black,Male,Husband,United-States,above
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Black,Female,Wife,Cuba,above
4,Private,Masters,Married-civ-spouse,Exec-managerial,White,Female,Wife,United-States,above


In [12]:
adult_data[categorical_types] = adult_data[categorical_types].astype('category')

In [13]:
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Age             32560 non-null  int64   
 1   workclass       32560 non-null  category
 2   fnlwgt          32560 non-null  int64   
 3   Education       32560 non-null  category
 4   Education-num   32560 non-null  int64   
 5   Marital-status  32560 non-null  category
 6   Ocupation       32560 non-null  category
 7   Relationship    32560 non-null  category
 8   Race            32560 non-null  category
 9   Gender          32560 non-null  category
 10  Capital-gain    32560 non-null  int64   
 11  capital-loss    32560 non-null  int64   
 12  Hours-per-week  32560 non-null  int64   
 13  Native-country  32560 non-null  category
 14  Label           32560 non-null  category
dtypes: category(9), int64(6)
memory usage: 1.8 MB


#### Check for null values

In [14]:
adult_data.isna().sum()

Age               0
workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital-status    0
Ocupation         0
Relationship      0
Race              0
Gender            0
Capital-gain      0
capital-loss      0
Hours-per-week    0
Native-country    0
Label             0
dtype: int64