### Imports

In [None]:
import pandas as pd
import numpy as np

### Definindo DataFrame

In [None]:
# Load the dataset from 'adult.data.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='adult.data.csv')

### Verificando colunas e dimensões

In [None]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educationnum',
       'maritalstatus', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'nativecountry',
       'result'],
      dtype='object')

In [None]:
df.shape

(32561, 15)

## **Pré-Processamentos**

Retirando education da tabela

In [None]:
# Remove the 'education' column. Reassigning instead of using inplace=True
# follows current pandas guidance and keeps the step chainable.
df = df.drop(columns=['education'])

In [None]:
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [None]:
df.shape

(32561, 14)

### Removendo os dados faltantes

In [None]:
df['occupation'].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [None]:
df['occupation'].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [None]:
df['occupation'].nunique()

15

In [None]:
# Rows whose occupation is the '?' placeholder, i.e. the missing entries to be removed.
df_remove = df[df['occupation'] == '?']

In [None]:
df_remove

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
27,54,?,180211,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [None]:
# Discard the rows collected in df_remove, matching them by index label.
df = df.drop(index=df_remove.index)

In [None]:
df['occupation'].nunique()

14

In [None]:
df.shape

(30718, 14)

In [None]:
df['occupation'].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

Verificando quais colunas são do tipo object e quais são int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30718 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30718 non-null  int64 
 1   workclass       30718 non-null  object
 2   fnlwgt          30718 non-null  int64 
 3   educationnum    30718 non-null  int64 
 4   maritalstatus   30718 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    30718 non-null  object
 7   race            30718 non-null  object
 8   sex             30718 non-null  object
 9   capital-gain    30718 non-null  int64 
 10  capital-loss    30718 non-null  int64 
 11  hours-per-week  30718 non-null  int64 
 12  nativecountry   30718 non-null  object
 13  result          30718 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


## Utilizando **map** para conversão dos objetos

Verificando os dados

In [None]:
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [None]:
df['sex'].value_counts()

Male      20788
Female     9930
Name: sex, dtype: int64

Aplicando a função map para substituir os valores do tipo object por números inteiros

In [None]:
# Encode sex as integers: Male -> 1, Female -> 2.
df['sex'] = df['sex'].map({'Male': 1, 'Female': 2})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,<=50K


In [None]:
# Encode the result column as integers: '<=50K' -> 1, '>50K' -> 2.
df['result'] = df['result'].map({'<=50K': 1, '>50K': 2})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,1


In [None]:
df['nativecountry'].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany', 'Iran',
       'Philippines', 'Poland', 'Columbia', 'Cambodia', 'Thailand',
       'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'Italy', 'China', 'South', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [None]:
df['nativecountry'].value_counts()

United-States                 27504
Mexico                          610
?                               556
Philippines                     188
Germany                         128
Puerto-Rico                     109
Canada                          107
El-Salvador                     100
India                           100
Cuba                             92
England                          86
Jamaica                          80
South                            71
China                            68
Italy                            68
Dominican-Republic               67
Vietnam                          64
Guatemala                        63
Japan                            59
Columbia                         56
Poland                           56
Iran                             42
Haiti                            42
Taiwan                           42
Portugal                         34
Nicaragua                        33
Peru                             30
Greece                      

In [None]:
# Rows whose nativecountry is the '?' placeholder, i.e. the missing entries to be removed.
df_remove_native = df[df['nativecountry'] == '?']

In [None]:
df_remove_native

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
14,40,Private,121772,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,1,0,0,40,?,2
38,31,Private,84154,10,Married-civ-spouse,Sales,Husband,White,1,0,0,38,?,2
51,18,Private,226956,9,Never-married,Other-service,Own-child,White,2,0,0,30,?,1
93,30,Private,117747,9,Married-civ-spouse,Sales,Wife,Asian-Pac-Islander,2,0,1573,35,?,1
245,56,Private,203580,9,Married-civ-spouse,Adm-clerical,Husband,White,1,0,0,35,?,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32413,45,Private,199590,3,Married-civ-spouse,Machine-op-inspct,Husband,White,1,0,0,40,?,1
32449,44,Self-emp-inc,71556,14,Married-civ-spouse,Sales,Husband,White,1,0,0,50,?,2
32469,58,Self-emp-inc,181974,16,Never-married,Prof-specialty,Not-in-family,White,2,0,0,99,?,1
32492,42,Self-emp-not-inc,217597,9,Divorced,Sales,Own-child,White,1,0,0,50,?,1


In [None]:
# Discard the rows collected in df_remove_native, matching them by index label.
df = df.drop(index=df_remove_native.index)

In [None]:
df['nativecountry'].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', 'Mexico',
       'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany', 'Iran',
       'Philippines', 'Poland', 'Columbia', 'Cambodia', 'Thailand',
       'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'Italy', 'China', 'South', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [None]:
df['nativecountry'].sort_values().unique()

array(['Cambodia', 'Canada', 'China', 'Columbia', 'Cuba',
       'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England',
       'France', 'Germany', 'Greece', 'Guatemala', 'Haiti',
       'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India',
       'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico',
       'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan',
       'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam',
       'Yugoslavia'], dtype=object)

In [None]:
# Integer code for every country, numbered 1..41 in alphabetical order.
# Fix: 'Ireland' was previously mapped to 12, the same code as 'Greece', which
# merged the two countries after encoding. Its alphabetical position is 21.
native_country_codes = {
    'Cambodia': 1,
    'Canada': 2,
    'China': 3,
    'Columbia': 4,
    'Cuba': 5,
    'Dominican-Republic': 6,
    'Ecuador': 7,
    'El-Salvador': 8,
    'England': 9,
    'France': 10,
    'Germany': 11,
    'Greece': 12,
    'Guatemala': 13,
    'Haiti': 14,
    'Holand-Netherlands': 15,
    'Honduras': 16,
    'Hong': 17,
    'Hungary': 18,
    'India': 19,
    'Iran': 20,
    'Ireland': 21,
    'Italy': 22,
    'Jamaica': 23,
    'Japan': 24,
    'Laos': 25,
    'Mexico': 26,
    'Nicaragua': 27,
    'Outlying-US(Guam-USVI-etc)': 28,
    'Peru': 29,
    'Philippines': 30,
    'Poland': 31,
    'Portugal': 32,
    'Puerto-Rico': 33,
    'Scotland': 34,
    'South': 35,
    'Taiwan': 36,
    'Thailand': 37,
    'Trinadad&Tobago': 38,
    'United-States': 39,
    'Vietnam': 40,
    'Yugoslavia': 41,
}

# Guard against two countries sharing a code (the original Greece/Ireland clash).
assert len(set(native_country_codes.values())) == len(native_country_codes), \
    "duplicate integer code in native_country_codes"

df['nativecountry'] = df['nativecountry'].map(native_country_codes)

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,39,1
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,39,1


In [None]:
df['race'].sort_values().unique()

array(['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
       'White'], dtype=object)

In [None]:
# Encode race as integers, numbered in alphabetical order of the category names.
df['race'] = df['race'].map({
    'Amer-Indian-Eskimo': 1,
    'Asian-Pac-Islander': 2,
    'Black': 3,
    'Other': 4,
    'White': 5,
})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,5,1,2174,0,40,39,1
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,5,1,0,0,13,39,1


In [None]:
df['relationship'].sort_values().unique()

array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
       'Unmarried', 'Wife'], dtype=object)

In [None]:
# Encode relationship as integers, numbered in alphabetical order of the category names.
df['relationship'] = df['relationship'].map({
    'Husband': 1,
    'Not-in-family': 2,
    'Other-relative': 3,
    'Own-child': 4,
    'Unmarried': 5,
    'Wife': 6,
})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,Never-married,Adm-clerical,2,5,1,2174,0,40,39,1
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,1,5,1,0,0,13,39,1


In [None]:
df['maritalstatus'].sort_values().unique()

array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
       'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
      dtype=object)

In [None]:
# Encode maritalstatus as integers, numbered in alphabetical order of the category names.
df['maritalstatus'] = df['maritalstatus'].map({
    'Divorced': 1,
    'Married-AF-spouse': 2,
    'Married-civ-spouse': 3,
    'Married-spouse-absent': 4,
    'Never-married': 5,
    'Separated': 6,
    'Widowed': 7,
})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,State-gov,77516,13,5,Adm-clerical,2,5,1,2174,0,40,39,1
1,50,Self-emp-not-inc,83311,13,3,Exec-managerial,1,5,1,0,0,13,39,1


In [None]:
df['workclass'].sort_values().unique()

array(['Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc',
       'Self-emp-not-inc', 'State-gov', 'Without-pay'], dtype=object)

In [None]:
# Encode workclass as integers, numbered in alphabetical order of the category names.
df['workclass'] = df['workclass'].map({
    'Federal-gov': 1,
    'Local-gov': 2,
    'Private': 3,
    'Self-emp-inc': 4,
    'Self-emp-not-inc': 5,
    'State-gov': 6,
    'Without-pay': 7,
})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,6,77516,13,5,Adm-clerical,2,5,1,2174,0,40,39,1
1,50,5,83311,13,3,Exec-managerial,1,5,1,0,0,13,39,1


In [None]:
df['occupation'].sort_values().unique()

array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
       'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
       'Other-service', 'Priv-house-serv', 'Prof-specialty',
       'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'],
      dtype=object)

In [None]:
# Encode occupation as integers, numbered in alphabetical order of the category names.
df['occupation'] = df['occupation'].map({
    'Adm-clerical': 1,
    'Armed-Forces': 2,
    'Craft-repair': 3,
    'Exec-managerial': 4,
    'Farming-fishing': 5,
    'Handlers-cleaners': 6,
    'Machine-op-inspct': 7,
    'Other-service': 8,
    'Priv-house-serv': 9,
    'Prof-specialty': 10,
    'Protective-serv': 11,
    'Sales': 12,
    'Tech-support': 13,
    'Transport-moving': 14,
})

In [None]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,6,77516,13,5,1,2,5,1,2174,0,40,39,1
1,50,5,83311,13,3,4,1,5,1,0,0,13,39,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             30162 non-null  int64
 1   workclass       30162 non-null  int64
 2   fnlwgt          30162 non-null  int64
 3   educationnum    30162 non-null  int64
 4   maritalstatus   30162 non-null  int64
 5   occupation      30162 non-null  int64
 6   relationship    30162 non-null  int64
 7   race            30162 non-null  int64
 8   sex             30162 non-null  int64
 9   capital-gain    30162 non-null  int64
 10  capital-loss    30162 non-null  int64
 11  hours-per-week  30162 non-null  int64
 12  nativecountry   30162 non-null  int64
 13  result          30162 non-null  int64
dtypes: int64(14)
memory usage: 3.5 MB


In [None]:
df.head(9)

Unnamed: 0,age,workclass,fnlwgt,educationnum,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,nativecountry,result
0,39,6,77516,13,5,1,2,5,1,2174,0,40,39,1
1,50,5,83311,13,3,4,1,5,1,0,0,13,39,1
2,38,3,215646,9,1,6,2,5,1,0,0,40,39,1
3,53,3,234721,7,3,6,1,3,1,0,0,40,39,1
4,28,3,338409,13,3,10,6,3,2,0,0,40,5,1
5,37,3,284582,14,3,4,6,5,2,0,0,40,39,1
6,49,3,160187,5,4,8,2,3,2,0,0,16,23,1
7,52,5,209642,9,3,4,1,5,1,0,0,45,39,2
8,31,3,45781,14,5,10,2,5,2,14084,0,50,39,2
