In [26]:
import pandas as pd
import numpy as np

### Dictionaries

In [27]:
# Column renames: original column name -> name used once the column is recoded.
rename_dict = dict([
    ('occupation', 'collars'),
    ('workclass', 'workclass_recod'),
    ('education', 'educ_recod'),
    ('marital-status', 'civstatus'),
    ('native-country', 'region'),
])

# Occupation label -> collar group, written one group at a time.
occupation_dict = {
    **dict.fromkeys(
        [
            'Prof-specialty',
            'Exec-managerial',
            'Adm-clerical',
            'Sales',
            'Tech-support',
        ],
        'white-collar',
    ),
    **dict.fromkeys(
        [
            'Craft-repair',
            'Machine-op-inspct',
            'Transport-moving',
            'Handlers-cleaners',
            'Farming-fishing',
            'Protective-serv',
            'Priv-house-serv',
        ],
        'blue-collar',
    ),
    **dict.fromkeys(['Other-service', 'Armed-Forces'], 'others'),
}

# Work-class label -> consolidated employer category.
workclass_dict = {
    raw_label: category
    for category, raw_labels in [
        ('federal-gov', ['Federal-gov']),
        ('state-level-gov', ['State-gov', 'Local-gov']),
        ('self-employed', ['Self-emp-inc', 'Self-emp-not-inc']),
        ('unemployed', ['Never-worked', 'Without-pay']),
        ('private', ['Private']),
    ]
    for raw_label in raw_labels
}

# Education label -> schooling stage, listed from lowest to highest stage.
education_dict = {
    **dict.fromkeys(['Preschool'], 'preschool'),
    **dict.fromkeys(['1st-4th', '5th-6th'], 'elementary-school'),
    **dict.fromkeys(
        ['7th-8th', '9th', '10th', '11th', '12th', 'HS-grad'],
        'high-school',
    ),
    **dict.fromkeys(['Assoc-voc', 'Assoc-acdm', 'Some-college'], 'college'),
    **dict.fromkeys(
        ['Bachelors', 'Masters', 'Prof-school', 'Doctorate'],
        'university',
    ),
}

# Marital-status label -> civil status; the three "Married-*" variants share one value.
marital_dict = {
    **dict.fromkeys(
        ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
        'married',
    ),
    'Divorced': 'divorced',
    'Separated': 'separated',
    'Widowed': 'widowed',
    'Never-married': 'never-married',
}

# Native-country label -> region. Keys are spelled exactly as they are written
# here (e.g. 'Columbia', 'Trinadad&Tobago', 'Holand-Netherlands', 'Hong'); they
# are lookup keys and must not be "corrected".
native_country_dict = {
    country: region
    for region, countries in {
        'America': [
            'United-States', 'Peru', 'Guatemala', 'Mexico',
            'Dominican-Republic', 'Haiti', 'El-Salvador', 'Puerto-Rico',
            'Columbia', 'Cuba', 'Canada', 'Nicaragua', 'Honduras',
            'Jamaica', 'Ecuador', 'Trinadad&Tobago',
        ],
        'Europe': [
            'Ireland', 'Germany', 'Poland', 'England', 'Italy', 'Portugal',
            'Scotland', 'Yugoslavia', 'Hungary', 'Greece', 'France',
            'Holand-Netherlands',
        ],
        'Asia': [
            'Philippines', 'Thailand', 'Vietnam', 'Japan', 'India',
            'Cambodia', 'Laos', 'Taiwan', 'China', 'Iran', 'Hong',
        ],
        # Placeholder and ambiguous labels are kept in their own bucket.
        'Unknown': ['?', 'South', 'Outlying-US(Guam-USVI-etc)'],
    }.items()
    for country in countries
}

In [28]:
# Load the raw income dataset into a DataFrame.
# NOTE(review): the df.head() preview further down shows an `Unnamed: 0`
# column, which suggests the CSV carries a previously saved index. Confirm
# whether it should be read with index_col=0 instead of being kept as data.
df = pd.read_csv(filepath_or_buffer='income-db.csv')

In [29]:
# Replace the '?' placeholder with NaN in the occupation and workclass columns.
for column in ('occupation', 'workclass'):
    df[column] = df[column].replace('?', np.nan)

In [30]:
# Recode each categorical column through its lookup dictionary.
# Values that are not keys of the dictionary are left unchanged by replace().
for column, mapping in [
    ('occupation', occupation_dict),
    ('workclass', workclass_dict),
    ('education', education_dict),
    ('marital-status', marital_dict),
    ('native-country', native_country_dict),
]:
    df[column] = df[column].replace(mapping)

In [31]:
# Column renames: original column name -> recoded column name.
# NOTE(review): this repeats, with identical contents, the rename_dict already
# defined in the dictionaries cell near the top of the notebook.
rename_dict = dict([
    ('occupation', 'collars'),
    ('workclass', 'workclass_recod'),
    ('education', 'educ_recod'),
    ('marital-status', 'civstatus'),
    ('native-country', 'region'),
])

In [32]:
# Give the recoded columns their new names (rebinding df instead of mutating in place).
df = df.rename(columns=rename_dict)

In [33]:
# Recode income as a binary target: '<=50K' -> 0, '>50K' -> 1.
# Series.map is used instead of list-based replace(): turning an object column
# of strings into integers via replace() relies on silent downcasting, which
# pandas 2.2 deprecates (FutureWarning). map() yields the integer codes directly.
# Labels that are not in the mapping become NaN instead of staying as text.
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

In [34]:
# Preview the first five rows of the recoded frame.
df.head(n=5)

Unnamed: 0,age,workclass_recod,fnlwgt,educ_recod,educational-num,civstatus,collars,relationship,race,gender,capital-gain,capital-loss,hours-per-week,region,income
0,25,private,226802,high-school,7,never-married,blue-collar,Own-child,Black,Male,0,0,40,America,0
1,38,private,89814,high-school,9,married,blue-collar,Husband,White,Male,0,0,50,America,0
2,28,state-level-gov,336951,college,12,married,blue-collar,Husband,White,Male,0,0,40,America,1
3,44,private,160323,college,10,married,blue-collar,Husband,Black,Male,7688,0,40,America,1
4,18,,103497,college,10,never-married,,Own-child,White,Female,0,0,30,America,0


In [35]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='clean_income.csv', index=False)