In [1]:
from src.pre_procesamiento.carga_de_datos import cargar_datos_train_test

import pandas as pd

In [2]:
# Load the train and test partitions through the project loader
datos_train, datos_test = cargar_datos_train_test()
# Sanity check: print the shape of each partition on a single line
print(*(particion.shape for particion in (datos_train, datos_test)))

(32561, 15) (16281, 15)


In [3]:
# Stack the train rows followed by the test rows into a single frame,
# discarding the original indexes in favour of a fresh consecutive one
datos = pd.concat(
  objs = (datos_train, datos_test),
  axis = 0,
  ignore_index = True
)
datos

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [4]:
# Inspect the dtype that pandas assigned to each column of the combined frame
datos.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object

In [5]:
"""
Como los valores númericos han sido cargados correctamente,
nos enfocamos en las variables categóricas.

Esto, teniendo como referencia su descripción en
archivo 'census+income/adult.names' 
"""
VARS_CATEGORICAS = [
  'workclass',         
  'education',         
  'marital-status',
  'occupation',        
  'relationship',      
  'race',              
  'sex',               
  'native-country',
  'class'
]


for var_cat in VARS_CATEGORICAS:
  print(datos[var_cat].value_counts(), '\n')

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64 

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64 

marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64 

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial

In [6]:
"""
Del resultado previo, notamos que, como se especifica en el archivo
'census+income/adult.names', los valores perdidos se han reemplazado
por '?'.

Además, la variable categórica 'class' consiste de cuatro valores 
únicos, "<=50K", "<=50K.", ">50K", ">50K.", 
pese a que tal variable se definió como una variable binaria.
"""
# Reemplazamos '?' por valor vacío
for var_cat in VARS_CATEGORICAS:
  datos.loc[:, var_cat] = datos[var_cat].replace({'?' : None})

# Establecemos dos valores únicos para la variable 'class'
datos.loc[:, 'class'] = datos['class'].replace({
  '<=50K.' : '<=50K',  
  '>50K.' : '>50K'
})

datos['class'].value_counts()

class
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [7]:
"""
Valores perdidos
"""
vacios = (datos
  .isna().sum()
  .to_frame()
  .sort_values(0, ascending = True)
  .set_axis(['vacios'], axis = 'columns')
)
vacios['vacios%'] = round(100 * vacios['vacios'] / datos.shape[0], 2)
vacios

Unnamed: 0,vacios,vacios%
age,0,0.0
fnlwgt,0,0.0
education,0,0.0
education-num,0,0.0
marital-status,0,0.0
relationship,0,0.0
race,0,0.0
sex,0,0.0
capital-gain,0,0.0
capital-loss,0,0.0


In [8]:
"""
Solo las variables 'native-country', 'workclass' y 'occupation'
presentan valores perdidos. Además, su porcentaje de valores
perdidos es pequeño, menor incluso que 6 %.
"""

"\nSolo las variables 'native-country', 'workclass' y 'occupation'\npresenetan valores perdidos. Además, su porcentaje de valores\nperdidos es pequeño, menor incluso que 6% .\n"

In [9]:
"""
Para el análisis descriptivo, guardamos los datos limpios, 
con vacíos, tanto de entrenamiento como de test
"""
datos_train_limpios_con_vacios = datos.iloc[0:datos_train.shape[0]]
print(datos_train_limpios_con_vacios.shape[0])
print(datos_train.shape[0])

datos_train_limpios_con_vacios.to_csv(
  'data/datos_train_limpios_con_vacios.csv',
  index = False
)

32561
32561


In [10]:
# Every row after the training block of the combined frame is a test row
inicio_test = len(datos_train)
datos_test_limpios_con_vacios = datos.iloc[inicio_test:]

# Both printed row counts should match
print(len(datos_test_limpios_con_vacios))
print(len(datos_test))

datos_test_limpios_con_vacios.to_csv(
  'data/datos_test_limpios_con_vacios.csv', index = False
)

16281
16281
