In [12]:
import pandas as pd
import numpy as np
import pickle

In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go

In [14]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBClassifier

In [15]:
from scipy.stats import mode

In [16]:
# Read the cleaned dataset from disk and list the columns it provides.
source_csv = 'limpeza.csv'
df = pd.read_csv(source_csv)
print(df.columns)

Index(['id', 'uf', 'idade', 'sexo', 'etnia', 'n_cn', 'n_ch', 'n_lc', 'n_mt',
       'n_r1', 'n_r2', 'n_r3', 'n_r4', 'n_r5', 'n_r', 'escolaridade_pai',
       'escolaridade_mae', 'emprego_pai', 'emprego_mae', 'pessoas_casa',
       'renda', 'quartos', 'carros', 'motos', 'celulares', 'computadores',
       'internet', 'cidade_nat', 'uf_nat', 'media', 'escola'],
      dtype='object')


## Categorical to Numeric

In [17]:
# Columns whose values are converted from a single letter to an integer code.
categories = [
    'escolaridade_pai', 'escolaridade_mae',
    'emprego_pai', 'emprego_mae',
    'renda', 'quartos', 'carros', 'motos',
    'celulares', 'computadores', 'internet',
]


def _letter_to_code(letter):
    """Return the offset of `letter` from 'A' ('A' -> 0, 'B' -> 1, ...)."""
    return ord(letter) - ord('A')


def _truthy_to_flag(value):
    """Return 1 when `value` is truthy and 0 otherwise."""
    if value:
        return 1
    return 0


# Remove every row that has a missing value in any column.
# NOTE(review): this runs before idade/sexo/escola are dropped below, so a
# missing value in one of those columns also discards the row - confirm
# that this is intended.
print('Dropping NA...')
df = df.dropna()
print('Done\n')

# Replace each letter by its integer code, one column at a time.
for cat in categories:
    print('Category', cat)
    df[cat] = df[cat].apply(_letter_to_code)
    print('Done\n')

print('Dropping idade, sexo, escola...')
df = df.drop(columns=['idade', 'sexo', 'escola'])
print('Done.\nSetting cidade and uf nat...')
# Collapse both columns to 0/1 flags based on the truthiness of each value.
for flag_column in ('cidade_nat', 'uf_nat'):
    df[flag_column] = df[flag_column].apply(_truthy_to_flag)
print('Done.')

Dropping NA...
Done

Category escolaridade_pai
Done

Category escolaridade_mae
Done

Category emprego_pai
Done

Category emprego_mae
Done

Category renda
Done

Category quartos
Done

Category carros
Done

Category motos
Done

Category celulares
Done

Category computadores
Done

Category internet
Done

Dropping idade, sexo, escola...
Done.
Setting cidade and uf nat...
Done.


In [18]:
# Show the frame as it stands after the encoding cell (bare expression = cell output).
df

Unnamed: 0,id,uf,etnia,n_cn,n_ch,n_lc,n_mt,n_r1,n_r2,n_r3,...,renda,quartos,carros,motos,celulares,computadores,internet,cidade_nat,uf_nat,media
0,190001867757,PR,1,618.2,744.7,636.3,713.7,160.0,200.0,200.0,...,4,2,1,0,2,1,1,1,1,722.58
1,190001692704,RS,1,430.4,466.8,515.9,394.1,160.0,140.0,120.0,...,0,3,0,0,3,1,0,1,1,477.44
2,190001595660,SP,2,371.8,501.4,518.6,442.0,140.0,120.0,120.0,...,1,2,0,0,3,0,1,0,1,498.76
3,190001421552,BA,2,567.3,584.1,595.8,704.5,140.0,120.0,140.0,...,1,2,0,0,2,0,1,1,1,614.34
4,190001082475,AM,3,385.8,436.3,393.8,370.0,100.0,40.0,40.0,...,1,1,0,0,2,1,1,1,1,369.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868778,190005766801,MG,2,477.8,528.2,536.9,491.7,140.0,160.0,100.0,...,2,2,0,0,3,1,0,0,1,510.92
868779,190005441855,RO,1,536.0,564.4,579.1,479.4,120.0,120.0,160.0,...,4,2,1,0,4,2,1,0,0,559.78
868780,190005865715,SP,1,502.5,604.3,587.0,633.8,140.0,160.0,140.0,...,8,2,1,0,3,1,1,1,1,609.52
868781,190005947162,MS,1,492.5,534.6,518.5,409.9,0.0,0.0,0.0,...,6,2,0,0,1,0,1,1,1,391.10


In [19]:
# Lookup tables: integer code -> label used as the suffix of the indicator
# column created for that code ('etnia_<label>' / 'renda_<label>').
etnias = {
    0: 'nao_declarado',
    1: 'branco',
    2: 'preto',
    3: 'pardo',
    4: 'amarelo',
    5: 'indígena'
}

rendas = {
    0: 'nenhuma',
    1: 'ate998',
    2: '998ate1497',
    3: '1497ate1996',
    4: '1996ate2495',
    5: '2495ate2994',
    6: '2994ate3992',
    7: '3992ate4990',
    8: '4990ate5988',
    9: '5988ate6986',
    10: '6986ate7984',
    11: '7984ate8982',
    12: '8982ate9980',
    13: '9980ate11976',
    14: '11976ate14970',
    15: '14970ate19960',
    16: '19960mais'
}

# Add one 0/1 indicator column per code. The comparison is vectorised
# (Series.eq) instead of a Python lambda applied row by row; with 23 codes
# the lambda version made 23 element-by-element passes over the frame.
# The cast to int64 keeps the same 0/1 integer columns as before.
# The original 'etnia' and 'renda' columns are kept alongside the indicators.
for num, etnia in etnias.items():
    df['etnia_' + etnia] = df['etnia'].eq(num).astype('int64')

for num, renda in rendas.items():
    df['renda_' + renda] = df['renda'].eq(num).astype('int64')

In [21]:
# Persist the preprocessed frame; the row index is not written to the file.
output_csv = 'preprocessed.csv'
df.to_csv(output_csv, index=False)

In [22]:
# Number of rows in the preprocessed frame.
df.shape[0]

868783