In [74]:
import numpy as np

from sklearn.preprocessing import LabelEncoder
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import pandas as pd
import pytorch_lightning as pl

import torch
from torch import nn
from torch import functional as F

#import distvis

In [81]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the processed dataset and keep every column except the first one
# (presumably a saved index column -- TODO confirm against the CSV).
datos = (
    pd.read_csv('../datos/datos_procesados.csv')
    .iloc[:, 1:]
)

In [4]:
# Columns fed to the model as continuous inputs.
numerical_features = [
    'edad',
    'meses_empleo',
]

# Columns fed to the model as discrete categories.
categorical_features = [
    'sexo',
    'rol_hogar',
    'tipo_zona',
    'termino_nivel',
    'cine',
    'est_conyugal',
    'region',
    'comuna',
    'provincia',
    'est_subcontratado',
    'categoria_empleo',
    'grupo_ocupacion',
    'nacionalidad',
]

# Full input column list: categorical columns first, numerical columns last.
features = [*categorical_features, *numerical_features]

# Column to be predicted.
target = "sueldo_neto"

In [5]:
# Numerical columns: placeholder imputer followed by standardisation.
numeric_transformer = Pipeline(steps=[
    # Dummy imputer: it "fills" with NaN, i.e. it is a stand-in for a real
    # strategy. Spelled np.nan because the np.NaN alias was removed in NumPy 2.0.
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace None with -1, then map categories to integers.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=None, strategy='constant', fill_value=-1)),
    # NOTE: with handle_unknown="return_nan", categories not seen during fit
    # are encoded as NaN (it is handle_unknown="value" that yields -1).
    ('label', OrdinalEncoder(handle_unknown="return_nan")),
])

# The ColumnTransformer rearranges columns: its output holds the categorical
# columns first, followed by the numerical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numerical_features),
    ]
)

# TODO: optionally scale the target too, e.g.
#   target_scaler = StandardScaler().fit(datos[[target]])
# (a 2-D selection is needed; StandardScaler rejects a 1-D Series).

preprocessor.fit(datos[features])

ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value=-1,
                                                                missing_values=None,
                                                                strategy='constant')),
                                                 ('label',
                                                  OrdinalEncoder(handle_unknown='return_nan'))]),
                                 ['sexo', 'rol_hogar', 'tipo_zona',
                                  'termino_nivel', 'cine', 'est_conyugal',
                                  'region', 'comuna', 'provincia',
                                  'est_subcontratado', 'categoria_empleo',
                                  'grupo_ocupacion', 'nacionalidad']),
                                ('num',
                                 Pipeline(steps=[('imputer',
                                  

In [84]:
# Encode / scale the raw feature columns with the fitted preprocessor.
transform_features = preprocessor.transform(datos[features])

# Positions of each column group, looked up in `features`.
# NOTE(review): this assumes the transformed matrix uses the same column
# order as `features` -- confirm against the ColumnTransformer definition.
cat_indices = list(map(features.index, categorical_features))
num_indices = list(map(features.index, numerical_features))

# Integer codes for the embeddings, floats for the numerical inputs.
x_cat = torch.LongTensor(transform_features[:, cat_indices]).to(device)
x_num = torch.FloatTensor(transform_features[:, num_indices]).to(device)

In [34]:
# Every categorical feature gets the same embedding width (5).
emb_szs = dict.fromkeys(categorical_features, 5)

# Largest encoded value observed for each categorical feature.
n_categories = {
    name: transform_features[:, features.index(name)].max().astype(int)
    for name in categorical_features
}
n_categories

{'sexo': 2,
 'rol_hogar': 11,
 'tipo_zona': 3,
 'termino_nivel': 2,
 'cine': 10,
 'est_conyugal': 7,
 'region': 16,
 'comuna': 278,
 'provincia': 52,
 'est_subcontratado': 3,
 'categoria_empleo': 4,
 'grupo_ocupacion': 10,
 'nacionalidad': 9}

Reference implementation (fastai's tabular model) that the model below is based on: https://github.com/fastai/fastai/blob/eda1a2e50980b1ec2df127ae431b8bdbf1a84877/fastai/tabular/model.py#L28

In [85]:
class LinearAct(nn.Sequential):
    """A linear layer optionally followed by an activation module.

    Args:
        n_in: Number of input features of the linear layer.
        n_out: Number of output features of the linear layer.
        act: Activation module appended after the linear layer, or ``None``
            to use the linear layer alone.
    """

    def __init__(self, n_in, n_out, act=None):
        linear = nn.Linear(n_in, n_out)
        modules = (linear,) if act is None else (linear, act)
        super().__init__(*modules)
        
class TabularModel(pl.LightningModule):
    """Feed-forward regressor over embedded categorical and numerical features.

    Each categorical feature has its own embedding table. The embedding
    outputs are concatenated with the numerical inputs and passed through a
    stack of ``LinearAct`` layers (ReLU between hidden layers) ending in a
    single linear output unit. Training minimises the mean squared error.

    Args:
        emb_szs: Embedding width for each categorical feature name.
        n_num: Number of numerical input columns.
        layers: Hidden layer widths, in order.
        cardinalities: Largest encoded index for each categorical feature
            name. Its key order decides which column of ``x_cat`` each
            feature is read from. Defaults to the notebook-level
            ``n_categories`` mapping.
        lr: Learning rate used by the Adam optimizer.
    """

    def __init__(self, emb_szs: dict, n_num: int, layers: list,
                 cardinalities: dict = None, lr: float = 1e-3):
        super().__init__()
        if cardinalities is None:
            # Backward-compatible default: the mapping computed in the notebook.
            cardinalities = n_categories

        # nn.ModuleDict (not a plain dict) so the embedding weights are
        # registered as parameters: optimised, moved across devices, and
        # saved in checkpoints. The +1 makes index `max` itself valid.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(int(cardinalities[f]) + 1, emb_szs[f])
            for f in cardinalities
        })

        # Only count the widths of the embeddings that are actually built.
        n_emb = sum(emb_szs[f] for f in cardinalities)
        layer_sizes = [n_emb + n_num] + list(layers) + [1]
        # ReLU after every hidden layer, no activation on the output layer.
        actns = [nn.ReLU(inplace=True) for _ in range(len(layer_sizes) - 2)] + [None]
        _layers = [LinearAct(layer_sizes[i], layer_sizes[i + 1], act=act)
                   for i, act in enumerate(actns)]
        self.layers = nn.Sequential(*_layers)

        # Instantiate the loss; the bare class is not callable on tensors.
        self.loss = nn.MSELoss()
        self.lr = lr

    def forward(self, x_cat, x_num):
        """Return predictions of shape ``(batch, 1)``.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding
        (insertion order of ``self.embeddings``); ``x_num`` is appended as is.
        """
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings.values())]
        x = torch.cat(embedded + [x_num], dim=1)
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one batch.

        Accepts either ``(x_cat, x_num, y)`` or ``((x_cat, x_num), y)``.
        """
        if len(batch) == 3:
            x_cat, x_num, y = batch
        else:
            (x_cat, x_num), y = batch
        y_hat = self(x_cat, x_num)
        # Match the (batch, 1) prediction shape to avoid silent broadcasting.
        loss = self.loss(y_hat, y.float().reshape(y_hat.shape))
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return the optimizer Lightning uses to train the model."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
    

In [86]:
# Seed the Python, NumPy and torch RNGs before the weights are initialised,
# so that repeated runs start from the same parameters.
pl.seed_everything(42)

# Two hidden layers (10 and 5 units) on top of the embeddings + numerical inputs.
model = TabularModel(
    emb_szs=emb_szs,
    n_num=len(numerical_features),
    layers=[10, 5],
)

In [89]:
# Train for 10 to 200 epochs with deterministic algorithms enabled.
trainer = pl.Trainer(
    min_epochs=10,
    max_epochs=200,
    # Use one GPU when CUDA is available and none otherwise, matching the
    # CPU fallback used when `device` is selected.
    gpus=int(torch.cuda.is_available()),
    deterministic=True,
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
