In [2]:
import torch
from torch import nn as nn, optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch.utils.data as data

# from sklearn.preprocessing import normalize

import os


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_mask(x):
    m = np.zeros_like(x)
    m[np.isnan(x)] = 1
    return m

def normalize(x):
    dim = x.shape[1]
    x = x.copy()
    ma = np.nanmax(x, axis=0)
    mi = np.nanmin(x, axis=0)
    
    for i in range(dim):
        x[np.isnan(x[:,i]),i] = np.nanmean(x[:,i])
    x = (x-mi)/(ma-mi+1e-6)
    return x, ma, mi

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.config = config
        self.fc = nn.Sequential(
            nn.Linear(self.config['input_size']*2, self.config['hidden_size']),
            nn.Sigmoid()
        )
        self.output1 = nn.Sequential(
            nn.Linear(self.config['hidden_size'], self.config['input_size'])
        )
        
        self.output2 = nn.Sequential(
            nn.Linear(self.config['hidden_size'], 1)
        )
        
        self.output3 = nn.Sequential(
            nn.Linear(self.config['hidden_size'], 1),
            nn.Sigmoid()
        )
        
        self.output4 = nn.Sequential(
            nn.Linear(self.config['hidden_size'], self.config['input_size']),
            nn.Sigmoid()
        )
        
        self.weight_init()
        
    def forward(self, inputs):
        x, mask = inputs
        x1 = torch.cat([x, mask], dim=1)
        hidden= self.fc(x1)
        xhat = self.output1(hidden)
        yhat = self.output2(hidden)
        ghat = self.output3(hidden)
        mhat = self.output4(hidden)
        return xhat, yhat, ghat, mhat
    
    def weight_init(self):
        for n, m in self.named_modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)
                

In [60]:
config = {"input_size" :7,
         "hidden_size":100,}
model = Model(config)

In [61]:
optimizier_model = optim.Adam(model.parameters(), lr=0.001)

In [62]:
ds = pd.read_csv("C:/deep_match/data_lalonde/data_lalonde_MAR_0.5_1.csv", header=0, encoding="gbk")

In [63]:
ds.head()

Unnamed: 0,treat,age,educ,race,married,nodegree,re74,re75,re78
0,1,37,11,1,1,1,,0.0,9930.046
1,1,22,9,2,0,1,,,3595.894
2,1,30,12,1,0,0,0.0,0.0,24909.45
3,1,27,11,1,0,1,0.0,0.0,7506.146
4,1,33,8,1,0,1,0.0,0.0,289.7899


In [64]:
x = np.asarray(ds.iloc[:,list(range(1,8))])
g = np.asarray(ds.iloc[:,0])
y = np.asarray(ds.iloc[:,8])

mask = get_mask(x)
mask1 = mask.copy()
mask1 = torch.from_numpy(mask1).to(torch.float32)
x1, ma, mi = normalize(x)

In [65]:
x1 = torch.from_numpy(x1).to(torch.float32)
y = torch.from_numpy(y).to(torch.float32)
g = torch.from_numpy(g).to(torch.float32)

In [66]:
for epoch in range(1000):
    xhat, yhat, ghat, mhat = model([x1, mask1])
    loss1 = torch.mean(torch.pow(xhat-x1,2)*(1-mask1))
    # loss2 = torch.mean(torch.pow(yhat-y,2))
    loss3 = F.binary_cross_entropy(ghat, g) / (ghat.shape[0]*ghat.shape[1])
    loss4 = F.binary_cross_entropy(mhat, mask1) / (mhat.shape[0]*mhat.shape[1])
    loss = loss1+loss3+loss4
    optimizier_model.zero_grad()
    loss.backward()
    optimizier_model.step()
    if epoch % 100 == 0:
        # print("epoch:{} || loss:{:.3f} ||  loss1:{:.3f}|| loss2:{:.3f} || loss3:{:.3f}|| loss4:{:.3f} ".format(epoch,loss.data,loss1.data,loss2.data,loss3.data,loss4.data))
        print("epoch:{} || loss:{:.3f} ||  loss1:{:.3f}|| loss3:{:.3f}|| loss4:{:.3f} ".format(epoch,loss.data,loss1.data,loss3.data,loss4.data))


  """


epoch:0 || loss:0.944 ||  loss1:0.942|| loss3:0.001|| loss4:0.000 
epoch:100 || loss:0.080 ||  loss1:0.079|| loss3:0.001|| loss4:0.000 
epoch:200 || loss:0.050 ||  loss1:0.049|| loss3:0.001|| loss4:0.000 
epoch:300 || loss:0.028 ||  loss1:0.027|| loss3:0.001|| loss4:0.000 
epoch:400 || loss:0.015 ||  loss1:0.014|| loss3:0.001|| loss4:0.000 
epoch:500 || loss:0.008 ||  loss1:0.007|| loss3:0.001|| loss4:0.000 
epoch:600 || loss:0.005 ||  loss1:0.004|| loss3:0.001|| loss4:0.000 
epoch:700 || loss:0.004 ||  loss1:0.003|| loss3:0.001|| loss4:0.000 
epoch:800 || loss:0.003 ||  loss1:0.002|| loss3:0.001|| loss4:0.000 
epoch:900 || loss:0.002 ||  loss1:0.001|| loss3:0.001|| loss4:0.000 


In [67]:
xhat, yhat, ghat, mhat = model([x1, mask1])
xhat = xhat.data.numpy()*(ma-mi+1e-6)+mi
# xhat.mean(axis=0)

In [68]:
ghat = ghat.data.numpy()

In [69]:
name = [i+"_new" for i in ds.keys().tolist()[1:8]]
name += ["PS"]

In [70]:
ds1 = pd.DataFrame(np.concatenate([xhat,ghat], axis=1), columns=name)
ds = pd.concat([ds,ds1],axis=1)

In [71]:
ds.head()

Unnamed: 0,treat,age,educ,race,married,nodegree,re74,re75,re78,age_new,educ_new,race_new,married_new,nodegree_new,re74_new,re75_new,PS
0,1,37,11,1,1,1,,0.0,9930.046,36.87897,9.269598,1.010318,0.992788,0.939356,2508.95862,104.619616,0.419638
1,1,22,9,2,0,1,,,3595.894,22.058852,9.031122,2.010376,0.002382,1.005832,2433.479138,783.582575,0.335524
2,1,30,12,1,0,0,0.0,0.0,24909.45,29.312313,12.61876,0.987098,-0.004869,0.006271,-7.819524,1.717361,0.566652
3,1,27,11,1,0,1,0.0,0.0,7506.146,26.431692,9.749158,0.982333,-0.004816,0.962854,-138.173065,-220.54046,0.615466
4,1,33,8,1,0,1,0.0,0.0,289.7899,31.4456,8.361331,0.990595,0.001516,1.015937,-219.375419,-5.255533,0.613226


In [73]:
ds.to_csv(path_or_buf="C:/deep_match/data_lalonde/data_lalonde_ps_MAR_0.5_1.csv", index=False)