In [87]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import numpy as np
import sys
import random
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from distcorr import distcorr
from scipy.stats import pearsonr

In [47]:
np.set_printoptions(threshold=sys.maxsize, precision=4, suppress=True)

In [20]:
NAN = 0

def fill_nan(array, REPLACE_COUNT):
    array.flat[np.random.choice(array.size, int(REPLACE_COUNT), replace=False)] = NAN

    return array

In [177]:
pd.set_option('display.max_rows', None)
df = pd.read_excel('dutch_census.xls')
df.shape

(6303, 97)

In [178]:
df.columns = df.iloc[1, :]

In [179]:
df = df.drop(df.index[1])
df = df.drop(columns=['Bedrijf en Beroep', 'Telling', 'Tabel', 'Pagina links', 'Pagina rechts', 'Provincie', 'Image nr Links', 'Image nr Rechts'])

In [180]:
df.replace(0, np.nan, inplace=True)
df.shape

(6302, 89)

In [181]:
df = df.dropna(how='all')
df.shape

(5784, 89)

In [182]:
df = df.dropna(axis=1, thresh=df.shape[0]*0.5)
df.shape

(5784, 3)

In [183]:
df = df.drop(df.index[0])
df

1,20. Groot Amsterdam,28. Rotterdam,Het Rijk
5,41.0,59.0,89.0
9,,2.0,247.0
10,2.0,1.0,122.0
11,,1.0,426.0
12,,1.0,3770.0
13,,4.0,2727.0
14,,1.0,3928.0
15,,2.0,39.0
16,,4.0,1790.0
17,,12.0,12254.0


In [184]:
full_df = df.dropna()
full_df

1,20. Groot Amsterdam,28. Rotterdam,Het Rijk
5,41,59,89
10,2,1,122
29,3,16,14381
78,10,2,517
81,15,3,3103
92,6,1,770
93,24,4,4122
96,1,3,9
101,7,245,3998
104,7,297,4541


In [185]:
array = full_df.to_numpy()
array


       [69, 197, 1398],
       [120, 514, 5722],
       [33, 91, 657],
       [5, 44, 252],
       [6, 15, 661],
       [4, 20, 208],
       [9, 55, 1227],
       [12, 89, 552],
       [503, 504, 3247],
       [573, 854, 7271],
       [741, 1404, 14419],
       [4, 8, 50],
       [10, 10, 145],
       [36, 305, 955],
       [26, 335, 897],
       [62, 703, 2069],
       [59, 222, 955],
       [12, 14, 68],
       [21, 51, 209],
       [7, 18, 66],
       [18, 45, 110],
       [3, 7, 29],
       [87, 149, 442],
       [20, 75, 321],
       [227, 599, 2279],
       [303, 1320, 4547],
       [5, 1, 17],
       [3, 3, 22],
       [11, 33, 215],
       [6, 11, 82],
       [1, 10, 28],
       [1, 2, 10],
       [11, 29, 151],
       [34, 66, 423],
       [5, 5, 87],
       [66, 4, 561],
       [40, 103, 393],
       [74, 87, 866],
       [183, 194, 1871],
       [1, 11, 161],
       [1, 6, 87],
       [4, 5, 153],
       [3, 3, 40],
       [5, 22, 430],
       [12, 4, 234],
       [26, 51, 

In [186]:
# scaler = preprocessing.MinMaxScaler(feature_range=(0,1))

# scaled_data = scaler.fit_transform(array)

In [187]:
# full_data = scaled_data.copy()
# missing_data = fill_nan(scaled_data, scaled_data.size*0.2)

In [188]:
# np.set_printoptions(threshold=sys.maxsize, precision=18, suppress=True)
# full_data

In [189]:
# x_train, x_test, y_train, y_test = train_test_split(missing_data, full_data, random_state=0)

In [190]:
full_data = array.copy()
missing_data = fill_nan(array, array.size*0.2)

In [191]:
x_train, x_test, y_train, y_test = train_test_split(missing_data, full_data, random_state=0)

In [222]:
NUM_EPOCHS = 120
LEARNING_RATE = 0.0001
BATCH_SIZE = -1
NUM_FEATURES = 3
BATCH_SIZE_TEST = -1

In [194]:
x_train = torch.from_numpy(x_train.astype(float))
#x_train = x_train.to(device)
x_train = x_train.view(BATCH_SIZE, NUM_FEATURES)

y_train = torch.from_numpy(y_train.astype(float))
#y_train = y_train.to(device)
y_train = y_train.view(BATCH_SIZE, NUM_FEATURES)

x_test = torch.from_numpy(x_test.astype(float))
#x_test = x_test.to(device)
x_test = x_test.view(BATCH_SIZE_TEST, NUM_FEATURES)

y_test = torch.from_numpy(y_test.astype(float))
#y_test = y_test.to(device)
y_test = y_test.view(BATCH_SIZE_TEST, NUM_FEATURES)

x_train.shape

torch.Size([1909, 3])

In [223]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()

        self.enc1 = nn.Linear(in_features=3, out_features=8)
        self.enc2 = nn.Linear(in_features=8, out_features=16)

        self.dec1 = nn.Linear(in_features=16, out_features=8)
        self.dec2 = nn.Linear(in_features=8, out_features=3)

    def forward(self, x):
        x = F.leaky_relu(self.enc1(x))
        x = F.leaky_relu(self.enc2(x))
        x = F.leaky_relu(self.dec1(x))
        x = self.dec2(x)
        return x

In [224]:
# CREATE A SEED FOR CONSISTENT WEIGHT INITIALIZATIONS - FOR TESTING PURPOSES
random.seed(2)
torch.manual_seed(random.randint(1, 10))
net = Autoencoder().double()
#net.to(device)

In [225]:
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [226]:
def train(net):
    train_loss = []
    torch.set_printoptions(precision=2, sci_mode=False)
    np.set_printoptions(precision=2, suppress=True)

    for epoch in range(NUM_EPOCHS):
        # running_loss: LOSS OF THE PREDICTED MISSING VALUE ONLY
        # overall_loss: LOSS OF ALL RECONSTRUCTED VALUES

        running_loss = 0.0
        overall_loss = 0.0
        count = 0
        for missing_data, full_data in zip(x_train, y_train):
            optimizer.zero_grad()
            outputs = net(missing_data.double())

            # LEARN FROM LOSS OF ALL RECONSTRUCTED VALUES
            loss = criterion(outputs, full_data)
            loss.backward()
            optimizer.step()
            overall_loss += loss.item()
            
            # COMPUTE LOSS OF PREDICTED MISSING VALUE
            if NAN in missing_data:
                for index in range(len(missing_data)):
                    if missing_data[index] == NAN:
                        predicted_loss = criterion(outputs[index], full_data[index])
                        running_loss += predicted_loss.item()
                        count += 1
            
            # PRINT ALL VALUES ON LAST EPOCH FOR TESTING PURPOSES
            if epoch == NUM_EPOCHS-1:
                if missing_data.detach().numpy().all() == full_data.detach().numpy().all():
                    print("Input: ", missing_data)
                    print("Target: ", full_data)
                    print("Outputs: ", outputs)
                else:
                    print("Input (missing): ", missing_data)
                    print("Target (missing): ", full_data)
                    print("Outputs (missing): ", outputs)
        
      #  loss = running_loss / count
        overall_loss = overall_loss / len(x_train)
        x_loss = running_loss / count
        train_loss.append(loss)



        print('Epoch {} of {}, Train Loss: {:.5f}, Overall: {:.5f}'
             .format(epoch+1, NUM_EPOCHS, x_loss, overall_loss))

    return train_loss


def test(net):

    net.eval()
    count = 0

    with torch.no_grad():
        test_loss = []
        running_loss = 0.0
        overall_loss = 0.0

        for missing_data, full_data in zip(x_test, y_test):
            outputs = net(missing_data.double())

            # LEARN FROM LOSS OF ALL RECONSTRUCTED VALUES
            loss = criterion(outputs, full_data)
            overall_loss += loss.item()

            if NAN in missing_data:
                for index in range(len(missing_data)):
                    if missing_data[index] == NAN:
                        predicted_loss = criterion(outputs[index], full_data[index])
                        running_loss += predicted_loss.item()
                        count += 1
            
            if missing_data.detach().numpy().all() == full_data.detach().numpy().all():
                print("Input: ", missing_data)
                print("Target: ", full_data)
                print("Outputs: ", outputs)
            else:
                print("Input (missing): ", missing_data)
                print("Target (missing): ", full_data)
                print("Outputs (missing): ", outputs)

        overall_loss = overall_loss / len(x_test)
        x_loss = running_loss / count
        test_loss.append(overall_loss)
        print('Test Loss: {:.3f}, Overall: {:.5f}'.format(x_loss, overall_loss))

        return test_loss

In [227]:
results = train(net)

, dtype=torch.float64)
Outputs (missing):  tensor([ 1.42,  2.50, 16.00], dtype=torch.float64, grad_fn=<AddBackward0>)
Input (missing):  tensor([ 23.,   0., 340.], dtype=torch.float64)
Target (missing):  tensor([ 23.,  12., 340.], dtype=torch.float64)
Outputs (missing):  tensor([ 28.51,  22.16, 335.36], dtype=torch.float64, grad_fn=<AddBackward0>)
Input:  tensor([ 15296.,  14339., 160676.], dtype=torch.float64)
Target:  tensor([ 15296.,  14339., 160676.], dtype=torch.float64)
Outputs:  tensor([ 13663.52,  10167.07, 166042.77], dtype=torch.float64,
       grad_fn=<AddBackward0>)
Input:  tensor([ 1.,  1., 12.], dtype=torch.float64)
Target:  tensor([ 1.,  1., 12.], dtype=torch.float64)
Outputs:  tensor([ 2.71,  2.73, 24.36], dtype=torch.float64, grad_fn=<AddBackward0>)
Input (missing):  tensor([ 229.,    0., 1030.], dtype=torch.float64)
Target (missing):  tensor([ 229.,   76., 1030.], dtype=torch.float64)
Outputs (missing):  tensor([  96.39,   74.09, 1161.60], dtype=torch.float64,
       g

In [228]:
test_result = test(net)

g):  tensor([ 14.,   3., 284.], dtype=torch.float64)
Outputs (missing):  tensor([ 1.10,  6.46, 51.88], dtype=torch.float64)
Input (missing):  tensor([2., 5., 0.], dtype=torch.float64)
Target (missing):  tensor([ 2.,  5., 18.], dtype=torch.float64)
Outputs (missing):  tensor([ 1.17,  3.52, 25.14], dtype=torch.float64)
Input (missing):  tensor([  0.,  11., 102.], dtype=torch.float64)
Target (missing):  tensor([ 26.,  11., 102.], dtype=torch.float64)
Outputs (missing):  tensor([  9.66,   7.74, 108.20], dtype=torch.float64)
Input:  tensor([ 190.,   76., 1136.], dtype=torch.float64)
Target:  tensor([ 190.,   76., 1136.], dtype=torch.float64)
Outputs:  tensor([ 103.91,   78.65, 1258.75], dtype=torch.float64)
Input:  tensor([10.,  3., 92.], dtype=torch.float64)
Target:  tensor([10.,  3., 92.], dtype=torch.float64)
Outputs:  tensor([  9.43,   7.71, 105.34], dtype=torch.float64)
Input (missing):  tensor([26., 13.,  0.], dtype=torch.float64)
Target (missing):  tensor([ 26.,  13., 169.], dtype=to

In [201]:
torch.save(net, './model5')

In [229]:
predicted_set = net(x_test.double())
predicted_set = predicted_set.detach().numpy()
predicted_set

84.12],
       [    67.85,     62.77,    969.1 ],
       [   535.2 ,    400.34,   6489.15],
       [   248.09,    203.69,   3275.7 ],
       [    -5.27,     31.95,    291.45],
       [    41.21,     32.6 ,    513.24],
       [   -13.04,     66.9 ,    618.91],
       [     7.86,      6.6 ,     86.39],
       [     1.99,      2.23,     15.94],
       [  7004.46,   5335.27,  85302.47],
       [   145.94,    110.44,   1759.71],
       [   966.76,    764.11,  12436.16],
       [     1.03,      4.14,     30.6 ],
       [    23.31,     18.16,    273.24],
       [   221.17,    165.44,   2690.12],
       [     1.35,      4.21,     31.55],
       [     7.23,      6.01,     78.77],
       [    67.21,     51.16,    805.47],
       [    -0.25,     15.82,    136.22],
       [    12.37,     10.  ,    141.12],
       [  1426.47,   1074.67,  17351.5 ],
       [   121.53,     90.33,   1477.86],
       [    47.67,     36.36,    572.35],
       [    -0.66,     14.63,    124.89],
       [    94.06,     70.

In [230]:
real_values = []
predicted_values = []
x_values = []

for missing_data, full_data, predicted in zip(x_test, y_test, predicted_set):
    if NAN in missing_data:
        for i in range(len(missing_data)):
            if missing_data[i] == NAN:
                x_values.append(full_data[0].item())
                real_values.append(full_data[i].item())
                predicted_values.append(predicted[i].item())
print(real_values)

[3.0, 17.0, 28.0, 8.0, 41.0, 132.0, 1487.0, 108.0, 1.0, 4.0, 262.0, 1.0, 132.0, 605.0, 201.0, 59.0, 4.0, 76.0, 50.0, 2735.0, 3325.0, 319.0, 76.0, 927.0, 36.0, 197.0, 117.0, 20.0, 8.0, 6.0, 31.0, 9.0, 1.0, 752.0, 8.0, 13.0, 233.0, 114.0, 132.0, 1.0, 69.0, 82.0, 83.0, 18.0, 9.0, 1.0, 16.0, 295.0, 8.0, 4442.0, 15.0, 14.0, 1.0, 7.0, 7.0, 1.0, 360.0, 247.0, 310.0, 158.0, 2.0, 2.0, 40.0, 18.0, 61.0, 6.0, 47.0, 26.0, 2.0, 305.0, 338.0, 24.0, 14.0, 4.0, 4.0, 6.0, 18.0, 42.0, 3.0, 511.0, 1.0, 27.0, 222.0, 1.0, 20.0, 2.0, 175.0, 48.0, 139.0, 801.0, 459.0, 2779.0, 7.0, 1.0, 3244.0, 24.0, 90.0, 41.0, 49.0, 7.0, 383.0, 26.0, 215.0, 1.0, 15.0, 428.0, 34.0, 964.0, 114.0, 282.0, 245.0, 1924.0, 3.0, 20496.0, 99.0, 32.0, 1.0, 2878.0, 13.0, 763.0, 316.0, 12.0, 775.0, 2539.0, 69.0, 1.0, 13.0, 1.0, 271.0, 28.0, 1174.0, 9.0, 52.0, 1200.0, 24.0, 725.0, 3211.0, 43.0, 24488.0, 11.0, 5.0, 4.0, 2.0, 13.0, 247.0, 302.0, 80.0, 856.0, 49.0, 331.0, 396.0, 29.0, 35.0, 11247.0, 563.0, 10.0, 5.0, 26.0, 6.0, 15592.0, 10

In [231]:
from scipy import stats
from statsmodels.stats import weightstats as stets

ttest, pval = stats.ttest_ind(real_values, predicted_values)
print("P-value for significance: ", pval)
print("TTEST: ", ttest)

if pval.any()<0.05:
    print("Conclusion: Reject Null Hypothesis")
else:
    print("Conclusion: Accept Null Hypothesis")

P-value for significance:  0.1403450578777213
TTEST:  1.476103157848005
Conclusion: Accept Null Hypothesis
