In [1]:
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
import torch as t
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
from src.caseDataset import CaseDataset

## Check GPU Availability

In [4]:
# Report the PyTorch build and pick the device used by every later cell.
print('PyTorch version:',t.__version__)
useGPU = t.cuda.is_available()
if useGPU:
    for i in range(t.cuda.device_count()):
        print('Device ',i,':',t.cuda.get_device_name(i))
    print('Current: Device ',t.cuda.current_device())
    dev = t.cuda.current_device()
    t.backends.cudnn.benchmark = True
else:
    print('No GPU')
    # Later cells call `.to(dev)` and pass `dev=dev` unconditionally, so `dev`
    # must exist on CPU-only machines as well.
    dev = t.device('cpu')

PyTorch version: 1.8.2
Device  0 : NVIDIA GeForce GTX 1050
Current: Device  0


## Load Data

In [5]:
# Read the old / young / combined tables and the coefficient vector from ./data.
txtSubname = "posneg"

oldData = pd.read_csv('./data/oldData{}.csv'.format(txtSubname))
youngData = pd.read_csv('./data/youngData{}.csv'.format(txtSubname))
totalData = pd.read_csv('./data/totalData{}.csv'.format(txtSubname))

# The first CSV column is discarded in each table.
oldData = oldData.iloc[:, 1:]
youngData = youngData.iloc[:, 1:]
totalData = totalData.iloc[:, 1:]

coef = np.loadtxt('./data/coef.txt')
A, B, C, D = (coef[k] for k in range(4))

# Last expression of the cell: display the combined table.
totalData

Unnamed: 0,T,Age,Sex,yo1,yo1CF,y1,y1CF,yo2,yo2CF,y2,y2CF,yo3,yo3CF,y3,y3CF
0,1.0,0.0,1.0,115.0,5.0,110.610652,4.317240,20.0,5.0,17.619007,4.069660,20.0,5.0,20.903597,4.170573
1,0.0,1.0,0.0,0.0,15.0,2.191119,20.119197,45.0,60.0,46.653115,61.838370,45.0,155.0,46.202193,158.470027
2,0.0,0.0,0.0,0.0,15.0,-0.130617,10.682771,0.0,15.0,5.759354,16.435563,0.0,15.0,4.071051,11.428597
3,1.0,0.0,1.0,115.0,5.0,111.550584,3.727076,20.0,5.0,19.097982,7.984111,20.0,5.0,23.621068,5.260064
4,0.0,1.0,0.0,0.0,15.0,-2.205694,15.010239,45.0,60.0,45.862377,53.040300,45.0,155.0,45.128898,156.468445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,1.0,0.0,15.0,0.0,14.928229,0.729263,60.0,45.0,64.363639,40.034932,155.0,45.0,154.305709,39.683783
996,1.0,1.0,0.0,15.0,0.0,14.801017,2.434600,60.0,45.0,62.497828,47.653480,155.0,45.0,159.109939,54.515614
997,1.0,0.0,0.0,15.0,0.0,12.849779,-1.761553,15.0,0.0,11.587064,3.350953,15.0,0.0,12.526916,3.222415
998,1.0,0.0,0.0,15.0,0.0,17.738869,1.047301,15.0,0.0,13.969011,-3.082244,15.0,0.0,8.748484,-3.938749


In [6]:
# Per-column summary statistics of the combined table.
totalData.describe()

Unnamed: 0,T,Age,Sex,yo1,yo1CF,y1,y1CF,yo2,yo2CF,y2,y2CF,yo3,yo3CF,y3,y3CF
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.5,0.5,0.5,33.75,33.75,33.813147,33.809589,56.25,56.25,56.244391,56.222005,42.0,70.5,42.088497,70.597043
std,0.50025,0.50025,0.50025,47.243194,47.243194,47.315784,47.42261,55.092855,62.197717,55.176961,62.305799,41.835992,72.266676,41.963273,72.36368
min,0.0,0.0,0.0,0.0,0.0,-7.251547,-7.910268,0.0,0.0,-7.819061,-8.890424,0.0,0.0,-7.620151,-5.797102
25%,0.0,0.0,0.0,3.75,3.75,2.604976,2.670565,15.0,5.0,16.378985,3.91993,15.0,5.0,16.476223,3.533492
50%,0.5,0.5,0.5,10.0,10.0,9.655516,10.18248,32.5,32.5,31.361843,33.236553,32.5,32.5,32.942022,30.508703
75%,1.0,1.0,1.0,40.0,40.0,45.814532,42.608621,81.25,81.25,82.11308,85.30593,50.0,155.0,48.764678,156.235865
max,1.0,1.0,1.0,115.0,115.0,122.130877,124.615563,160.0,160.0,166.494062,168.417906,160.0,160.0,167.863258,168.851542


In [7]:
# Seed the split so the train/test partition, and every number derived from
# it below, is the same after Restart & Run All.
splitSeed = 0
trainData, testData = train_test_split(totalData, test_size=0.2, random_state=splitSeed)

# caseType is handed to CaseDataset and reused further down to pick the
# 'y<caseType>' / 'yo<caseType>' columns when plotting.
caseType = 2
trainCaseData = CaseDataset(trainData, caseType=caseType)
testCaseData = CaseDataset(testData, caseType=caseType)

# Share of rows with T == 1 in each partition (sanity check on the split).
print("TrainCase Treat Rate: {}".format(np.mean(trainCaseData.T == 1)))
print("TestCase Treat Rate: {}".format(np.mean(testCaseData.T == 1)))

TrainCase Treat Rate: 0.5075
TestCase Treat Rate: 0.47


In [8]:
# `iteration` is the divisor applied to the dataset length to get the batch
# size; with 1 each loader yields the whole dataset as a single batch.
iteration = 1

trainBatchSize = int(len(trainCaseData)/iteration)
trainDataLoader = DataLoader(trainCaseData, batch_size=trainBatchSize, shuffle=True)

# Walk the training loader once to report the shape of every batch.
print("Batched Training Shape of")
for i, (batchFeature, batchTarget) in enumerate(trainDataLoader):
    batchReport = "No. {} | Feature: {} | Target: {}".format(i, batchFeature.shape, batchTarget.shape)
    print(batchReport)
print("\n")

testBatchSize = int(len(testCaseData)/iteration)
testDataLoader = DataLoader(testCaseData, batch_size=testBatchSize, shuffle=False)

# Only the first test batch is pulled out and reported.
testFeature, testTarget = next(iter(testDataLoader))
print("Test Shape of \nFeature: {} | Target: {}".format(testFeature.shape, testTarget.shape))

Batched Training Shape of
No. 0 | Feature: torch.Size([800, 3]) | Target: torch.Size([800])


Test Shape of 
Feature: torch.Size([200, 3]) | Target: torch.Size([200])


---

## Model Build Up

In [9]:
from src.neuralNetwork import *

In [10]:
# Network and mean-squared-error loss.
model = NeuralNetwork()
loss_fn = t.nn.MSELoss()

# NOTE(review): learningRate is defined but not passed to the optimizer, so
# RMSprop runs with its library default step size.
learningRate = 1e-5
optimizer = t.optim.RMSprop(model.parameters())

# Move the model and the loss onto the device chosen earlier.
model, loss_fn = model.to(dev), loss_fn.to(dev)

print('---------- Turn Model/LossFunction into GPU ----------' if useGPU
      else '---------- Turn Model/LossFunction into CPU ----------')
print(model)

---------- Turn Model/LossFunction into GPU ----------
NeuralNetwork(
  (linearLayer): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [11]:
numEpochs = 3000

# trainLoop is invoked numEpochs + 1 times; the loss it returns is printed on
# every 500th iteration (including iteration 0).
for epochIdx in range(0, numEpochs + 1):
    loss = trainLoop(model=model, dataLoader=trainDataLoader, lossFn=loss_fn, optimizer=optimizer, dev=dev)
    if epochIdx % 500 != 0:
        continue
    banner = '-' * 20
    print(banner + str(epochIdx) + banner)
    print('Loss: {}'.format(loss))


print('=' * 10 + 'COMPLETE' + '=' * 10)

--------------------0--------------------
Loss: 6195.02685546875
--------------------500--------------------
Loss: 9.590757369995117
--------------------1000--------------------
Loss: 9.722095489501953


KeyboardInterrupt: 

In [None]:
# Model outputs for the training features and for `featureCF`
# (presumably the counterfactual features -- confirm in CaseDataset).
yTrainPred = evaluate(model, trainCaseData.feature, dev=dev)
yTrainPredCF = evaluate(model, trainCaseData.featureCF, dev=dev)

In [None]:
# Switch for the plotting cells below; set to False to skip the figures.
doDraw = True

In [None]:
import matplotlib.pyplot as plt 
import numpy as np 
def _drawOutcomeViolin(caseData, yType, outcome, condition, color):
    """Draw one violin of the 'y<yType>' values selected by `condition` at x = `outcome`.

    The violin body and its bars are painted in `color`, and the number of
    selected rows is written above the violin in the same colour.
    """
    noisy = caseData.data['y'+yType]
    violin = plt.violinplot(np.array(noisy[condition]), positions=[outcome], widths=15)
    for body in violin['bodies']:
        body.set_facecolor(color)
        body.set_edgecolor(color)
    for partName in ('cbars', 'cmins', 'cmaxes'):
        part = violin[partName]
        part.set_edgecolor(color)
        part.set_facecolor(color)
        part.set_linewidth(1)
    # The label height uses the largest 'y' value among *all* rows sharing
    # this 'yo' value, not only the rows selected by `condition`.
    peak = np.max(noisy[caseData.data['yo'+yType] == outcome])
    plt.text(x=outcome, y=outcome + 1.1 * (peak - outcome),
             s=np.sum(condition), color=color, fontsize=25, ha='center')


def _scatterPredictions(noiseless, yPred, yPredCF, groupName, size=70):
    """Scatter predictions (blue dots) and, if given, CF predictions (red crosses)."""
    plt.scatter(noiseless, yPred, color='blue', zorder=3, alpha=0.5,
                label='{} Prediction'.format(groupName), s=size)
    if yPredCF is not None:
        plt.scatter(noiseless, yPredCF, color='red', zorder=3, alpha=0.5,
                    label='{} CF Prediction'.format(groupName), s=size*0.5, marker='x')


def drawOutcomesVersusRandom(trainCaseData, yPred, yType, coefficients, title, yPredCF=None):
    """Plot 'y<yType>' against 'yo<yType>' with model predictions overlaid.

    For every distinct 'yo<yType>' value one violin of the matching
    'y<yType>' values is drawn: green for the rows with T == 1 when any
    exist, otherwise red for the rows with T == -1. The row count of the
    drawn group is printed above the violin. Black dots mark 'yo' against
    itself and black crosses mark 'yo<yType>CF'.

    Parameters
    ----------
    trainCaseData : object exposing `.data` (indexable by column name, with
        'T', 'y<yType>', 'yo<yType>' and 'yo<yType>CF') and `.Age`.
    yPred : predictions plotted against 'yo<yType>'; assumed to be aligned
        row-for-row with `trainCaseData.data` -- TODO confirm.
    yType : str suffix selecting the column family, e.g. '2'.
    coefficients : unused; kept so existing callers keep working.
    title : 'Total', 'Old' (Age == 1) or 'Young' (Age == -1) selects which
        predictions are scattered; any other truthy value only sets the
        figure title, and a falsy value draws no title.
    yPredCF : optional predictions drawn as red crosses.

    Notes
    -----
    NOTE(review): untreated rows are matched with T == -1, which assumes
    `trainCaseData.data['T']` is coded as +1 / -1 -- confirm in CaseDataset.
    """
    noiseless = trainCaseData.data['yo'+yType]
    treatment = trainCaseData.data['T']
    uniqueOutcome = np.unique(noiseless)

    _, ax = plt.subplots(1, 1, figsize=(10, 10))
    for outcome in uniqueOutcome:
        sameOutcome = noiseless == outcome
        treated = (treatment == 1) & sameOutcome
        untreated = (treatment == -1) & sameOutcome
        # Treated rows take precedence: when both groups share an outcome
        # value only the treated violin is drawn.
        if np.sum(treated):
            _drawOutcomeViolin(trainCaseData, yType, outcome, treated, 'green')
        elif np.sum(untreated):
            _drawOutcomeViolin(trainCaseData, yType, outcome, untreated, 'red')

    plt.scatter(noiseless, noiseless, color='black', zorder=3)
    plt.scatter(noiseless, trainCaseData.data['yo'+yType+'CF'], color='black', zorder=3, marker='x')

    ageCodes = {'Old': 1, 'Young': -1}
    if title == "Total":
        _scatterPredictions(noiseless, yPred, yPredCF, title)
    elif title in ageCodes:
        inGroup = trainCaseData.Age == ageCodes[title]
        _scatterPredictions(noiseless[inGroup], yPred[inGroup],
                            None if yPredCF is None else yPredCF[inGroup], title)

    plt.xlabel('Without Noise', fontsize=30)
    plt.xticks(fontsize=30)
    plt.ylabel('With Noise', fontsize=30)
    plt.yticks(fontsize=30)

    # Same padded window on both axes: 20% of the range below, 50% above.
    lowest, highest = np.min(uniqueOutcome), np.max(uniqueOutcome)
    span = highest - lowest
    plt.ylim(lowest - span*0.2, highest + span*0.5)
    plt.xlim(lowest - span*0.2, highest + span*0.5)

    # Only build a legend when something was labelled; avoids the
    # "no handles" warning for unrecognised titles.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        plt.legend(fontsize=20, loc='upper right')
    if title:
        plt.title("Prediction | Case {} | {}".format(yType, title), fontsize=30)
    plt.show()


In [None]:
if doDraw:
    # Training-set figure for the active case; CF predictions are not drawn.
    yType = str(caseType)
    drawOutcomesVersusRandom(
        trainCaseData=trainCaseData,
        yPred=yTrainPred,
        yType=yType,
        coefficients=coef,
        title='Total',
        yPredCF=None,
    )
    
    #drawOutcomesVersusRandom(trainCaseData=trainCaseData, 
    #                         yType=yType, 
    #                         yPred=yTrainPred, 
    #                         coefficients=coef, 
    #                         title='Old',
    #                         yPredCF=yTrainPredCF)

    #drawOutcomesVersusRandom(trainCaseData=trainCaseData, 
    #                         yType=yType, 
    #                         yPred=yTrainPred, 
    #                         coefficients=coef, 
    #                         title='Young',
    #                         yPredCF=yTrainPredCF)
    
    #drawOutcomesVersusRandom(totalData, yType=yType, yPred=yPred[trainCaseData.data['Age'] == 0], trainCaseData=trainCaseData[trainCaseData.data['Age'] == 0], coefficients=coef, title='Old')

In [None]:
# Score against the columns of the case the datasets were built with
# (caseType) instead of a hard-coded case 1. Each line prints
# "RMSE(prediction, y) / RMSE(yo, y)", first for the factual columns and
# then for the 'CF' columns.
yKey, yoKey = 'y{}'.format(caseType), 'yo{}'.format(caseType)
print("{:05f}/{:05f}".format(RMSE(yTrainPred[:, 0], trainCaseData.data[yKey]), RMSE(trainCaseData.data[yoKey], trainCaseData.data[yKey])))
print("{:05f}/{:05f}".format(RMSE(yTrainPredCF[:, 0], trainCaseData.data[yKey+'CF']), RMSE(trainCaseData.data[yoKey+'CF'], trainCaseData.data[yKey+'CF'])))

In [None]:
#del model

In [None]:
# Model outputs for the held-out features and for `featureCF`
# (presumably the counterfactual features -- confirm in CaseDataset).
yTestPred = evaluate(model, testCaseData.feature, dev)
yTestPredCF = evaluate(model, testCaseData.featureCF, dev)

In [None]:
if doDraw:
    # Held-out figure for the active case; CF predictions are not drawn.
    yType = str(caseType)
    drawOutcomesVersusRandom(
        trainCaseData=testCaseData,
        yPred=yTestPred,
        yType=yType,
        coefficients=coef,
        title='Total',
        yPredCF=None,
    )
    
    #drawOutcomesVersusRandom(trainCaseData=testCaseData, 
    #                         yType=yType, 
    #                         yPred=yTestPred, 
    #                         coefficients=coef, 
    #                         title='Old',
    #                         yPredCF=yTestPredCF)

    #drawOutcomesVersusRandom(trainCaseData=testCaseData, 
    #                         yType=yType, 
    #                         yPred=yTestPred, 
    #                         coefficients=coef, 
    #                         title='Young',
    #                         yPredCF=yTestPredCF)

In [None]:
# Score against the columns of the case the datasets were built with
# (caseType) instead of a hard-coded case 1. Each line prints
# "RMSE(prediction, y) / RMSE(yo, y)", first for the factual columns and
# then for the 'CF' columns.
yKey, yoKey = 'y{}'.format(caseType), 'yo{}'.format(caseType)
print("{:05f}/{:05f}".format(RMSE(yTestPred[:, 0], testCaseData.data[yKey]), RMSE(testCaseData.data[yoKey], testCaseData.data[yKey])))
print("{:05f}/{:05f}".format(RMSE(yTestPredCF[:, 0], testCaseData.data[yKey+'CF']), RMSE(testCaseData.data[yoKey+'CF'], testCaseData.data[yKey+'CF'])))

In [None]:
import os

# Checkpoint location for the trained weights of the active case.
PATH="./modelStorage/case{}Model.pth".format(caseType)
# Create the target folder first so a missing directory cannot make the save
# fail after a long training run.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
t.save(model.state_dict(), PATH)

In [None]:
# Rebuild the architecture and restore the saved weights. map_location='cpu'
# lets a checkpoint written from a GPU session load on a CPU-only machine;
# the freshly constructed model lives on the CPU either way.
the_model = NeuralNetwork()
the_model.load_state_dict(t.load(PATH, map_location='cpu'))