In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Content
* [Data Generation](#Data-Generation)
    1. [Parameter Setup](#Parameter-Setup)
    2. [Patient Data Setup](#Patient-Data-Setup)
    3. [Quick View](#Quick-View)

    * [Case 1](#Case-1:-Apparent-Imbalance)

    * [Case 2](#Case-2:-Apparent-Imbalance)

    * [Case 3](#Case-3:-Genuine-Imbalance)
    
* [NN Performance](#NN-Performance)

# Data Generation

## Parameter Setup

In [2]:
A = round(np.random.uniform(0, 1), 2)
B = round(np.random.uniform(0, 1), 2)
C = round(np.random.uniform(0, 1), 2)
D = round(np.random.uniform(0, 1), 2)
print("A: {}\nB: {}\nC: {}\nD: {}".format(A, B, C, D))

A: 0.51
B: 0.46
C: 0.14
D: 0.68


## Patient Data Setup

In [None]:
Nold, Nyoung = 500, 500
oldTreatRate,youngTreatRate = 0.01, 0.99 # *100%

# Sex (binary, 0 is female and 1 is male)
oldSex = np.random.choice([0, 1], p=[0.5, 0.5], size=(Nold, ))
youngSex = np.random.choice([0, 1], p=[0.5, 0.5], size=(Nyoung, ))

# Treatment
oldT = np.array([0] * int(Nold * round(1-oldTreatRate, 2)) + [1] * int(Nold * round(oldTreatRate, 2)))
youngT = np.array([0] * int(Nyoung * round(1-youngTreatRate, 2)) + [1] * int(Nyoung * round(youngTreatRate, 2)))

# Age (binary, 0 is young and 1 is old)
oldAge = np.full(Nold, 1)
youngAge = np.full(Nyoung, 0)

In [None]:
oldData = np.vstack((oldT, oldAge, oldSex)).T
oldData = pd.DataFrame(oldData)
oldData = oldData.rename(columns={0:'T', 1:'Age', 2:'Sex'})
oldData = oldData.sample(frac=1)

youngData = np.vstack((youngT, youngAge, youngSex)).T
youngData = pd.DataFrame(youngData)
youngData = youngData.rename(columns={0:'T', 1:'Age', 2:'Sex'})
youngData = youngData.sample(frac=1)

In [None]:
print(oldData.describe())
print('-'*50)
print(youngData.describe())

---

## Case 1: Apparent Imbalance
$$y = A\times Sex +$$
$$B\times T + $$
$$D\times Sex \times T$$

In [None]:
def getOutcome1(data):
    return A * data['Sex'] + B * data['T'] + D * data['Sex'] * data['T']

oldData['yo1'] = getOutcome1(oldData)
oldData['y1'] = oldData['yo1'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))
youngData['yo1'] = getOutcome1(youngData)
youngData['y1'] = youngData['yo1'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))

## Case 2: Apparent Imbalance
$$y = A\times Sex +$$
$$B\times T + $$
$$C\times Age + $$
$$D\times Age\times Sex$$

In [None]:
def getOutcome2(data):
    return A * data['Sex'] + B * data['T'] + C * data['Age'] + D * data['Age'] * data['Sex']

oldData['yo2'] = getOutcome2(oldData)
oldData['y2'] = oldData['yo2'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))
youngData['yo2'] = getOutcome1(youngData)
youngData['y2'] = youngData['yo2'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))

## Case 3: Genuine Imbalance
$$y = A\times Sex +$$
$$B\times T + $$
$$C\times Age + $$
$$D\times Age\times T$$

In [None]:
def getOutcome3(data):
    return A * data['Sex'] + B * data['T'] + C * data['Age'] + D * data['Age'] * data['T']

oldData['yo3'] = getOutcome3(oldData)
oldData['y3'] = oldData['yo3'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))
youngData['yo3'] = getOutcome3(youngData)
youngData['y3'] = youngData['yo3'] + np.random.normal(loc=0, scale=0.1, size=(Nold, ))

In [None]:
# Combine Both Young and Old
totalData = pd.concat([oldData, youngData], axis=0).sample(frac=1)
totalData

## Quick View

In [None]:
# some controlling parameters locally
doDraw = False
doSave = True

In [None]:
def countNumber(data, x, y):
    uniqueX, uniqueY = np.unique(data[x]), np.unique(data[y])
    countGrid = np.zeros((len(uniqueX), len(uniqueY)))
    for i in range(countGrid.shape[0]): # x length
        for j in range(countGrid.shape[1]): # y length
            countGrid[i, j] = np.sum(np.logical_and(data[x] == uniqueX[i], data[y] == uniqueY[j]))
    return countGrid

def drawGrid(data, x, y, title=None):
    countGrid = countNumber(data, x, y)
    fig, axs = plt.subplots(1, 1, figsize=(5, 5))
    plt.pcolormesh(countGrid.T, cmap="autumn", edgecolors='black')
    for i in range(countGrid.shape[0]):
        for j in range(countGrid.shape[1]):
            plt.text(i+0.5, j+0.5, int(countGrid[i, j]), va="center", ha="center", size=20)
    plt.xlabel(x, fontsize=15)
    plt.ylabel(y, fontsize=15)
    plt.xticks(np.unique(data[x])+0.5, np.unique(data[x]), fontsize=15)
    plt.yticks(np.unique(data[y])+0.5, np.unique(data[y]), fontsize=15)
    if title:
        plt.title(title, fontsize=15)
    plt.show()

In [None]:
if doDraw:
    drawGrid(oldData, 'Sex', 'T', title="Number Distribution of Old Sex-Treatment")
    drawGrid(youngData, 'Sex', 'T', title="Number Distribution of Young Sex-Treatment")
    drawGrid(totalData, 'Sex', 'T', title="Number Distribution of Total Sex-Treatment")

In [None]:
if doDraw:
    drawGrid(totalData, 'Age', 'Sex', title="Number Distribution of Total Age-Sex")
    drawGrid(totalData, 'Age', 'T', title="Number Distribution of Total Age-Treatment")

In [None]:
def drawOutcomesVersusRandom(data, yType, title):
    uniqueOutcome, countUnique = np.unique(data['yo'+yType], return_counts=True)

    fig, axs = plt.subplots(1, 1, figsize=(10, 10))
    for i in range(len(uniqueOutcome)):
        if np.sum([np.logical_and(data['T'] == 1, data['yo'+yType] == uniqueOutcome[i])]):
            VP = plt.violinplot(np.array(data['y'+yType][np.logical_and(data['T'] == 1, data['yo'+yType] == uniqueOutcome[i])]), \
                          positions=[uniqueOutcome[i]], widths=0.2)
            for pc in VP['bodies']:
                pc.set_facecolor('green')
                pc.set_edgecolor('green')
            for partname in ('cbars','cmins','cmaxes'):
                vp = VP[partname]
                vp.set_edgecolor('green')
                vp.set_facecolor('green')
                vp.set_linewidth(1)
            plt.text(x=uniqueOutcome[i], y=uniqueOutcome[i] + 1.1 * (np.max(data['y'+yType][data['yo'+yType] == uniqueOutcome[i]])-uniqueOutcome[i]), 
                     s=np.array(countUnique, dtype=str)[i], color='green', fontsize=15, ha='center')

        elif np.sum([np.logical_and(data['T'] == 0, data['yo'+yType] == uniqueOutcome[i])]):
            VP = plt.violinplot(np.array(data['y'+yType][np.logical_and(data['T'] == 0, data['yo'+yType] == uniqueOutcome[i])]), \
                          positions=[uniqueOutcome[i]], widths=0.2)
            for pc in VP['bodies']:
                pc.set_facecolor('red')
                pc.set_edgecolor('red')
            for partname in ('cbars','cmins','cmaxes'):
                vp = VP[partname]
                vp.set_edgecolor('red')
                vp.set_facecolor('red')
                vp.set_linewidth(1)
            plt.text(x=uniqueOutcome[i], y=uniqueOutcome[i] + 1.1 * (np.max(data['y'+yType][data['yo'+yType] == uniqueOutcome[i]])-uniqueOutcome[i]), 
                     s=np.array(countUnique, dtype=str)[i], color='red', fontsize=15, ha='center')
    plt.scatter(uniqueOutcome, uniqueOutcome, color='black', zorder=3)
    plt.xlabel('Without Noise', fontsize=15)
    plt.xticks(fontsize=15)
    plt.ylabel('With Noise', fontsize=15)
    plt.yticks(fontsize=15)
    if title:
        plt.title("Noise Effect | Case {} | {}".format(yType, title), fontsize=20)
    plt.show()

In [None]:
if doDraw:
    yType = '3'
    drawOutcomesVersusRandom(oldData, yType=yType, title='Old')
    drawOutcomesVersusRandom(youngData, yType=yType, title='Young')
    drawOutcomesVersusRandom(totalData, yType=yType, title='Total')

In [None]:
if doSave:
    np.savetxt('data/oldData.txt', oldData)
    np.savetxt('data/youngData.txt', youngData)
    np.savetxt('data/totalData.txt', totalData)

---

# NN Performance