In [1]:
import pandas as pd
import numpy as np

### Read data

In [2]:
# Load the raw profiles. The first column holds the participant id and is
# used as the target; every remaining column is a feature.
dataset = pd.read_csv('dataset.csv')
y = dataset.iloc[:, 0].values   # participant id per profile
X = dataset.iloc[:, 1:].values  # feature matrix, one row per profile
dataset.head()

Unnamed: 0,ParticipantID,HB,RBC,RET%,RDW-SD,RDW-CV,OFF-HR,LFR,IRF,MFR,HFR
0,1,14.21,5.07,0.89,41.3,13.46,86.21,91.94,8.03,7.16,0.81
1,1,13.96,4.56,0.96,42.46,12.74,81.24,93.08,7.0,6.39,0.51
2,1,14.72,5.13,0.85,41.64,13.16,92.67,89.95,10.04,9.4,0.71
3,1,13.89,4.79,0.92,44.04,13.86,81.53,86.74,13.27,11.79,1.33
4,1,15.42,4.77,1.07,43.28,12.47,92.19,94.42,5.58,4.71,0.75


### Split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the individual profiles (rows of X / y) as the test split,
# so each profile lands in exactly one of the two splits. The fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Create datapoints from train split

In [4]:
# Build N synthetic datapoints from the train split. Each datapoint is one row:
#   [label, profile_1 features, ..., profile_k features, zero padding]
# where k is drawn uniformly from 2..prof_max and the k profiles are sampled
# with replacement from X_train.
#   label = 1 -> the sampled profiles come from more than one participant
#   label = 0 -> every sampled profile belongs to the same participant
N = 20000 # datapoints
prof_max = 20 # maximum no of profiles together in one datapoint
train_len = len(X_train) # total number of individual profiles of patients
feat_len = len(X_train[0]) # number of original features

# Dedicated seeded generator so that Restart & Run All regenerates the same
# data (the train/test split is already seeded with random_state = 0).
rng = np.random.RandomState(0)

new_data = np.zeros((N, prof_max*feat_len+1))

for i in range(N):
    # random number of profiles; randint's upper bound is exclusive, so
    # prof_max + 1 is required for prof_max itself to be reachable
    n_prof = rng.randint(2, prof_max + 1)
    # select n_prof profiles randomly (with replacement) in one call
    idx = rng.randint(train_len, size=n_prof)
    # assign values: profile j occupies columns 1+j*feat_len .. (j+1)*feat_len
    new_data[i, 1:1 + n_prof*feat_len] = X_train[idx].ravel()
    # label is 1 unless all selected profiles share one participant id
    picked_ids = y_train[idx]
    new_data[i, 0] = float(np.any(picked_ids != picked_ids[0]))

# concatenate to original    

In [5]:
# Wrap the generated array in a DataFrame (column 0 is the label) and show it.
new_data_df = pd.DataFrame(data=new_data)
new_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,1.0,14.16,4.50,1.81,40.94,12.18,60.41,86.59,13.67,11.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,14.02,4.56,1.67,39.13,12.22,62.40,87.22,12.92,11.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,16.57,5.18,1.09,43.17,12.46,102.86,92.30,7.56,7.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,12.96,4.28,1.09,40.01,12.30,66.64,94.33,5.78,5.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,12.91,4.12,1.04,42.88,12.52,67.56,88.19,11.81,9.40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.0,14.21,5.07,0.89,41.30,13.46,86.21,91.94,8.03,7.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,1.0,15.72,5.29,1.84,37.52,12.23,73.92,88.80,11.24,10.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,1.0,15.15,4.73,0.94,43.41,12.58,93.76,94.73,5.30,4.71,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,1.0,15.76,5.23,0.96,39.73,12.60,98.77,94.73,5.21,4.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Number of datapoints per label (column 0). All count columns are equal,
# so only the first one is displayed.
df_part_cnt = new_data_df.groupby(by=0).count()
df_part_cnt.iloc[:, [0]]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,49
1.0,19951


### Observations
* The newly generated dataset is skewed (class imbalance).
* There are very few instances of label=0 (all profiles belong to the same participant).
* The skew is expected: when profiles are selected at random, the probability of picking profiles from different participants is much higher than the probability of picking them all from the same participant.

## Generate additional datapoints with label 0 to balance the dataset

In [7]:
# id = np.array(y_train)
# idx = np.where(id == 25)[0]
# # print(len(y_train))
# print(idx)
# # print(y_train)

In [8]:
# Build N_p extra datapoints in which every profile comes from ONE participant,
# i.e. datapoints with label 0, to balance the randomly drawn datapoints.
# Row layout: [label, profile_1 features, ..., profile_k features, zero padding].
N_p = int(N*0.9) # datapoints
new_data_p = np.zeros((N_p, prof_max*feat_len+1))

# Dedicated seeded generator so that re-running the cell gives the same data.
rng = np.random.RandomState(1)

# Participants that actually occur in the train split. Drawing from this list
# (rather than a hard-coded id range) can never pick an id with zero profiles,
# which would make randint(0) raise ValueError.
participants = np.unique(y_train)
# Row indexes of each participant, computed once instead of on every iteration.
profiles_of = {p: np.where(np.asarray(y_train) == p)[0] for p in participants}

for i in range(N_p):
    # random number of profiles; randint's upper bound is exclusive, so
    # prof_max + 1 is required for prof_max itself to be reachable
    n_prof = rng.randint(2, prof_max + 1)

    # random participant, then n_prof of that participant's profiles
    # (sampled with replacement)
    p = participants[rng.randint(len(participants))]
    p_idx = profiles_of[p]
    idx = p_idx[rng.randint(len(p_idx), size=n_prof)]

    # assign values: profile j occupies columns 1+j*feat_len .. (j+1)*feat_len
    new_data_p[i, 1:1 + n_prof*feat_len] = X_train[idx].ravel()

    # all profiles share one participant by construction, so the label is 0
    new_data_p[i, 0] = 0


In [9]:
# Wrap the extra datapoints in a DataFrame (column 0 is the label) and show it.
new_data_df_extra = pd.DataFrame(data=new_data_p)
new_data_df_extra

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,0.0,13.47,4.62,1.04,41.33,13.33,72.53,93.92,6.12,5.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,12.95,4.22,1.54,45.07,13.12,55.37,86.08,13.91,12.13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,14.02,4.56,1.67,39.13,12.22,62.40,87.22,12.92,11.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,14.18,4.87,1.36,38.95,12.48,72.66,90.14,9.84,8.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,12.62,4.72,0.66,46.28,15.34,77.47,89.48,10.57,8.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,0.0,15.96,4.87,0.94,42.43,12.15,103.52,98.26,1.70,2.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17996,0.0,12.89,4.15,0.99,41.52,12.46,69.62,95.47,4.55,4.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17997,0.0,13.76,4.47,1.27,40.83,12.36,70.30,91.35,8.77,7.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17998,0.0,14.49,4.89,0.92,39.06,12.24,87.49,93.85,6.13,5.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Number of extra datapoints per label (column 0); only the first count
# column is displayed.
df_part_cnt = new_data_df_extra.groupby(by=0).count()
df_part_cnt.iloc[:, [0]]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,18000


In [11]:
# Shapes of the random datapoints and the extra datapoints, shown side by side.
(new_data.shape, new_data_p.shape)

((20000, 201), (18000, 201))

In [12]:
# Stack the randomly drawn datapoints on top of the extra ones.
new_data_final = np.vstack([new_data, new_data_p])
# Same data as a DataFrame.
new_data_df_final = pd.DataFrame(data=new_data_final)
# Support for each label (column 0); only the first count column is shown.
new_data_df_final.groupby(by=0).count().iloc[:, [0]]


Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,18049
1.0,19951


In [13]:
# Save the balanced train datapoints as CSV. index=False is the documented way
# to leave the row index out of the file, so column 0 of the CSV is the label.
new_data_df_final.to_csv('train_new.csv', index=False)

In [14]:
# Read the saved file back and display it as a check on what was written.
df_train = pd.read_csv(filepath_or_buffer='train_new.csv')
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,1.0,14.16,4.50,1.81,40.94,12.18,60.41,86.59,13.67,11.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,14.02,4.56,1.67,39.13,12.22,62.40,87.22,12.92,11.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,16.57,5.18,1.09,43.17,12.46,102.86,92.30,7.56,7.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,12.96,4.28,1.09,40.01,12.30,66.64,94.33,5.78,5.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,12.91,4.12,1.04,42.88,12.52,67.56,88.19,11.81,9.40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,0.0,15.96,4.87,0.94,42.43,12.15,103.52,98.26,1.70,2.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37996,0.0,12.89,4.15,0.99,41.52,12.46,69.62,95.47,4.55,4.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37997,0.0,13.76,4.47,1.27,40.83,12.36,70.30,91.35,8.77,7.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37998,0.0,14.49,4.89,0.92,39.06,12.24,87.49,93.85,6.13,5.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create datapoints from testset

In [15]:
# Build N_t synthetic datapoints from the test split. Each datapoint is one row:
#   [label, profile_1 features, ..., profile_k features, zero padding]
# where k is drawn uniformly from 2..prof_max and the k profiles are sampled
# with replacement from X_test.
#   label = 1 -> the sampled profiles come from more than one participant
#   label = 0 -> every sampled profile belongs to the same participant
N_t = int(N * 0.2) # datapoints
test_len = len(X_test) # total number of individual profiles of patients
feat_len = len(X_train[0]) # number of original features

# Dedicated seeded generator so that re-running the cell gives the same data.
rng = np.random.RandomState(2)

new_data_t = np.zeros((N_t, prof_max*feat_len+1))

for i in range(N_t):
    # random number of profiles; randint's upper bound is exclusive, so
    # prof_max + 1 is required for prof_max itself to be reachable
    n_prof = rng.randint(2, prof_max + 1)
    # select n_prof profiles randomly (with replacement) in one call
    idx = rng.randint(test_len, size=n_prof)
    # assign values: profile j occupies columns 1+j*feat_len .. (j+1)*feat_len
    new_data_t[i, 1:1 + n_prof*feat_len] = X_test[idx].ravel()
    # label is 1 unless all selected profiles share one participant id
    picked_ids = y_test[idx]
    new_data_t[i, 0] = float(np.any(picked_ids != picked_ids[0]))

# concatenate to original    

In [16]:
# Wrap the test datapoints in a DataFrame (column 0 is the label) and show it.
new_data_df_t = pd.DataFrame(data=new_data_t)
new_data_df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,1.0,15.12,5.03,1.46,44.70,13.34,79.11,85.84,14.16,12.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,15.12,5.03,1.46,44.70,13.34,79.11,85.84,14.16,12.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,13.64,4.48,1.26,42.42,12.95,69.11,89.49,10.62,9.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,12.95,4.28,1.60,45.67,13.53,53.43,88.44,11.56,10.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,15.75,5.02,0.98,39.77,12.24,98.91,93.10,6.80,5.42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1.0,13.55,4.40,2.03,47.83,13.97,49.70,86.44,13.50,11.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,1.0,15.33,5.01,0.62,39.81,12.19,106.14,95.22,4.73,4.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,1.0,14.08,4.84,0.90,41.62,13.22,84.14,93.25,6.83,6.21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,1.0,15.09,4.91,0.90,41.22,12.53,93.75,95.02,5.00,4.39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Number of test datapoints per label (column 0); only the first count
# column is displayed.
df_part_cnt = new_data_df_t.groupby(by=0).count()
df_part_cnt.iloc[:, [0]]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,7
1.0,3993


### Additional test points to balance the data

In [18]:
# Build N_p extra test datapoints in which every profile comes from ONE
# participant, i.e. datapoints with label 0, to balance the test datapoints.
# Row layout: [label, profile_1 features, ..., profile_k features, zero padding].
N_p = int(N_t*0.9) # datapoints
new_data_p = np.zeros((N_p, prof_max*feat_len+1))

# Dedicated seeded generator so that re-running the cell gives the same data.
rng = np.random.RandomState(3)

# Participants that actually occur in the test split. Drawing from this list
# (rather than a hard-coded id range) can never pick an id with zero profiles,
# which would make randint(0) raise ValueError.
participants = np.unique(y_test)
# Row indexes of each participant, computed once instead of on every iteration.
profiles_of = {p: np.where(np.asarray(y_test) == p)[0] for p in participants}

for i in range(N_p):
    # random number of profiles; randint's upper bound is exclusive, so
    # prof_max + 1 is required for prof_max itself to be reachable
    n_prof = rng.randint(2, prof_max + 1)

    # random participant, then n_prof of that participant's profiles
    # (sampled with replacement)
    p = participants[rng.randint(len(participants))]
    p_idx = profiles_of[p]
    idx = p_idx[rng.randint(len(p_idx), size=n_prof)]

    # assign values: profile j occupies columns 1+j*feat_len .. (j+1)*feat_len
    new_data_p[i, 1:1 + n_prof*feat_len] = X_test[idx].ravel()

    # all profiles share one participant by construction, so the label is 0
    new_data_p[i, 0] = 0


In [19]:
# Shape of the extra test datapoints together with a preview of the array.
(new_data_p.shape, new_data_p)

((3600, 201), array([[ 0.  , 12.83,  4.26, ...,  0.  ,  0.  ,  0.  ],
        [ 0.  , 14.18,  4.76, ...,  0.  ,  0.  ,  0.  ],
        [ 0.  , 13.91,  4.55, ...,  0.  ,  0.  ,  0.  ],
        ...,
        [ 0.  , 13.64,  4.48, ...,  0.  ,  0.  ,  0.  ],
        [ 0.  , 15.02,  5.  , ...,  0.  ,  0.  ,  0.  ],
        [ 0.  , 13.89,  4.59, ...,  0.  ,  0.  ,  0.  ]]))

In [20]:
# Stack the randomly drawn test datapoints on top of the extra ones.
new_data_t_final = np.vstack([new_data_t, new_data_p])
# Same data as a DataFrame.
new_data_df_t_final = pd.DataFrame(data=new_data_t_final)
# Support for each label (column 0); only the first count column is shown.
new_data_df_t_final.groupby(by=0).count().iloc[:, [0]]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,3607
1.0,3993


In [21]:
# Save the balanced test datapoints as CSV. index=False is the documented way
# to leave the row index out of the file, so column 0 of the CSV is the label.
new_data_df_t_final.to_csv('test_new.csv', index=False)