In [1]:
import pandas as pd
import numpy as np

### Read data

In [2]:
# Load the raw profiles. The first column (ParticipantID) identifies whose
# profile a row is; all remaining columns are the measured features.
dataset = pd.read_csv('dataset.csv')
y = dataset.iloc[:, 0].values    # participant ID of every profile
X = dataset.iloc[:, 1:].values   # feature matrix, one row per profile
dataset.head()

Unnamed: 0,ParticipantID,HB,RBC,RET%,RDW-SD,RDW-CV,OFF-HR,LFR,IRF,MFR,HFR
0,1,14.21,5.07,0.89,41.3,13.46,86.21,91.94,8.03,7.16,0.81
1,1,13.96,4.56,0.96,42.46,12.74,81.24,93.08,7.0,6.39,0.51
2,1,14.72,5.13,0.85,41.64,13.16,92.67,89.95,10.04,9.4,0.71
3,1,13.89,4.79,0.92,44.04,13.86,81.53,86.74,13.27,11.79,1.33
4,1,15.42,4.77,1.07,43.28,12.47,92.19,94.42,5.58,4.71,0.75


### Split

In [3]:
# Hold out 20% of the individual profiles (rows) as the test split, so that
# train and test datapoints are later built from disjoint sets of profiles.
# random_state is fixed to make the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Create datapoints from train split

In [4]:
N = 1000 # datapoints
prof_max = 5 # maximum no of profiles together in one datapoint
train_len = len(X_train) # total number of individual profiles of patients
feat_len = len(X_train[0]) # number of original features

# Seed the global NumPy RNG so that "Restart Kernel -> Run All" regenerates
# exactly the same datapoints (the split above is already seeded).
np.random.seed(0)

# Row layout: column 0 holds the binary label, followed by prof_max slots of
# feat_len features each. Slots that are not filled keep their zero padding.
new_data = np.zeros((N, prof_max*feat_len+1))

for i in range(N):
    # random number of profiles in this datapoint
    # NOTE(review): np.random.randint excludes its upper bound, so n_prof is
    # drawn from {2, ..., prof_max-1} and the last slot is never filled. The
    # bound is left untouched here because every generator cell in this
    # notebook has to draw n_prof from the same range; otherwise the number of
    # filled slots would differ between labels/splits. Use prof_max + 1 in all
    # of them together if prof_max profiles per datapoint are intended.
    n_prof = np.random.randint(2, prof_max)

    label_list = [] # participant IDs of the sampled profiles
    for j in range(n_prof):
        # select a profile randomly (with replacement) from the train split
        idx = np.random.randint(train_len)
        # copy its features into slot j of the datapoint
        new_data[i, 1+j*feat_len:1+(j+1)*feat_len] = X_train[idx]
        label_list.append(y_train[idx])

    # label is 1 only if every sampled profile belongs to the same participant
    label = len(set(label_list)) == 1
    new_data[i, 0] = label

# concatenate to original    

In [5]:
# Tabular view of the generated train datapoints (column 0 is the label).
new_data_df = pd.DataFrame(data=new_data)
new_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,13.38,4.43,0.90,41.58,12.66,77.70,93.31,6.62,5.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,14.81,4.86,0.75,41.55,12.54,96.32,94.67,5.31,4.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,13.78,4.54,0.86,40.69,12.42,81.59,98.22,1.83,1.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,15.36,4.91,0.83,42.85,12.52,99.33,94.10,5.94,5.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,11.53,4.00,0.86,40.26,12.42,61.37,91.63,8.35,7.41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,15.55,4.97,1.28,38.46,11.82,87.24,93.74,6.35,5.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,13.35,4.36,0.76,40.79,12.35,81.18,94.94,5.06,4.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,13.69,4.31,1.60,41.05,12.14,60.16,85.20,14.81,13.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,14.82,5.13,1.17,39.92,12.73,82.36,86.28,13.74,12.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Class balance: number of datapoints per label value (column 0).
# Every column carries the same count, so only the first one is shown.
df_part_cnt = new_data_df.groupby(by=0).count()
df_part_cnt.iloc[:, :1]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,985
1.0,15


### Observations
* The generated dataset is skewed (class imbalance).
* There are very few instances of label=1 (all profiles belong to the same participant).
* The skew is expected: when profiles are selected at random, it is far more likely that they come from different participants than from the same one.

### Generate datapoints with label 1 to balance the dataset

In [7]:
# Inspect which row positions of the train split belong to participant 25
# (an example ID). The array is deliberately not called `id`, which would
# shadow the Python builtin for the rest of the notebook.
participant_ids = np.array(y_train)
idx = np.where(participant_ids == 25)[0]
print(idx)

[ 10  17  26  53  85 110 114 117 149 182 207 223 230 241 268 284 291 301
 307 318 410 451 475 569 607 619 631 655]


In [8]:
N_p = int(N*0.4) # number of extra label-1 datapoints
new_data_p = np.zeros((N_p, prof_max*feat_len+1))

# Participant IDs of the train profiles, converted once instead of per iteration.
y_train_arr = np.array(y_train)

# Only participants that actually occur in the train split can be sampled.
# A hard-coded ID range would crash with "ValueError: high <= 0" from
# np.random.randint(0) as soon as the drawn ID has no profile in this split.
# Sorting keeps the draw deterministic for a given RNG state.
participants = sorted(set(y_train_arr))

for i in range(N_p):

    # random number of profiles in this datapoint
    # NOTE(review): the upper bound of np.random.randint is exclusive, so
    # n_prof never reaches prof_max. Kept as-is because all generator cells
    # in this notebook must draw n_prof from the same range.
    n_prof = np.random.randint(2, prof_max)

    # pick the single participant all profiles of this datapoint come from
    p = participants[np.random.randint(len(participants))]

    # row positions of that participant's profiles in the train split
    p_idx = np.where(y_train_arr == p)[0]
    t_len = len(p_idx)

    label_list = [] # participant IDs of the sampled profiles

    for j in range(n_prof):
        # select one of the participant's profiles randomly (with replacement)
        idx = np.random.randint(t_len)

        # copy its features into slot j of the datapoint
        new_data_p[i, 1+j*feat_len:1+(j+1)*feat_len] = X_train[p_idx[idx]]

        label_list.append(y_train[p_idx[idx]])

    # all profiles share one participant by construction, so this evaluates to 1
    label = len(set(label_list)) == 1
    new_data_p[i, 0] = label


In [9]:
# Tabular view of the extra label-1 train datapoints.
new_data_df_extra = pd.DataFrame(data=new_data_p)
new_data_df_extra

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,1.0,12.92,4.34,0.87,39.42,12.23,74.11,95.00,4.96,4.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,15.88,5.16,2.24,40.81,12.89,63.93,85.41,14.70,11.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,16.39,5.42,2.17,40.83,13.31,72.89,84.62,15.42,13.41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,13.67,4.60,1.36,43.89,13.58,66.94,92.69,7.28,6.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,14.97,4.80,1.22,40.32,12.22,82.98,93.63,6.45,5.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1.0,14.63,4.75,0.75,40.80,12.42,94.56,98.03,1.88,1.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
396,1.0,14.27,4.73,1.37,43.26,13.15,74.34,91.61,8.33,7.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,1.0,14.07,4.85,1.00,40.53,13.16,80.43,94.14,5.81,4.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
398,1.0,14.18,4.95,1.24,40.75,13.05,75.41,92.33,7.40,7.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Label support of the extra datapoints: count rows per value of column 0
# and show only the first count column.
df_part_cnt = new_data_df_extra.groupby(by=0).count()
df_part_cnt.iloc[:, :1]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
1.0,400


In [11]:
# Shapes of the randomly drawn and of the extra label-1 train datapoints.
(new_data.shape, new_data_p.shape)

((1000, 51), (400, 51))

In [12]:
# Append the extra label-1 datapoints below the randomly drawn ones.
new_data_final = np.vstack([new_data, new_data_p])
new_data_df_final = pd.DataFrame(data=new_data_final)

# Support for each label after merging.
(
    new_data_df_final
    .groupby(0)
    .count()
    .iloc[:, :1]
)


Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,985
1.0,415


In [13]:
# Save the merged train datapoints as CSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being treated as falsy.
new_data_df_final.to_csv('train_new.csv', index=False)

In [14]:
# Read the saved file back to check what was written.
df_train = pd.read_csv(filepath_or_buffer='train_new.csv')
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,13.38,4.43,0.90,41.58,12.66,77.70,93.31,6.62,5.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,14.81,4.86,0.75,41.55,12.54,96.32,94.67,5.31,4.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,13.78,4.54,0.86,40.69,12.42,81.59,98.22,1.83,1.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,15.36,4.91,0.83,42.85,12.52,99.33,94.10,5.94,5.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,11.53,4.00,0.86,40.26,12.42,61.37,91.63,8.35,7.41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,1.0,14.63,4.75,0.75,40.80,12.42,94.56,98.03,1.88,1.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1396,1.0,14.27,4.73,1.37,43.26,13.15,74.34,91.61,8.33,7.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1397,1.0,14.07,4.85,1.00,40.53,13.16,80.43,94.14,5.81,4.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1398,1.0,14.18,4.95,1.24,40.75,13.05,75.41,92.33,7.40,7.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create datapoints from the test set

In [15]:
# Build the test datapoints with the same procedure as the train ones, but
# sampling profiles from the held-out test split only.
N_t = int(N * 0.5) # datapoints
prof_max = 5 # maximum no of profiles together in one datapoint
test_len = len(X_test) # total number of individual profiles in the test split
feat_len = len(X_train[0]) # number of original features (read from X_train)

# column 0 = label, then prof_max slots of feat_len features (zero padded)
new_data_t = np.zeros((N_t, 1 + prof_max*feat_len))

for row in new_data_t:  # each row is a view, so writes land in new_data_t
    # random number of profiles
    # NOTE(review): randint excludes its upper bound, so n_prof <= prof_max-1
    n_prof = np.random.randint(2, prof_max)

    label_list = [] # participant IDs of the sampled profiles
    for slot in range(n_prof):
        # select a test profile randomly (with replacement)
        idx = np.random.randint(test_len)
        # copy its features into the current slot
        lo = 1 + slot*feat_len
        row[lo:lo + feat_len] = X_test[idx]
        label_list.append(y_test[idx])

    # label is 1 when all sampled profiles carry the first profile's participant ID
    label = label_list.count(label_list[0]) == n_prof
    row[0] = label

# concatenate to original    

In [16]:
# Tabular view of the generated test datapoints (column 0 is the label).
new_data_df_t = pd.DataFrame(data=new_data_t)
new_data_df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,15.39,5.03,1.11,42.77,12.90,91.04,91.63,8.45,6.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,13.74,4.50,0.64,41.47,12.52,90.31,98.84,1.13,1.28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,14.42,4.67,1.84,42.79,13.10,62.03,89.58,10.60,9.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,14.04,4.59,1.15,40.59,12.33,76.37,94.24,5.80,5.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,12.87,4.16,1.29,43.58,12.80,60.56,86.43,13.59,12.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,14.09,4.59,0.84,39.68,11.98,86.25,97.12,2.92,2.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,14.60,4.72,0.79,40.56,12.14,93.15,94.78,5.22,4.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,14.23,4.72,1.69,42.81,13.20,64.99,89.86,10.30,8.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.0,13.26,4.49,0.68,42.67,13.39,81.89,90.67,9.26,8.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Class balance of the test datapoints: rows per label value (column 0),
# showing only the first count column.
df_part_cnt = new_data_df_t.groupby(by=0).count()
df_part_cnt.iloc[:, :1]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,487
1.0,13


### Additional test points to balance the data

In [18]:
# NOTE: N_p and new_data_p are rebound here and now describe the extra
# label-1 datapoints built from the test split.
N_p = int(N_t*0.4) # number of extra label-1 test datapoints
new_data_p = np.zeros((N_p, prof_max*feat_len+1))

# Participant IDs of the test profiles, converted once instead of per iteration.
y_test_arr = np.array(y_test)

# Only participants that actually occur in the test split can be sampled.
# A hard-coded ID range would crash with "ValueError: high <= 0" from
# np.random.randint(0) as soon as the drawn ID has no profile in this split.
# Sorting keeps the draw deterministic for a given RNG state.
participants = sorted(set(y_test_arr))

for i in range(N_p):

    # random number of profiles in this datapoint
    # NOTE(review): the upper bound of np.random.randint is exclusive, so
    # n_prof never reaches prof_max. Kept as-is because all generator cells
    # in this notebook must draw n_prof from the same range.
    n_prof = np.random.randint(2, prof_max)

    # pick the single participant all profiles of this datapoint come from
    p = participants[np.random.randint(len(participants))]

    # row positions of that participant's profiles in the test split
    p_idx = np.where(y_test_arr == p)[0]
    t_len = len(p_idx)

    label_list = [] # participant IDs of the sampled profiles

    for j in range(n_prof):
        # select one of the participant's profiles randomly (with replacement)
        idx = np.random.randint(t_len)

        # copy its features into slot j of the datapoint
        new_data_p[i, 1+j*feat_len:1+(j+1)*feat_len] = X_test[p_idx[idx]]

        label_list.append(y_test[p_idx[idx]])

    # all profiles share one participant by construction, so this evaluates to 1
    label = len(set(label_list)) == 1
    new_data_p[i, 0] = label


In [19]:
# Shape and contents of the extra label-1 test datapoints.
(new_data_p.shape, new_data_p)

((200, 51), array([[ 1.  , 14.97,  5.24, ...,  0.  ,  0.  ,  0.  ],
        [ 1.  , 14.17,  4.65, ...,  0.  ,  0.  ,  0.  ],
        [ 1.  , 15.39,  5.03, ...,  0.  ,  0.  ,  0.  ],
        ...,
        [ 1.  , 13.75,  4.54, ...,  0.  ,  0.  ,  0.  ],
        [ 1.  , 15.34,  5.07, ...,  0.  ,  0.  ,  0.  ],
        [ 1.  , 14.51,  4.96, ...,  0.  ,  0.  ,  0.  ]]))

In [20]:
# Append the extra label-1 test datapoints below the randomly drawn ones.
new_data_t_final = np.vstack([new_data_t, new_data_p])
new_data_df_t_final = pd.DataFrame(data=new_data_t_final)

# Support for each label after merging.
(
    new_data_df_t_final
    .groupby(0)
    .count()
    .iloc[:, :1]
)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0.0,487
1.0,213


In [21]:
# Save the merged test datapoints as CSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being treated as falsy.
new_data_df_t_final.to_csv('test_new.csv', index=False)