In [1]:
import pandas as pd
import numpy as np

### Read data

In [2]:
dataset = pd.read_csv('dataset.csv')
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values
dataset.head()

Unnamed: 0,ParticipantID,HB,RBC,RET%,RDW-SD,RDW-CV,OFF-HR,LFR,IRF,MFR,HFR
0,1,14.21,5.07,0.89,41.3,13.46,86.21,91.94,8.03,7.16,0.81
1,1,13.96,4.56,0.96,42.46,12.74,81.24,93.08,7.0,6.39,0.51
2,1,14.72,5.13,0.85,41.64,13.16,92.67,89.95,10.04,9.4,0.71
3,1,13.89,4.79,0.92,44.04,13.86,81.53,86.74,13.27,11.79,1.33
4,1,15.42,4.77,1.07,43.28,12.47,92.19,94.42,5.58,4.71,0.75


### Split

In [3]:
# keep profiles separate between train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Create datapoints from train split

In [23]:
N = 1000 # datapoints
prof_max = 5 # maximum no of profiles together in one datapoint
train_len = len(X_train) # total number of individual profiles of patients
feat_len = len(X_train[0]) # number of original features

new_data = np.zeros((N, prof_max*feat_len+1))

for i in range(N):
    # random number of profiles
    n_prof = np.random.randint(2, prof_max)
    # print('No of profiles', n_prof)
    label_list = [] # for assigning label
    for j in range(n_prof):
        # select profile randomly
        idx = np.random.randint(train_len)
        # assign values
        new_data[i, 1+j*feat_len:1+(j+1)*feat_len] = X_train[idx]

        label_list.append(y_train[idx])
    
    # fill values for remaining
    # for j in range(prof_max - n_prof):
    # print('labels', label_list)
    # check if all values are identical in list
    label = label_list.count(label_list[0]) == len(label_list)
    # Assign label
    new_data[i, 0] = label

# concatenate to original    

In [24]:
new_data_df = pd.DataFrame(new_data)
new_data_df    

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,15.74,5.48,0.38,37.22,12.40,118.24,97.41,2.64,2.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,15.72,5.04,1.03,38.40,11.85,96.31,91.85,8.21,7.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,12.91,4.12,1.04,42.88,12.52,67.56,88.19,11.81,9.40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,12.66,4.20,0.92,40.47,12.39,69.24,92.74,7.18,6.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,15.20,5.10,0.96,42.18,12.83,93.26,87.89,12.08,10.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,16.12,5.29,2.14,40.99,13.28,71.49,89.61,10.54,8.71,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,14.60,4.92,1.26,40.22,12.46,79.61,93.70,6.40,5.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,15.99,5.25,1.60,39.55,12.49,81.48,87.58,12.48,10.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,14.35,4.52,1.60,44.63,12.73,68.67,85.96,14.06,11.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Save as CSV
new_data_df.to_csv('train_new.csv', index=None)

In [26]:
# check again
df_train = pd.read_csv('train_new.csv')
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,15.74,5.48,0.38,37.22,12.4,118.24,97.41,2.64,2.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,15.72,5.04,1.03,38.4,11.85,96.31,91.85,8.21,7.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,12.91,4.12,1.04,42.88,12.52,67.56,88.19,11.81,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,12.66,4.2,0.92,40.47,12.39,69.24,92.74,7.18,6.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,15.2,5.1,0.96,42.18,12.83,93.26,87.89,12.08,10.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create datapoints from testset

In [29]:
N_t = int(N * 0.5) # datapoints
prof_max = 5 # maximum no of profiles together in one datapoint
test_len = len(X_test) # total number of individual profiles of patients
feat_len = len(X_train[0]) # number of original features

new_data_t = np.zeros((N_t, prof_max*feat_len+1))

for i in range(N_t):
    # random number of profiles
    n_prof = np.random.randint(2, prof_max)
    # print('No of profiles', n_prof)
    label_list = [] # for assigning label
    for j in range(n_prof):
        # select profile randomly
        idx = np.random.randint(test_len)
        # assign values
        new_data_t[i, 1+j*feat_len:1+(j+1)*feat_len] = X_test[idx]

        label_list.append(y_test[idx])
    
    # fill values for remaining
    # for j in range(prof_max - n_prof):
    # print('labels', label_list)
    # check if all values are identical in list
    label = label_list.count(label_list[0]) == len(label_list)
    # Assign label
    new_data_t[i, 0] = label

# concatenate to original    

In [30]:
new_data_df_t = pd.DataFrame(new_data_t)
new_data_df_t    

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,15.27,4.94,0.63,41.03,12.27,104.88,96.67,3.27,3.13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,13.18,4.38,0.62,43.28,13.42,83.16,95.59,4.30,4.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,13.18,4.10,1.54,42.14,12.36,56.66,91.14,9.00,7.41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,14.09,4.59,0.84,39.68,11.98,86.25,97.12,2.92,2.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,13.12,4.27,1.27,44.43,13.91,63.54,93.69,6.25,5.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,15.33,5.01,0.62,39.81,12.19,106.14,95.22,4.73,4.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,11.82,4.14,0.84,45.52,14.38,63.90,92.54,7.34,6.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,12.27,3.95,1.07,42.42,12.52,59.95,92.80,7.00,6.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.0,12.83,4.26,0.84,40.16,12.41,73.61,93.56,6.57,6.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Save as CSV
new_data_df_t.to_csv('test_new.csv', index=None)