# LSTM_RNN
### 6M|ALL
#### 4D|ALL
- Imputation: 
    - No imputation for the target EDSS
    - Interpolation for the rest of features
- Features: All features
- Time Steps: 34
- Evaluation: MSE 

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import tensorflow as tf
import sklearn as sk
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
def select_columns(col_list, n_months):
    """Expand base column names into time-stepped names, interleaved by step.

    Every name in ``col_list`` is expanded with ``time_step_names`` and the
    resulting per-name lists are merged with ``dovetail_names``, so the output
    lists all features of the first step, then all features of the next step,
    and so on.
    """
    stepped_per_feature = []
    for base_name in col_list:
        stepped_per_feature.append(time_step_names(base_name, n_months))

    return dovetail_names(*stepped_per_feature)
        
def time_step_names(name, n_months):
    """Return ``name`` suffixed with ``_0``, ``_6``, ..., ``_<6 * n_months>``.

    The list has ``n_months + 1`` entries (one per 6-unit step, starting at 0);
    it is empty when ``n_months`` is negative.
    """
    stepped = []
    for month in range(0, n_months * 6 + 1, 6):
        stepped.append(name + "_" + str(month))
    return stepped

def dovetail_names(*kwargs):
    """Interleave the given sequences element by element.

    Takes the first item of every sequence, then the second item of every
    sequence, and so on, stopping at the length of the shortest sequence
    (``zip`` semantics). Returns a flat list.
    """
    return [item for group in zip(*kwargs) for item in group]

def stretch_input(Xtr, n_inputs, time_steps, pot):
    """Build sliding windows of ``pot`` consecutive time steps for every subject.

    ``pot`` stands for Patient Observation Time. For each subject and each
    output step ``i`` the rows ``Xtr[subject][i] ... Xtr[subject][i + pot - 1]``
    are concatenated into one vector of length ``n_inputs * pot``. This is only
    needed for the X input.

    Parameters:
        Xtr: array indexed as ``Xtr[subject][step]``; each step must hold
            ``n_inputs`` values and every subject needs at least
            ``time_steps + pot - 1`` steps (otherwise indexing raises).
        n_inputs: number of values per original time step.
        time_steps: number of windows to produce per subject.
        pot: number of consecutive steps per window.

    Returns:
        Object-dtype array of shape ``(Xtr.shape[0], time_steps, n_inputs * pot)``.
    """
    Xtr_fill = np.zeros(shape=[Xtr.shape[0], time_steps, n_inputs * pot], dtype=object)

    for subject in range(Xtr.shape[0]):
        for i in range(time_steps):
            # The window length follows `pot`; it used to be hard-coded to four
            # steps, which only matched the output width when pot == 4.
            window = [Xtr[subject][i + offset] for offset in range(pot)]
            Xtr_fill[subject][i] = np.concatenate(window)

    return Xtr_fill

def stack_times(data, name, n):
    """Stack the time-stepped columns of a feature into one long Series.

    Parameters:
        data: DataFrame holding the time-stepped columns.
        name: list of base column names, expanded via ``select_columns(name, n - 1)``.
        n: number of time steps.

    Returns:
        A Series with the selected columns concatenated end to end, in the
        order produced by ``select_columns`` (original index labels are kept).
    """
    all_names = select_columns(name, n - 1)

    # The previous code called Series.append and discarded its return value,
    # so only the first column came back. Series.append is also removed in
    # pandas 2.0; pd.concat is the supported way to stack Series.
    return pd.concat([data[col] for col in all_names])

def stack_dummy(data, name, n):
    """One-hot encode a time-stepped feature consistently across time steps.

    The feature's time-stepped columns are stacked into one long Series so that
    ``pd.get_dummies`` sees every value at once, the encoded frame is cut back
    into per-step frames with ``time_dummy``, and each per-step frame gets
    column names of the form ``<feature>_<value>_<step>``.

    Parameters:
        data: DataFrame holding the time-stepped columns.
        name: single-element list with the base feature name.
        n: number of time steps.

    Returns:
        Tuple ``(frame, dummy_value_names)`` where ``frame`` holds the per-step
        dummy frames concatenated side by side and ``dummy_value_names`` are
        the un-stepped ``<feature>_<value>`` names.
    """
    all_names = select_columns(name, n - 1)

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    long_series = pd.concat([data[col] for col in all_names])
    pre_dummy = pd.get_dummies(long_series)

    after_dummy = time_dummy(pre_dummy, n)

    dummy_value_names = generate_col_names(after_dummy, name)
    time_stepped_dummy_names = time_step_dummy_value_names(dummy_value_names, n)

    # Rename each per-step frame with the names generated for that step; zip
    # stops at the number of frames, as the index loop over after_dummy did.
    for step_frame, stepped_names in zip(after_dummy, time_stepped_dummy_names):
        step_frame.columns = list(stepped_names)

    return pd.concat(after_dummy, axis=1, sort=False), dummy_value_names


def time_dummy(dummy_df, n, rows_per_step=508):
    """Split a long (stacked) data frame back into per-time-step frames.

    Parameters:
        dummy_df: long data frame whose rows are ``n`` consecutive groups, one
            group per time step.
        n: number of time steps to cut out.
        rows_per_step: number of rows (subjects) per time step. Defaults to
            508, the value that was previously hard-coded.

    Returns:
        List of ``n`` DataFrame copies; the i-th one holds rows
        ``i * rows_per_step`` up to ``(i + 1) * rows_per_step`` (positional).
    """
    return [
        dummy_df.iloc[step * rows_per_step:(step + 1) * rows_per_step, :].copy()
        for step in range(n)
    ]

def generate_col_names(after_dummy, name):
    """Build ``<feature>_<value>`` names for the output of ``pd.get_dummies``.

    Uses the columns of the first frame in ``after_dummy`` as the values and
    ``name[0]`` as the feature, e.g. feature ``A`` with values ``x`` and ``y``
    yields ``['A_x', 'A_y']``.
    """
    value_labels = after_dummy[0].columns
    return [str(name[0]) + "_" + str(value) for value in value_labels]

def time_step_dummy_value_names(names, n_months):
    """Return a 2-D array of time-stepped names, one row per step.

    Row ``k`` holds every entry of ``names`` suffixed with ``_<6 * k>``; there
    are ``n_months + 1`` rows and ``len(names)`` columns.
    """
    flat_names = []
    for j in range(-1, n_months * 6, 6):
        suffix = '_%d' % (j + 1)
        for name in names:
            flat_names.append(name + suffix)

    return np.array(flat_names).reshape(-1, len(names))

def add_columns(add_to, name, names_per_t, n):
    """Replace one categorical feature by its dummy columns at every time step.

    Wraps ``stack_dummy``: the dummied columns are added to a copy of
    ``add_to``, the original (un-dummied) time-stepped columns are dropped, and
    the columns are re-ordered so each time step lists its features in the
    updated per-step order (with EDSS last, see ``column_names_per_time_step``).

    Parameters:
        add_to: DataFrame to extend; it is not modified.
        name: single-element list with the base feature name to dummy.
        names_per_t: current list of per-time-step base column names.
        n: time-step count as used by the caller; incremented by one
            internally before it is handed to ``stack_dummy``.

    Returns:
        Tuple ``(frame, names_per_t_updated)``: the re-ordered copy and the
        updated per-step name list.
    """
    n = n + 1
    # Encode the frame that was passed in. The previous code read the
    # notebook-level `df` here, silently ignoring any processing already
    # applied to `add_to`.
    to_add, bare_names = stack_dummy(add_to, name, n)
    to_remove = select_columns(name, n - 1)

    newdf = add_to.copy()
    for position, column_name in enumerate(to_add.columns):
        newdf[column_name] = to_add.iloc[:, position]
    newdf.drop(to_remove, axis=1, inplace=True)

    names_per_t_updated = column_names_per_time_step(names_per_t, bare_names, name[0])
    namesOrder = select_columns(names_per_t_updated, n - 1)
    return newdf[namesOrder].copy(), names_per_t_updated


def column_names_per_time_step(original_list, add, remove):
    """Return a copy of the per-step name list with one feature swapped out.

    ``remove`` is taken out, the names in ``add`` are appended, and ``'EDSS'``
    is then moved to the very end. ``original_list`` itself is left untouched.
    Raises ``ValueError`` if ``remove`` or ``'EDSS'`` is not in the list.
    """
    reordered = original_list.copy()

    # swap the un-dummied feature for its dummy names
    reordered.remove(remove)
    reordered += add

    # EDSS must always be the last name of a time step
    reordered.remove('EDSS')
    reordered.append('EDSS')

    return reordered

def manual_dummy(df, names, name_list, n):
    """Dummy several categorical features one after another.

    Each entry of ``names`` is wrapped in a single-element list (the format
    ``add_columns`` expects) and applied to the running result, carrying the
    updated per-step name list from one call to the next.

    Returns only the final DataFrame; the updated name list is not returned.
    """
    dfUpdated = df.copy()
    wrapped_names = [[feature] for feature in names]

    for wrapped in wrapped_names:
        dfUpdated, name_list = add_columns(dfUpdated, wrapped, name_list, n)

    return dfUpdated


# IMPORT DATA & DEFINE N OF TIME STEPS

In [3]:
# Load the main data set; the first CSV column becomes the index.
dataset = pd.read_csv("newdata_11.22.csv", index_col = 0)
# Second file, loaded the same way. In the code visible here it is only used
# to obtain column names.
dfForColumnNames = pd.read_csv("zeroImputed_dfByVisitDate_21-11.csv", index_col = 0)

"""DEFINE N"""
n = 37 
# n determines the amount of time steps in the data after 0, (n = 1) == (2 data points), (n = 3) == (4 data points)
# i.e. n = 2 means we have three time steps t0, t1, t2, which together encompass one year of patient observation
# n = 5 means we have 6 time steps (t0...t5), which encompass 2 year and half of patient observation 

"""DEFINE POT"""
pot = 1 # patient observation length 

### Fixed feature columns 

In [4]:
# Fixed features are taken positionally: columns 1..23 of the data set.
fixed_feature_column_names = list(dataset.iloc[:, 1:24].columns)

print("We have", len(fixed_feature_column_names), "fixed features")

# Time-stepped, interleaved names for the fixed features (expanded with n-1).
timed_ff_names = select_columns(fixed_feature_column_names,n-1) # for x variable 

We have 23 fixed features


### Time sensitive columns 

In [5]:
# Time-sensitive feature names: every column of the helper frame after the first two.
timed_feature_column_names = dfForColumnNames.columns[2:].tolist()

# Move EDSS to the end of the list; the tensorflow loss function expects it last.
timed_feature_column_names.remove('EDSS')
timed_feature_column_names.append('EDSS')

print("We have", len(timed_feature_column_names), "timed features")

We have 45 timed features


### Join fixed features and time sensitive columns lists

Join fixed features and time sensitive columns and generate the appropriate column list 
where each time step contains both fixed features and timed features
the caveat being that fixed features retain a constant value across time steps
whereas time sensitive features do not 

In [6]:
# Per-time-step feature list: fixed features first, then time-sensitive ones.
col_names_together = fixed_feature_column_names + timed_feature_column_names

# Expand to the full, interleaved list of time-stepped column names.
col_names_together_timed = select_columns(col_names_together, n)

print(
    len(col_names_together), "total amount of features before one-hot-encoding",
    "spread across", n, "timesteps generate", len(col_names_together_timed), "columns",
)

68 total amount of features before one-hot-encoding spread across 37 timesteps generate 2584 columns


### Add fixed feature values to the data set 

In [7]:
# Work on a copy of the fixed-feature columns only.
df_fixed = dataset[fixed_feature_column_names].copy() 
# Names of the fixed-feature columns that contain at least one missing value.
df_fixed_columns_with_nans = df_fixed.columns[df_fixed.isna().any()].tolist()
df_fixed_columns_with_nans
# Share of fixed-feature columns with missing values, in percent.
print(len(df_fixed_columns_with_nans)/len(df_fixed.columns)*100, "%")
print(df_fixed_columns_with_nans)

# (mode)impute for missing fixed values by column mode
# df_fixed.mode().iloc[0] is the first mode of every column.
df_fixed.fillna(df_fixed.mode().iloc[0], axis = 0, inplace = True) 

26.08695652173913 %
['SIBLINGS', 'BIRTH_ORDER', 'EDUCATION_DEGREE', 'EDUCATION_DEGREE_DESC', 'CHILD', 'DOMINANT_HAND']


In [8]:
m_dataset = dataset.copy()

# Broadcast every (mode-imputed) fixed feature to all of its time-stepped
# columns, so each time step carries the same constant value per subject.
for fixed_name in fixed_feature_column_names:
    constant_values = df_fixed[fixed_name]
    for stepped_name in select_columns([fixed_name], n):
        m_dataset[stepped_name] = constant_values

# Fill in EDSS simple - Needs Review

This will require interpolation within the training loop later on.

In [9]:
# filter data set to timed columns, both fixed and time sensitive 
df = m_dataset[col_names_together_timed].copy()

# Y to 1 replacement: Categorical
y_replacement_categorical = ["IMAGES", "BRAIN", "BRAIN_T2","BRAIN_GAD","BRAIN_UNCHANGED","SPINE","TSPINE","CSPINE","LSPINE","USPINE","SPINE_T2","SPINE_GAD","SPINE_UNCHANGED"]

for column in y_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('Y', 1, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# N to 0 replacement: Categorical
n_replacement_categorical = ["AMBULATORY_INDEX"]

for column in n_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('N', -1, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)
        
# Y to 1, N to 0 replacement: Categorical 
yn_replacement_categorical = ["AMPYRA", "NORMAL_BRAIN", "NORMAL_SPINE","SMOKING_HISTORY"]

for column in yn_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('Y', 1, inplace=True)
        df[col].replace('N', -1, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# R to 1, L to -1 replacement: Categorical 
rl_replacement_categorical = ["DOMINANT_HAND"]

for column in rl_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('R', 1, inplace=True)
        df[col].replace('L', -1, inplace=True)
        df[col].replace('A', 0, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# R to 1, L to -1 replacement: Categorical 
mf_replacement_categorical = ["SEX"]

for column in mf_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('M', 1, inplace=True) # like Tom
        df[col].replace('F', -1, inplace=True) # like Tom
        df[col].replace('O', 0, inplace=True) # like Tom
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# X to 0 replacement categorical
x_replacement_categorical = ["CEREBELLAR_FUNCTION","BRAINSTEM_FUNCTION","SENSORY_FUNCTION","BOWEL_BLADDER_FUNCTION","VISUAL_FUNCTION", "FEET25_ASSISTANCE"]

for column in x_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('X', 0, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# X to 0 replacement categorical
p_replacement_categorical = ["PROTOCOL"]

for column in p_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('3T', 0, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)

# X to 0 replacement categorical
g_replacement_categorical = ["T3_BPF"]

for column in g_replacement_categorical:
    for col in select_columns([column], n):
        df[col].replace('G', 0, inplace=True)
        df[col].replace('B', 0, inplace=True)
        #df[col].fillna(df.mode().iloc[0], inplace = True)
        

# X to 0 replacement categorical
dob_replacement_categorical = ["DOB_YEAR"]

for column in dob_replacement_categorical:
    for col in select_columns([column], n):
        df[col] = (df[col]-1900)/10
        #df[col].fillna(df.mode().iloc[0], inplace = True)
        
#df.replace('O', 0, inplace=True)

### Drop all columns ending with "_desc"

- "RACE_DESC" == "RACE"
- "ETHNICITY_DESC" == "ETHNICITY"
- "EDUCATION_DEGREE_DESC" == "EDUCATION_DEGREE"
- "MARITAL_DESC" ==  "MARITAL"
- "TWIN_DESC" == "TWIN"
- "AUTOIMM_DESC" 
- "FAMILY_MS_DESC" 
- "FAMILY_AI_DESC"

In [10]:
# Base names of the "_DESC" columns that are removed at every time step.
columns_to_drop = ["RACE_DESC", "ETHNICITY_DESC", "EDUCATION_DEGREE_DESC", 
                   "MARITAL_DESC", "TWIN_DESC", "AUTOIMM_DESC", 
                   "FAMILY_MS_DESC", "FAMILY_AI_DESC"]
# Drop the time-stepped versions of those columns; keep an independent copy.
dfPreDummy = df.drop(columns = select_columns(columns_to_drop, n)).copy()

# Separate copy used as the starting point for the imputation below.
fillin_df = dfPreDummy.copy()

### Columns to dummy

- "MARITAL"
- "AUTOMIMMUN"
- "FAMILY_MS" 
- "FAMILY_AI" 
- "ATTACK" 
- "STATUS" 
- "PROTOCOL"

In [11]:
# Base names of the categorical features that get one-hot encoded.
# NOTE(review): this list contains "RACE" and omits "PROTOCOL", unlike the
# markdown list above - confirm which one is intended.
columns_to_dummy = ["RACE","MARITAL","AUTOMIMMUN","FAMILY_MS", "FAMILY_AI", "ATTACK", "STATUS"]
# Base names of the first 60 columns: the last two characters of each column
# name are stripped (presumably the "_0" suffix of the first time step).
initial_column_list = [str(i)[:-2] for i in dfPreDummy.columns[:60]]

In [12]:
# Names of the first 110 columns with their last two characters stripped.
# NOTE(review): the cell above treats 60 columns as one time step of
# dfPreDummy, so a 110-column slice likely reaches into the second time step
# and yields repeated base names - confirm that 110 is intended here.
all_cols = [str(x)[:-2] for x in dfPreDummy.columns[:110]]

In [13]:
# Fixed-feature names that are not in the drop list; these are excluded below.
_fixed_names_kept = [
    fixed_name for fixed_name in df_fixed.columns.tolist()
    if fixed_name not in columns_to_drop
]

# Everything in all_cols that is not one of those fixed features ...
columns_to_interpolate = [
    candidate for candidate in all_cols
    if candidate not in _fixed_names_kept
]

# ... except the target itself (only the first occurrence is removed).
columns_to_interpolate.remove('EDSS')

In [14]:
def impute_mode_across_time(df, col):
    """Fill a feature's missing values with each row's mode across time steps.

    The time-stepped columns of ``col`` (expanded with the notebook-level
    ``n``) are taken from a copy of ``df``; for every row the first mode over
    those columns is used to fill that row's missing entries. Returns the
    modified copy; ``df`` itself is not changed.
    """
    df_copy = df.copy()

    stepped_values = df_copy[select_columns([col], n)].copy()
    row_modes = stepped_values.mode(axis=1)[0]
    # transpose so the per-row modes line up with columns for fillna
    filled_values = stepped_values.T.fillna(row_modes).T

    for stepped_name in filled_values.columns:
        df_copy[stepped_name] = filled_values[stepped_name]

    return df_copy

# Mode-impute ATTACK across time steps. `m1` is not referenced again in the
# code visible here.
m1 = impute_mode_across_time(fillin_df, 'ATTACK')

In [15]:
# Start with ATTACK, then impute the remaining features one after another.
final = impute_mode_across_time(fillin_df, 'ATTACK')

# The first entry of columns_to_interpolate is skipped
# (presumably it is 'ATTACK', handled above - TODO confirm).
for remaining_name in columns_to_interpolate[1:]:
    final = impute_mode_across_time(final, remaining_name)

# Cell output: name of the first column that still contains missing values.
final.columns[final.isna().any()].tolist()[0]

  warn("Unable to sort modes: {error}".format(error=e))
  warn("Unable to sort modes: {error}".format(error=e))
  warn("Unable to sort modes: {error}".format(error=e))


'EDSS_0'

# Dummy the Data

In [16]:
# One-hot encode the categorical features on the imputed frame.
updatedDf = manual_dummy(final, columns_to_dummy, initial_column_list, n)

# Number of columns up to and including EDSS_0, i.e. the width of the first
# time step when EDSS is the last column of each step.
n_inputs_pure = list(updatedDf.columns).index('EDSS_0')+1
print(n_inputs_pure, list(updatedDf.columns)[n_inputs_pure-1])
# Names of the first 110 columns with their last two characters stripped.
untimed_input_names = [str(x)[:-2] for x in updatedDf.columns[:110]]
# Column count without the last 110 columns, divided by n.
print(updatedDf.iloc[:,:-110].shape[1]/n)

print("Before dummification, there are" , len(list(dfPreDummy.columns)[:60]), "features per time step")

#fillin_df = updatedDf.copy()

110 EDSS_0
110.0
Before dummification, there are 60 features per time step


# Saving Data in Different Time Arrangements 

### 6m|all

In [22]:
def generate_data_6m(sc):
    """Build the X / y frames for the 6-month setting with a shift of ``sc`` steps.

    Reads the notebook-level ``updatedDf``, ``n`` and ``n_inputs_pure``.

    X is the first ``n_inputs_pure * (n - sc)`` columns of ``updatedDf``; y is
    the EDSS columns with the first ``sc + 1`` of them (``EDSS_0`` onwards)
    removed. Only rows whose ``EDSS_0`` is not null are kept, and both frames
    are returned with a fresh 0..N-1 index.
    """
    
    sc2 = sc + 1
    sc3 = sc
    
    # all EDSS columns, left un-imputed
    nanY = updatedDf[select_columns(['EDSS'], n)].copy()
    # select_columns(['EDSS'], 0) yields only 'EDSS_0', so the row-wise
    # interpolation below operates on a single column.
    interpolateY = updatedDf[select_columns(['EDSS'], 0)].copy() # only interpolate for t0
    interpolateY.interpolate(axis = 1, limit_direction = "both", inplace = True)
    # NOTE(review): with a single column and sc2 >= 1 this drops every column,
    # so interpolateX has no columns and the first loop below does nothing -
    # confirm this is the intended "limited interpolation".
    interpolateX = interpolateY.drop(list(interpolateY.columns)[-sc2:],axis = 1).copy()

    # inputs: the first (n - sc) time steps, n_inputs_pure columns each
    X = updatedDf.iloc[:,:(n_inputs_pure*(n-sc))].copy()

    for col in interpolateX.columns:
        X[col] = interpolateX[col] 

    # write the t0 column back into the target frame
    for col in interpolateY.columns:
        nanY[col] = interpolateY[col] 

    # targets: remove EDSS_0 ... EDSS_<6*sc>
    y = nanY.drop(select_columns(['EDSS'], sc3), axis = 1).copy() 

    X.replace('O', 0, inplace = True)
    
    # keep only the rows that have an EDSS value at t0
    mask = X.index.isin(X[X["EDSS_0"].notnull()].index.tolist())
    
    X, y = X[mask], y[mask]
    print(X.shape, y.shape)
    # index 109 is the last column of the first time step when a step is 110 columns wide
    print("First time sliceof X ends at ", X.columns.tolist()[109])
    print("First y column is", y.columns.tolist()[0])

    print("Last column for X is ",X.columns.tolist()[-1])
    print("Last y column is ", y.columns.tolist()[-1])
    
    X.reset_index(inplace=True, drop = True)
    y.reset_index(inplace=True, drop = True)
    
    return X,y 

In [23]:
# Build the 6-month data with sc = 0 and write both frames to CSV.
X, y = generate_data_6m(0)

X.to_csv("../limit_interp_data/X_6_months|6_months_limited_interpolation.csv")
y.to_csv("../limit_interp_data/y_6_months|6_months_limited_interpolation.csv")


(491, 4070) (491, 37)
First time sliceof X ends at  EDSS_0
First y column is EDSS_6
Last column for X is  EDSS_216
Last y column is  EDSS_222


In [274]:
# Export the 6-month data for shifts 1..9; each shift is saved under the file
# label listed at the matching position.
_labels_6m = ["1_year", "1.5_years", "2_years", "2.5_years", "3_years",
              "3.5_years", "4_years", "4.5_years", "5_years"]

for _shift, _label in enumerate(_labels_6m, start=1):
    X, y = generate_data_6m(_shift)

    X.to_csv("../data/X_6_months|" + _label + "_exhaustive.csv")
    y.to_csv("../data/y_6_months|" + _label + "_exhaustive.csv")

(508, 4070) (508, 37)
First time sliceof X ends at  EDSS_0
First y column is EDSS_6
Last column for X is  EDSS_216
Last y column is  EDSS_222
(508, 3960) (508, 36)
First time sliceof X ends at  EDSS_0
First y column is EDSS_12
Last column for X is  EDSS_210
Last y column is  EDSS_222
(508, 3850) (508, 35)
First time sliceof X ends at  EDSS_0
First y column is EDSS_18
Last column for X is  EDSS_204
Last y column is  EDSS_222
(508, 3740) (508, 34)
First time sliceof X ends at  EDSS_0
First y column is EDSS_24
Last column for X is  EDSS_198
Last y column is  EDSS_222
(508, 3630) (508, 33)
First time sliceof X ends at  EDSS_0
First y column is EDSS_30
Last column for X is  EDSS_192
Last y column is  EDSS_222
(508, 3520) (508, 32)
First time sliceof X ends at  EDSS_0
First y column is EDSS_36
Last column for X is  EDSS_186
Last y column is  EDSS_222
(508, 3410) (508, 31)
First time sliceof X ends at  EDSS_0
First y column is EDSS_42
Last column for X is  EDSS_180
Last y column is  EDSS_222


# 1 year | All

In [19]:
def generate_data_1y(sc):
    """Build the X / y frames for the 1-year setting with a shift of ``sc`` steps.

    Reads the notebook-level ``updatedDf``, ``n`` and ``n_inputs_pure``.

    X is the first ``n_inputs_pure * (n - sc + 1)`` columns of ``updatedDf``,
    with its EDSS columns replaced by row-wise interpolated values; y is the
    un-imputed EDSS columns without ``EDSS_0`` ... ``EDSS_<6*sc>``.
    """
    edss_names = select_columns(['EDSS'], n)

    # targets stay raw; inputs get EDSS interpolated along each row
    raw_targets = updatedDf[edss_names].copy()
    filled_edss = updatedDf[edss_names].copy()
    filled_edss.interpolate(axis=1, limit_direction="both", inplace=True)
    trailing_names = list(filled_edss.columns)[-sc:]
    filled_inputs = filled_edss.drop(trailing_names, axis=1).copy()

    n_input_columns = n_inputs_pure * (n - (sc - 1))
    X = updatedDf.iloc[:, :n_input_columns].copy()
    y = raw_targets.drop(select_columns(['EDSS'], sc), axis=1).copy()
    print(X.shape, y.shape)

    for edss_col in filled_inputs.columns:
        X[edss_col] = filled_inputs[edss_col]

    X.replace('O', 0, inplace=True)

    x_names = X.columns.tolist()
    y_names = y.columns.tolist()
    print("First time sliceof X ends at ", x_names[109])
    print("First y column is", y_names[0])

    print("Last column for X is ", x_names[-1])
    print("Last y column is ", y_names[-1])

    return X, y


In [20]:
# Export the 1-year data for shifts 1..10; each shift is saved under the file
# label listed at the matching position.
_labels_1y = ["6_months", "1_year", "1.5_years", "2_years", "2.5_years",
              "3_years", "3.5_years", "4_years", "4.5_years", "5_years"]

for _shift, _label in enumerate(_labels_1y, start=1):
    X, y = generate_data_1y(_shift)

    X.to_csv("../data/X_1_year|" + _label + "_exhaustive.csv")
    y.to_csv("../data/y_1_year|" + _label + "_exhaustive.csv")


(508, 4070) (508, 36)
First time sliceof X ends at  EDSS_0
First y column is EDSS_12
Last column for X is  EDSS_216
Last y column is  EDSS_222
(508, 3960) (508, 35)
First time sliceof X ends at  EDSS_0
First y column is EDSS_18
Last column for X is  EDSS_210
Last y column is  EDSS_222
(508, 3850) (508, 34)
First time sliceof X ends at  EDSS_0
First y column is EDSS_24
Last column for X is  EDSS_204
Last y column is  EDSS_222
(508, 3740) (508, 33)
First time sliceof X ends at  EDSS_0
First y column is EDSS_30
Last column for X is  EDSS_198
Last y column is  EDSS_222
(508, 3630) (508, 32)
First time sliceof X ends at  EDSS_0
First y column is EDSS_36
Last column for X is  EDSS_192
Last y column is  EDSS_222
(508, 3520) (508, 31)
First time sliceof X ends at  EDSS_0
First y column is EDSS_42
Last column for X is  EDSS_186
Last y column is  EDSS_222
(508, 3410) (508, 30)
First time sliceof X ends at  EDSS_0
First y column is EDSS_48
Last column for X is  EDSS_180
Last y column is  EDSS_222

### 1.5 years|All

In [285]:
def generate_data_1y6m(sc):
    """Build the X / y frames for the 1.5-year setting with a shift of ``sc`` steps.

    Reads the notebook-level ``updatedDf``, ``n`` and ``n_inputs_pure``.

    X is the first ``n_inputs_pure * (n - sc + 1)`` columns of ``updatedDf``,
    with its EDSS columns replaced by row-wise interpolated values; y is the
    un-imputed EDSS columns without ``EDSS_0`` ... ``EDSS_<6*(sc+1)>``.
    """
    edss_names = select_columns(['EDSS'], n)

    # targets stay raw; inputs get EDSS interpolated along each row
    raw_targets = updatedDf[edss_names].copy()
    filled_edss = updatedDf[edss_names].copy()
    filled_edss.interpolate(axis=1, limit_direction="both", inplace=True)
    trailing_names = list(filled_edss.columns)[-sc:]
    filled_inputs = filled_edss.drop(trailing_names, axis=1).copy()

    n_input_columns = n_inputs_pure * (n - (sc - 1))
    X = updatedDf.iloc[:, :n_input_columns].copy()
    y = raw_targets.drop(select_columns(['EDSS'], sc + 1), axis=1).copy()
    print(X.shape, y.shape)

    for edss_col in filled_inputs.columns:
        X[edss_col] = filled_inputs[edss_col]

    X.replace('O', 0, inplace=True)

    x_names = X.columns.tolist()
    y_names = y.columns.tolist()
    print("First time sliceof X ends at ", x_names[109])
    print("First y column is", y_names[0])

    print("Last column for X is ", x_names[-1])
    print("Last y column is ", y_names[-1])

    return X, y


In [289]:
# Export the 1.5-year data for shifts 1..10; each shift is saved under the
# file label listed at the matching position.
_labels_1y6m = ["6_months", "1_year", "1.5_years", "2_years", "2.5_years",
                "3_years", "3.5_years", "4_years", "4.5_years", "5_years"]

for _shift, _label in enumerate(_labels_1y6m, start=1):
    X, y = generate_data_1y6m(_shift)

    X.to_csv("../data/X_1.5_years|" + _label + "_exhaustive.csv")
    y.to_csv("../data/y_1.5_years|" + _label + "_exhaustive.csv")

(508, 4070) (508, 35)
First time sliceof X ends at  EDSS_0
First y column is EDSS_18
Last column for X is  EDSS_216
Last y column is  EDSS_222


### 2y|All

In [290]:
def generate_data_2y(sc):
    """Build the X / y frames for the 2-year setting with a shift of ``sc`` steps.

    Reads the notebook-level ``updatedDf``, ``n`` and ``n_inputs_pure``.

    X is the first ``n_inputs_pure * (n - sc + 1)`` columns of ``updatedDf``,
    with its EDSS columns replaced by row-wise interpolated values; y is the
    un-imputed EDSS columns without ``EDSS_0`` ... ``EDSS_<6*(sc+2)>``.
    """
    edss_names = select_columns(['EDSS'], n)

    # targets stay raw; inputs get EDSS interpolated along each row
    raw_targets = updatedDf[edss_names].copy()
    filled_edss = updatedDf[edss_names].copy()
    filled_edss.interpolate(axis=1, limit_direction="both", inplace=True)
    trailing_names = list(filled_edss.columns)[-sc:]
    filled_inputs = filled_edss.drop(trailing_names, axis=1).copy()

    n_input_columns = n_inputs_pure * (n - (sc - 1))
    X = updatedDf.iloc[:, :n_input_columns].copy()
    y = raw_targets.drop(select_columns(['EDSS'], sc + 2), axis=1).copy()
    print(X.shape, y.shape)

    for edss_col in filled_inputs.columns:
        X[edss_col] = filled_inputs[edss_col]

    X.replace('O', 0, inplace=True)

    x_names = X.columns.tolist()
    y_names = y.columns.tolist()
    print("First time sliceof X ends at ", x_names[109])
    print("First y column is", y_names[0])

    print("Last column for X is ", x_names[-1])
    print("Last y column is ", y_names[-1])

    return X, y


In [291]:
# Build the 2-year data with sc = 1 and write both frames to CSV.
X, y = generate_data_2y(1)

X.to_csv("../data/X_2_years|6_months_exhaustive.csv")
y.to_csv("../data/y_2_years|6_months_exhaustive.csv")

(508, 4070) (508, 34)
First time sliceof X ends at  EDSS_0
First y column is EDSS_24
Last column for X is  EDSS_216
Last y column is  EDSS_222


In [292]:
# Build the 2-year data with sc = 2 and write both frames to CSV.
X, y = generate_data_2y(2)

X.to_csv("../data/X_2_years|1_year_exhaustive.csv")
y.to_csv("../data/y_2_years|1_year_exhaustive.csv")

(508, 3960) (508, 33)
First time sliceof X ends at  EDSS_0
First y column is EDSS_30
Last column for X is  EDSS_210
Last y column is  EDSS_222


In [293]:
# Build the 2-year data with sc = 3 and write both frames to CSV.
X, y = generate_data_2y(3)

X.to_csv("../data/X_2_years|1.5_years_exhaustive.csv")
y.to_csv("../data/y_2_years|1.5_years_exhaustive.csv")

(508, 3850) (508, 32)
First time sliceof X ends at  EDSS_0
First y column is EDSS_36
Last column for X is  EDSS_204
Last y column is  EDSS_222


In [294]:
# Build the 2-year data with sc = 4 and write both frames to CSV.
X, y = generate_data_2y(4)

X.to_csv("../data/X_2_years|2_years_exhaustive.csv")
y.to_csv("../data/y_2_years|2_years_exhaustive.csv")

(508, 3740) (508, 31)
First time sliceof X ends at  EDSS_0
First y column is EDSS_42
Last column for X is  EDSS_198
Last y column is  EDSS_222


In [295]:
# Build the 2-year data with sc = 5 and write both frames to CSV.
X, y = generate_data_2y(5)

X.to_csv("../data/X_2_years|2.5_years_exhaustive.csv")
y.to_csv("../data/y_2_years|2.5_years_exhaustive.csv")

(508, 3630) (508, 30)
First time sliceof X ends at  EDSS_0
First y column is EDSS_48
Last column for X is  EDSS_192
Last y column is  EDSS_222


In [296]:
# Build the 2-year data with sc = 6 and write both frames to CSV.
X, y = generate_data_2y(6)

X.to_csv("../data/X_2_years|3_years_exhaustive.csv")
y.to_csv("../data/y_2_years|3_years_exhaustive.csv")

(508, 3520) (508, 29)
First time sliceof X ends at  EDSS_0
First y column is EDSS_54
Last column for X is  EDSS_186
Last y column is  EDSS_222


In [297]:
# Build the 2-year data with sc = 7 and write both frames to CSV.
X, y = generate_data_2y(7)

X.to_csv("../data/X_2_years|3.5_years_exhaustive.csv")
y.to_csv("../data/y_2_years|3.5_years_exhaustive.csv")

(508, 3410) (508, 28)
First time sliceof X ends at  EDSS_0
First y column is EDSS_60
Last column for X is  EDSS_180
Last y column is  EDSS_222


In [298]:
# Build the 2-year data with sc = 8 and write both frames to CSV.
X, y = generate_data_2y(8)

X.to_csv("../data/X_2_years|4_years_exhaustive.csv")
y.to_csv("../data/y_2_years|4_years_exhaustive.csv")

(508, 3300) (508, 27)
First time sliceof X ends at  EDSS_0
First y column is EDSS_66
Last column for X is  EDSS_174
Last y column is  EDSS_222


In [299]:
# Build the 2-year data with sc = 9 and write both frames to CSV.
X, y = generate_data_2y(9)

X.to_csv("../data/X_2_years|4.5_years_exhaustive.csv")
y.to_csv("../data/y_2_years|4.5_years_exhaustive.csv")

(508, 3190) (508, 26)
First time sliceof X ends at  EDSS_0
First y column is EDSS_72
Last column for X is  EDSS_168
Last y column is  EDSS_222


In [300]:
# Build the 2-year data with sc = 10 and write both frames to CSV.
X, y = generate_data_2y(10)

X.to_csv("../data/X_2_years|5_years_exhaustive.csv")
y.to_csv("../data/y_2_years|5_years_exhaustive.csv")

(508, 3080) (508, 25)
First time sliceof X ends at  EDSS_0
First y column is EDSS_78
Last column for X is  EDSS_162
Last y column is  EDSS_222
