# Data Merge and Train/Validation/Test Split for RoBERTa

In [1]:
pip install pickle5



In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
import pickle5 as pickle
from sklearn.model_selection import train_test_split

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive; data_dir
# (defined below) points at a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def lower_col_names(df):
    '''
    Normalise the column labels of a pd dataframe in place.

    Every string label has surrounding whitespace stripped and is lowercased.
    Labels that are not strings (e.g. integer column positions) are left
    untouched, so frames whose columns are not all text are handled too.

    Input: df - pd dataframe; its column labels are modified in place
    Output: the same pd dataframe object, returned so callers can reassign it
    '''
    def _normalise(label):
        # Only text can be stripped/lowercased; pass anything else through.
        return label.strip().lower() if isinstance(label, str) else label

    # Index.map works for any label type, whereas the .str accessor raises on
    # non-string Indexes and yields NaN for non-string entries in mixed ones.
    df.columns = df.columns.map(_normalise)

    return df

In [5]:
# Base folder on the mounted Drive that every input is read from and every
# split is written to. File paths are built by plain string concatenation
# (data_dir + file name), so the trailing slash is required.
data_dir = "/content/drive/MyDrive/NLP Winter 2022 N2C2/"

In [6]:
# Load the note table and the label table from data_dir.
# NOTE(review): unpickling can execute arbitrary code stored in the file; only
# load pickles that were produced by this project.
# The context managers close each file handle as soon as its table is loaded.
with open(data_dir + 'best_note_df.pkl', "rb") as best_note_pickle:
    best_note = pickle.load(best_note_pickle)
# The TEXT column is not carried forward; rebind rather than mutate in place.
best_note = best_note.drop(columns=['TEXT'])

with open(data_dir + 'labels_final_df.pkl', "rb") as labels_pickle:
    labels = pickle.load(labels_pickle)
# Strip/lowercase the label column names so they line up with the lowercase
# merge keys ('hadm_id', 'subject_id') used when the tables are joined.
labels = lower_col_names(labels)

In [7]:
# Read the non-text feature table and normalise its column names in one step.
non_text = lower_col_names(pd.read_csv(data_dir + 'non_text_features.csv'))

In [8]:
# Load the matrix stored in sparse_roberta.npz as a plain ndarray; toarray()
# avoids the legacy np.matrix type that todense() returns.
roberta = sparse.load_npz(data_dir + 'sparse_roberta.npz').toarray()

# The matrix rows are attached to the notes positionally, so there must be
# exactly one row per note.
if roberta.shape[0] != len(best_note):
    raise ValueError(
        "sparse_roberta.npz has {} rows but best_note has {} rows".format(
            roberta.shape[0], len(best_note)
        )
    )

# Work on a deep copy so best_note itself keeps its original column names.
bn_roberta_rl_los = lower_col_names(best_note.copy(deep=True))
# Each matrix row is stored as a Python list in a single 'roberta' column.
bn_roberta_rl_los['roberta'] = roberta.tolist()
# merge() defaults to an inner join: only rows whose (hadm_id, subject_id)
# pair is present in all three tables survive.
bn_roberta_rl_los = bn_roberta_rl_los.merge(labels, on = ['hadm_id', 'subject_id'])
bn_roberta_rl_los = bn_roberta_rl_los.merge(non_text, on = ['hadm_id', 'subject_id'])
# All three inputs already have normalised column names, so the merged frame
# needs no further renaming.

In [9]:
# Split with both targets kept together in y: hold out 20% of the rows for
# testing, then take 25% of the remainder for validation (roughly 60/20/20).
los_read_targets = ['readmit', 'stay_length_sec']
x_train, x_test, y_train, y_test = train_test_split(
    bn_roberta_rl_los.drop(columns=los_read_targets),
    bn_roberta_rl_los[los_read_targets],
    test_size=0.2,
    random_state=1,
)

# Carve the validation set out of the remaining training rows.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=1
)

In [10]:
# Persist the six two-target splits under data_dir, one pickle per frame,
# in the same order as listed here.
los_read_outputs = {
    "roberta_los_read_x_train.pkl": x_train,
    "roberta_los_read_y_train.pkl": y_train,
    "roberta_los_read_x_val.pkl": x_val,
    "roberta_los_read_y_val.pkl": y_val,
    "roberta_los_read_x_test.pkl": x_test,
    "roberta_los_read_y_test.pkl": y_test,
}
for los_read_file, los_read_frame in los_read_outputs.items():
    los_read_frame.to_pickle(data_dir + los_read_file)

In [11]:
# Readmission-only variant: the features still exclude both outcome columns,
# but y keeps just 'readmit'. The sizes and seed match the two-target split
# (20% test, then 25% of the remainder for validation).
outcome_cols = ['readmit', 'stay_length_sec']
x_train, x_test, y_train, y_test = train_test_split(
    bn_roberta_rl_los.drop(columns=outcome_cols),
    bn_roberta_rl_los[['readmit']],
    test_size=0.2,
    random_state=1,
)

# Carve the validation set out of the remaining training rows.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=1
)

In [12]:
# Persist the six readmission-only splits under data_dir, one pickle per
# frame, in the same order as listed here.
read_outputs = {
    "roberta_read_x_train.pkl": x_train,
    "roberta_read_y_train.pkl": y_train,
    "roberta_read_x_val.pkl": x_val,
    "roberta_read_y_val.pkl": y_val,
    "roberta_read_x_test.pkl": x_test,
    "roberta_read_y_test.pkl": y_test,
}
for read_file, read_frame in read_outputs.items():
    read_frame.to_pickle(data_dir + read_file)