# Autoencoder train/test split

## Setup & read in data 

In [49]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import os

In [7]:
data_df = pd.read_csv('../../data/processed/combined_processed_data.csv')

In [8]:
data_df.head()

Unnamed: 0,Rndrng_NPI,Rndrng_Prvdr_Gndr,Drug_Tot_HCPCS_Cds,Drug_Tot_Benes,Drug_Tot_Srvcs,Drug_Sbmtd_Chrg,Drug_Mdcr_Alowd_Amt,Drug_Mdcr_Pymt_Amt,Drug_Mdcr_Stdzd_Amt,Med_Tot_HCPCS_Cds,...,Rndrng_Prvdr_Type_Speech Language Pathologist,Rndrng_Prvdr_Type_Sports Medicine,Rndrng_Prvdr_Type_Surgical Oncology,Rndrng_Prvdr_Type_Thoracic Surgery,Rndrng_Prvdr_Type_Undefined Physician type,Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine,Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty,Rndrng_Prvdr_Type_Urology,Rndrng_Prvdr_Type_Vascular Surgery,excluded
0,1003000126,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1003000134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1003000407,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1003000480,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1003000522,1.0,9.0,144.5,167.5,38892.565,13229.245,13197.3,13262.99,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Separate fraudulent vs. non-fraudulent data 

Note. The training data is normal only, and the test data is a mix of normal and fraudulent. 

In [14]:
data_df_fraud = data_df.loc[data_df["excluded"] == 1]
data_df_nonfraud = data_df.loc[data_df["excluded"] == 0]

In [17]:
data_train, data_test_norm = train_test_split(data_df_nonfraud, test_size = 0.01, random_state =1) 

In [19]:
data_train.shape

(1068369, 108)

In [21]:
data_test_norm.shape

(10792, 108)

In [25]:
data_df_fraud.shape

(330, 108)

In [29]:
# concatenate normal & abnormal test cases 
data_test = pd.concat([data_test_norm, data_df_fraud], axis=0)
data_test = shuffle(data_test, random_state=1)

In [30]:
data_test.shape 

(11122, 108)

## Split X & y 

In [38]:
y_test = data_test['excluded'].tolist()
X_test = data_test.drop(['excluded'], axis=1).to_numpy()

In [39]:
sum(y_test)/len(y_test) # proportion of positive cases 

0.029670922495953964

In [35]:
y_train = data_train["excluded"].tolist()
X_train = data_train.drop(["excluded"], axis=1).to_numpy()

## Scale X values 

In [43]:
stdsc = StandardScaler()

X_train = stdsc.fit_transform(X_train) 
X_test = stdsc.transform(X_test) 

In [45]:
X_train.shape 

(1068369, 107)

In [46]:
X_test.shape 

(11122, 107)

## Save data 

In [47]:
np.savez_compressed('../../data/processed/train', a=X_train, b=y_train)

In [48]:
np.savez_compressed('../../data/processed/test', a=X_test, b=y_test)