In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

In [2]:
#import saas file:
data_path = "data.sas7bdat"
df = pd.read_sas(data_path, format='sas7bdat', encoding='utf-8')

# decode all object columns that are bytes to strings
for col in df.select_dtypes(include=['object']).columns:
    if df[col].apply(lambda x: isinstance(x, bytes)).any():
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

## Data Exploration and cleaning 

In [3]:
df.head()

Unnamed: 0,randhosp_id,randpat_id,pretrialexp,country,trialphase,phase,itt_treat,age,gender,deathcode,...,censor18,surv18,plan18,UKextra,disab_unknown6,vital_and_disabunknown6,disab_unknown18,vital_and_disabunknown18,treatment,haltcode
0,90,1,2.0,UK,Open,2.0,0.0,81.0,1.0,E4,...,0.0,158.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,
1,12,2,1.0,SWEDEN,Open,2.0,1.0,92.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
2,43,3,1.0,POLAND,Open,2.0,1.0,75.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
3,77,4,2.0,UK,Open,2.0,0.0,60.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,I63
4,43,5,1.0,POLAND,Open,2.0,0.0,88.0,1.0,,...,,,2.0,2.0,0.0,0.0,,,rt-PA,


In [4]:
print("Number of rows:", df.shape[0])
print("Number of features:", df.shape[1])

Number of rows: 3035
Number of features: 266


In [5]:
df['deathcode'].value_counts()

deathcode
E1    536
E4    307
E3    194
E8     79
E9     77
E2     47
E7     21
Name: count, dtype: int64

In [6]:
# Keep only the rows where deathcode is NaN, e1 (cerebrovascular) or e3 (cardiovascular)
df = df[df['deathcode'].isna() | (df['deathcode'] == 'E1') | (df['deathcode'] == 'E3')]
df['deathcode'] = (~df['deathcode'].isna()).astype(bool)

Choose the rows with no randomisation violation

In [7]:
df['randvioltype'].value_counts()

randvioltype
Not independent in ADL                 15
Pre-randomisation low dose heparin     15
Haemorrhage on R scan                   1
Advanced ischaemic change on R scan     1
Name: count, dtype: int64

In [8]:
df = df[df['randvioltype'].isna()]

Examine other missing values:

In [9]:
cols_many_missing = df.columns[df.isnull().mean() > 0.5]

print("Columns with more than 50% missing values:", len(cols_many_missing.tolist()))
print("\n")
for col in df.columns:
    if col in cols_many_missing:
        print(f"{col}: {df[col].isnull().sum()} missing values")

Columns with more than 50% missing values: 37


deathdate_unknown: 1711 missing values
randvioltype: 2472 missing values
haem_type7: 2472 missing values
nonstroke_type7: 2431 missing values
final_status: 1805 missing values
euroqol18: 1244 missing values
yrfu_code: 1974 missing values
waiver_code: 2443 missing values
extracranial_bleed_site: 2452 missing values
other_effect_code: 2363 missing values
nostartcode: 2438 missing values
event_days: 2224 missing values
hypodeg: 1499 missing values
hypodegsite: 1499 missing values
mca: 1499 missing values
affmca: 1572 missing values
aspcau: 1572 missing values
asplen: 1572 missing values
aspins: 1572 missing values
aspint: 1572 missing values
aspm1: 1572 missing values
aspm2: 1572 missing values
aspm3: 1572 missing values
aspm4: 1572 missing values
aspm5: 1572 missing values
aspm6: 1572 missing values
oial: 1499 missing values
aca: 2101 missing values
pca: 2101 missing values
subinf: 2101 missing values
cbzinf: 2101 missing values
cinf: 2101 

## Define action, context, and reward:

### Action

In [35]:
action_cols = ['itt_treat']
actions = df[action_cols].copy()

print("itt_treat:", actions.head())

actions['itt_treat'] = ~actions['itt_treat'].astype(bool)
actions = actions.rename(columns={'itt_treat': 'treatment'})

actions['treatment'] = actions['treatment'].astype(int)
print("Number of missing values in action 'treatment':", actions.isnull().sum().item())

print("treat:", actions.head())

itt_treat:    itt_treat
1        1.0
2        1.0
3        0.0
4        0.0
5        0.0
Number of missing values in action 'treatment': 0
treat:    treatment
1          0
2          0
3          1
4          1
5          1


### Reward

In [36]:
# columns relating to death
reward_cols = ['ohs6']
rewards = df[reward_cols]

#check na
print("Number of missing values in reward:", rewards.isnull().sum().item())

Number of missing values in reward: 0


### Context:

In [37]:
initial_info_columns = [
  # "randhosp_id",
  # "randpat_id",
  "pretrialexp",
  "country",
  "trialphase",
  "phase",
  # "itt_treat",   ### removed since it's the action
  "age",
  "gender",
  # "randyear",
  # "randmonth",
  # "randhour",
  # "randmin",
  "randdelay"
]

rand_columns = [
  "livealone_rand",
  "indepinadl_rand",
  "infarct",
  "antiplat_rand",
  "atrialfib_rand",
  "sbprand",
  "dbprand",
  "weight",
  "glucose",
  "gcs_eye_rand",
  "gcs_motor_rand",
  "gcs_verbal_rand",
  "gcs_score_rand",
  "nihss",
  "liftarms_rand",
  "ablewalk_rand",
  "weakface_rand",
  "weakarm_rand",
  "weakleg_rand",
  "dysphasia_rand",
  "hemianopia_rand",
  "visuospat_rand",
  "brainstemsigns_rand",
  "otherdeficit_rand",
  "stroketype",
  "pred_nihss",
  "konprob",
  # "randvioltype"
]

context_columns = initial_info_columns + rand_columns

In [38]:
context = df[context_columns]

#check na in context
missing_context = context.isnull().sum()
missing_context_cols = missing_context[missing_context > 0].index.tolist()

print("Number of missing values in context features:")
print(missing_context[missing_context > 0])

Number of missing values in context features:
dbprand     17
glucose    211
dtype: int64


In [39]:
# impute missing context values
imputer = SimpleImputer(strategy='mean')
context_imputed = context.copy()
context_imputed[missing_context_cols] = imputer.fit_transform(context[missing_context_cols])
context_imputed.isnull().sum()

pretrialexp            0
country                0
trialphase             0
phase                  0
age                    0
gender                 0
randdelay              0
livealone_rand         0
indepinadl_rand        0
infarct                0
antiplat_rand          0
atrialfib_rand         0
sbprand                0
dbprand                0
weight                 0
glucose                0
gcs_eye_rand           0
gcs_motor_rand         0
gcs_verbal_rand        0
gcs_score_rand         0
nihss                  0
liftarms_rand          0
ablewalk_rand          0
weakface_rand          0
weakarm_rand           0
weakleg_rand           0
dysphasia_rand         0
hemianopia_rand        0
visuospat_rand         0
brainstemsigns_rand    0
otherdeficit_rand      0
stroketype             0
pred_nihss             0
konprob                0
dtype: int64

## Processing

Recombine reward, action, context for easy processing

In [40]:
df_recombined = pd.concat([actions, rewards, context], axis=1)

In [41]:
from scripts.columns import GENERAL, GENERAL_FORMATS, RAND_FORM, RAND_FORM_FORMATS
from scripts.preprocess import preprocess, min_max_normalize

#select all except ohs6
SELECTED_COLS = df_recombined.columns[(df_recombined.columns != 'ohs6')&(df_recombined.columns != 'treatment')]
OTHER_FORMATS = {}
for col in SELECTED_COLS:
    if col in GENERAL_FORMATS:
        OTHER_FORMATS[col] = GENERAL_FORMATS[col]
    elif col in RAND_FORM_FORMATS:
        OTHER_FORMATS[col] = RAND_FORM_FORMATS[col]
    elif df_recombined[col].dtype in ['int64', 'float64']:
        OTHER_FORMATS[col] = 'float64'
    elif df_recombined[col].dtype == 'bool':
        OTHER_FORMATS[col] = 'bool'
    elif df_recombined[col].dtype == 'object':
        # Check if it's binary Y/N type
        if len(df_recombined[col].dropna().unique()) <= 2:
            OTHER_FORMATS[col] = 'YNDQ'
        else:
            OTHER_FORMATS[col] = 'categorical'
    else:
        OTHER_FORMATS[col] = 'categorical'

FORMATS = OTHER_FORMATS 
FORMATS_FILTERED = {col: FORMATS[col] for col in SELECTED_COLS if col in FORMATS}

# Now run preprocessing with filtered formats
df_proc, stats_df_proc = preprocess(df_recombined, SELECTED_COLS, FORMATS_FILTERED)

df_proc.head()

Unnamed: 0,pretrialexp,phase,age,randdelay,sbprand,dbprand,weight,glucose,gcs_score_rand,nihss,...,visuospat_rand_Unknown,visuospat_rand_Yes,brainstemsigns_rand_Unknown,brainstemsigns_rand_Yes,otherdeficit_rand_Unknown,otherdeficit_rand_Yes,stroketype_Other,stroketype_PACI,stroketype_POCI,stroketype_TACI
1,-1.234669,0.295828,1.246717,-1.656541,0.637048,0.721876,-0.298921,-0.512456,-0.259172,0.920518,...,True,False,False,False,False,False,False,False,False,True
2,-1.234669,0.295828,-0.097911,0.974224,-1.481019,-0.854515,-0.165178,-0.512456,0.741881,-1.098833,...,False,True,False,False,False,False,False,True,False,False
3,0.809606,0.295828,-1.284347,0.360833,-1.142129,-0.58036,-0.432665,-0.918008,0.241354,0.199321,...,False,True,False,False,False,False,False,False,False,True
4,-1.234669,0.295828,0.930334,-1.615648,0.933578,0.51626,-1.168254,-0.106904,-0.259172,-0.810355,...,False,False,False,False,False,False,False,True,False,False
5,0.809606,0.295828,0.613951,-0.402498,1.060662,1.201647,-0.499536,-0.512456,-2.261278,1.208997,...,False,True,False,False,False,False,False,False,False,True


In [42]:
#check na in combined df
missing_combined = df_proc.isnull().sum()
missing_combined_cols = missing_combined[missing_combined > 0].index.tolist()

In [43]:
df_proc_recombined = pd.concat([df_recombined[['ohs6','treatment']], df_proc], axis=1)

In [44]:
#min max scale ohs6:
df_proc_recombined_all_proc = min_max_normalize(df_proc_recombined, 'ohs6')

df_proc_recombined_all_proc.head()

Unnamed: 0,ohs6,treatment,pretrialexp,phase,age,randdelay,sbprand,dbprand,weight,glucose,...,visuospat_rand_Unknown,visuospat_rand_Yes,brainstemsigns_rand_Unknown,brainstemsigns_rand_Yes,otherdeficit_rand_Unknown,otherdeficit_rand_Yes,stroketype_Other,stroketype_PACI,stroketype_POCI,stroketype_TACI
1,0.333333,0,-1.234669,0.295828,1.246717,-1.656541,0.637048,0.721876,-0.298921,-0.512456,...,True,False,False,False,False,False,False,False,False,True
2,0.166667,0,-1.234669,0.295828,-0.097911,0.974224,-1.481019,-0.854515,-0.165178,-0.512456,...,False,True,False,False,False,False,False,True,False,False
3,0.166667,1,0.809606,0.295828,-1.284347,0.360833,-1.142129,-0.58036,-0.432665,-0.918008,...,False,True,False,False,False,False,False,False,False,True
4,0.5,1,-1.234669,0.295828,0.930334,-1.615648,0.933578,0.51626,-1.168254,-0.106904,...,False,False,False,False,False,False,False,True,False,False
5,1.0,1,0.809606,0.295828,0.613951,-0.402498,1.060662,1.201647,-0.499536,-0.512456,...,False,True,False,False,False,False,False,False,False,True


### Export data

In [46]:
#export to csv
df_proc_recombined_all_proc.to_csv("action_reward_context_combined_processed.csv", index=False)