In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

In [3]:
# Load the SAS (sas7bdat) dataset.
data_path = "data.sas7bdat"
df = pd.read_sas(data_path, format='sas7bdat', encoding='utf-8')

# Some object columns may still carry raw bytes values; turn those into str
# and leave every other value (including missing ones) untouched.
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    holds_bytes = df[col].apply(lambda value: isinstance(value, bytes))
    if holds_bytes.any():
        df[col] = df[col].apply(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )

## Data exploration and cleaning

In [4]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,randhosp_id,randpat_id,pretrialexp,country,trialphase,phase,itt_treat,age,gender,deathcode,...,censor18,surv18,plan18,UKextra,disab_unknown6,vital_and_disabunknown6,disab_unknown18,vital_and_disabunknown18,treatment,haltcode
0,90,1,2.0,UK,Open,2.0,0.0,81.0,1.0,E4,...,0.0,158.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,
1,12,2,1.0,SWEDEN,Open,2.0,1.0,92.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
2,43,3,1.0,POLAND,Open,2.0,1.0,75.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
3,77,4,2.0,UK,Open,2.0,0.0,60.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,I63
4,43,5,1.0,POLAND,Open,2.0,0.0,88.0,1.0,,...,,,2.0,2.0,0.0,0.0,,,rt-PA,


In [5]:
# Report the dimensions of the loaded frame.
n_rows, n_features = df.shape
print("Number of rows:", n_rows)
print("Number of features:", n_features)

Number of rows: 3035
Number of features: 266


In [6]:
# Frequency of each recorded death code; value_counts() leaves out missing
# values by default, so rows without a death code are not shown here.
df['deathcode'].value_counts()

deathcode
E1    536
E4    307
E3    194
E8     79
E9     77
E2     47
E7     21
Name: count, dtype: int64

In [7]:
# Keep only the rows where deathcode is NaN, E1 (cerebrovascular) or E3 (cardiovascular).
# The explicit .copy() makes `df` an independent frame, so the column assignment
# below does not write into a slice of the original (SettingWithCopyWarning).
#
# NOTE: this cell overwrites `deathcode` and is therefore not idempotent:
# re-running it without reloading `df` would filter on booleans and drop every row.
kept_deathcodes = ['E1', 'E3']
keep_row = df['deathcode'].isna() | df['deathcode'].isin(kept_deathcodes)
df = df[keep_row].copy()

# Recode deathcode as a boolean flag: True where a death code was recorded
# (E1/E3 after the filter above), False where it was missing.
df['deathcode'] = df['deathcode'].notna()

Keep only the rows with no randomisation violation (i.e. `randvioltype` is missing):

In [8]:
# Frequency of each recorded randomisation-violation type; rows where
# randvioltype is missing are excluded by value_counts()' default.
df['randvioltype'].value_counts()

randvioltype
Not independent in ADL                 15
Pre-randomisation low dose heparin     15
Haemorrhage on R scan                   1
Advanced ischaemic change on R scan     1
Name: count, dtype: int64

In [9]:
# Retain only the rows for which no randomisation-violation type is recorded.
no_violation = df['randvioltype'].isna()
df = df[no_violation]

Examine other missing values:

In [10]:
# Share of missing entries per column; flag the columns above one half.
missing_fraction = df.isnull().mean()
cols_many_missing = df.columns[missing_fraction > 0.5]

print("Columns with more than 50% missing values:", len(cols_many_missing))
print("\n")
# List the flagged columns (in frame order) with their absolute missing counts.
missing_counts = df[cols_many_missing].isnull().sum()
for col, n_missing in missing_counts.items():
    print(f"{col}: {n_missing} missing values")

Columns with more than 50% missing values: 37


deathdate_unknown: 1711 missing values
randvioltype: 2472 missing values
haem_type7: 2472 missing values
nonstroke_type7: 2431 missing values
final_status: 1805 missing values
euroqol18: 1244 missing values
yrfu_code: 1974 missing values
waiver_code: 2443 missing values
extracranial_bleed_site: 2452 missing values
other_effect_code: 2363 missing values
nostartcode: 2438 missing values
event_days: 2224 missing values
hypodeg: 1499 missing values
hypodegsite: 1499 missing values
mca: 1499 missing values
affmca: 1572 missing values
aspcau: 1572 missing values
asplen: 1572 missing values
aspins: 1572 missing values
aspint: 1572 missing values
aspm1: 1572 missing values
aspm2: 1572 missing values
aspm3: 1572 missing values
aspm4: 1572 missing values
aspm5: 1572 missing values
aspm6: 1572 missing values
oial: 1499 missing values
aca: 2101 missing values
pca: 2101 missing values
subinf: 2101 missing values
cbzinf: 2101 missing values
cinf: 2101 

## Define action, context, and reward:

### Action

In [11]:
# The action is taken from the single `treatment` column.
action_cols = ['treatment']
actions = df[action_cols]

# Sanity check: count missing entries in the action column.
n_missing_actions = actions.isnull().sum().item()
print("Number of missing values in action 'treatment':", n_missing_actions)

Number of missing values in action 'treatment': 0


### Reward

In [12]:
# The reward is taken from the single `ohs6` column.
# NOTE(review): presumably a 6-month outcome score rather than a death
# indicator - confirm against the data dictionary.
reward_cols = ['ohs6']
rewards = df[reward_cols]

# Sanity check: count missing entries in the reward column.
n_missing_rewards = rewards.isnull().sum().item()
print("Number of missing values in reward:", n_missing_rewards)

Number of missing values in reward: 0


### Context:

In [13]:
# Columns used as the context (features). They are split into two groups
# that are concatenated into `context_columns` at the end of this cell.

# Group 1: judging by the names, enrolment/randomisation bookkeeping and
# demographics - TODO confirm against the data dictionary.
# NOTE(review): `randhosp_id` and `randpat_id` are presumably identifiers
# rather than predictive features; verify that they belong in the context.
# NOTE(review): `itt_treat` looks like the treatment allocation (in the
# df.head() preview 0.0 appears alongside rt-PA and 1.0 alongside Placebo);
# if so, it duplicates the action inside the context - confirm.
initial_info_columns = [
  "randhosp_id",
  "randpat_id",
  "pretrialexp",
  "country",
  "trialphase",
  "phase",
  "itt_treat",
  "age",
  "gender",
  "randyear",
  "randmonth",
  "randhour",
  "randmin",
  "randdelay"
]

# Group 2: many of these carry a `_rand` suffix; presumably measurements
# taken at randomisation - verify.
rand_columns = [
  "livealone_rand",
  "indepinadl_rand",
  "infarct",
  "antiplat_rand",
  "atrialfib_rand",
  "sbprand",
  "dbprand",
  "weight",
  "glucose",
  "gcs_eye_rand",
  "gcs_motor_rand",
  "gcs_verbal_rand",
  "gcs_score_rand",
  "nihss",
  "liftarms_rand",
  "ablewalk_rand",
  "weakface_rand",
  "weakarm_rand",
  "weakleg_rand",
  "dysphasia_rand",
  "hemianopia_rand",
  "visuospat_rand",
  "brainstemsigns_rand",
  "otherdeficit_rand",
  "stroketype",
  "pred_nihss",
  "konprob",
  # "randvioltype"  (left out: only rows with a missing randvioltype are kept
  #                  earlier in the notebook, so this column is entirely NaN)
]

# Full ordered list of context features: group 1 followed by group 2.
context_columns = initial_info_columns + rand_columns

In [14]:
# Slice the context features out of the cleaned frame.
context = df[context_columns]

# Count missing values per context column and remember which columns have any.
missing_context = context.isnull().sum()
has_missing = missing_context > 0
missing_context_cols = missing_context[has_missing].index.tolist()

print("Number of missing values in context features:")
print(missing_context[has_missing])

Number of missing values in context features:
dbprand     17
glucose    211
dtype: int64


In [15]:
# Fill the gaps in the incomplete context columns with each column's mean.
imputer = SimpleImputer(strategy='mean')
filled_values = imputer.fit_transform(context[missing_context_cols])

# Work on a copy so the un-imputed `context` frame stays available.
context_imputed = context.copy()
context_imputed[missing_context_cols] = filled_values

# Display the remaining missing-value count per column.
context_imputed.isnull().sum()

randhosp_id            0
randpat_id             0
pretrialexp            0
country                0
trialphase             0
phase                  0
itt_treat              0
age                    0
gender                 0
randyear               0
randmonth              0
randhour               0
randmin                0
randdelay              0
livealone_rand         0
indepinadl_rand        0
infarct                0
antiplat_rand          0
atrialfib_rand         0
sbprand                0
dbprand                0
weight                 0
glucose                0
gcs_eye_rand           0
gcs_motor_rand         0
gcs_verbal_rand        0
gcs_score_rand         0
nihss                  0
liftarms_rand          0
ablewalk_rand          0
weakface_rand          0
weakarm_rand           0
weakleg_rand           0
dysphasia_rand         0
hemianopia_rand        0
visuospat_rand         0
brainstemsigns_rand    0
otherdeficit_rand      0
stroketype             0
pred_nihss             0


Export action, reward, and context to CSV:

In [18]:
# Combine action, reward and context column-wise (all three share df's index).
# Use the imputed context: concatenating the raw `context` frame would discard
# the mean imputation and leave its missing values in the exported data.
df_recombined = pd.concat([actions, rewards, context_imputed], axis=1)

In [20]:
# Write the combined frame to disk, omitting the index column.
output_csv = "action_reward_context_combined_processed.csv"
df_recombined.to_csv(output_csv, index=False)