In [72]:
import pandas as pd

In [73]:
df = pd.read_sas('data.sas7bdat')
# decode all object columns that are bytes to strings
for col in df.select_dtypes(include=['object']).columns:
    if df[col].apply(lambda x: isinstance(x, bytes)).any():
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

In [74]:
df.head()

Unnamed: 0,randhosp_id,randpat_id,pretrialexp,country,trialphase,phase,itt_treat,age,gender,deathcode,...,censor18,surv18,plan18,UKextra,disab_unknown6,vital_and_disabunknown6,disab_unknown18,vital_and_disabunknown18,treatment,haltcode
0,90,1,2.0,UK,Open,2.0,0.0,81.0,1.0,E4,...,0.0,158.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,
1,12,2,1.0,SWEDEN,Open,2.0,1.0,92.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
2,43,3,1.0,POLAND,Open,2.0,1.0,75.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,Placebo,
3,77,4,2.0,UK,Open,2.0,0.0,60.0,1.0,,...,1.0,548.0,1.0,2.0,0.0,0.0,0.0,0.0,rt-PA,I63
4,43,5,1.0,POLAND,Open,2.0,0.0,88.0,1.0,,...,,,2.0,2.0,0.0,0.0,,,rt-PA,


In [75]:
df['deathcode'].value_counts()

deathcode
E1    536
E4    307
E3    194
E8     79
E9     77
E2     47
E7     21
Name: count, dtype: int64

In [76]:
# Keep only the rows where deathcode is NaN, e1 (cerebrovascular) or e3 (cardiovascular)
df = df[df['deathcode'].isna() | (df['deathcode'] == 'E1') | (df['deathcode'] == 'E3')]
df['deathcode'] = (~df['deathcode'].isna()).astype(bool)

Choose the rows with no randomisation violation

In [77]:
df['randvioltype'].value_counts()

randvioltype
Not independent in ADL                 15
Pre-randomisation low dose heparin     15
Haemorrhage on R scan                   1
Advanced ischaemic change on R scan     1
Name: count, dtype: int64

In [78]:
df = df[df['randvioltype'].isna()]
len(df)

2472

# Variable explanations

In [79]:
from scripts.columns import GENERAL, GENERAL_FORMATS, RAND_FORM, RAND_FORM_FORMATS
from scripts.preprocess import preprocess

SELECTED_COLS = GENERAL + RAND_FORM + ['deathcode']
FORMATS = GENERAL_FORMATS | RAND_FORM_FORMATS | {'deathcode': bool}

df = df[SELECTED_COLS]

df, stats_df = preprocess(df, SELECTED_COLS, FORMATS )

df.head()

Unnamed: 0,age,itt_treat,weight,glucose,sbprand,dbprand,weight.1,glucose.1,gcs_score_rand,nihss,...,visuospat_rand_Unknown,visuospat_rand_Yes,brainstemsigns_rand_Unknown,brainstemsigns_rand_Yes,otherdeficit_rand_Unknown,otherdeficit_rand_Yes,stroketype_Other,stroketype_PACI,stroketype_POCI,stroketype_TACI
1,1.246717,1.0,-0.298921,-0.512456,0.637048,0.721876,-0.298921,-0.512456,-0.259172,0.920518,...,True,False,False,False,False,False,False,False,False,True
2,-0.097911,1.0,-0.165178,-0.512456,-1.481019,-0.854515,-0.165178,-0.512456,0.741881,-1.098833,...,False,True,False,False,False,False,False,True,False,False
3,-1.284347,0.0,-0.432665,-0.918008,-1.142129,-0.58036,-0.432665,-0.918008,0.241354,0.199321,...,False,True,False,False,False,False,False,False,False,True
4,0.930334,0.0,-1.168254,-0.106904,0.933578,0.51626,-1.168254,-0.106904,-0.259172,-0.810355,...,False,False,False,False,False,False,False,True,False,False
5,0.613951,0.0,-0.499536,-0.512456,1.060662,1.201647,-0.499536,-0.512456,-2.261278,1.208997,...,False,True,False,False,False,False,False,False,False,True


In [80]:
from sklearn.model_selection import train_test_split

df = df.dropna()

# Split the data into train and test sets (e.g., 80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert all boolean columns to float (1.0 or 0.0)
for df_ in [train_df, test_df]:
    bool_cols = df_.select_dtypes(include=['bool']).columns
    df_[bool_cols] = df_[bool_cols].astype(float)


# Optionally, display the shapes of the splits
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (1798, 54)
Test shape: (450, 54)


In [81]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
import numpy as np

# Prepare data for classification
# Let's classify 'deathcode' (already encoded in preprocess) as the target
# Remove rows with missing target
train_df = train_df.dropna(subset=['deathcode'])
test_df = test_df.dropna(subset=['deathcode'])

# Remove any rows with NaN in features (to avoid NaN in input)
X_train = train_df.drop(columns=['deathcode'])
X_test = test_df.drop(columns=['deathcode'])

# Remove columns with any NaN in train or test
nan_cols = X_train.columns[X_train.isna().any() | X_test.isna().any()]
if len(nan_cols) > 0:
    print(f"Removing columns with NaN: {list(nan_cols)}")
    X_train = X_train.drop(columns=nan_cols)
    X_test = X_test.drop(columns=nan_cols)

# Remove any remaining rows with NaN (shouldn't be needed, but just in case)
X_train = X_train.dropna()
X_test = X_test.dropna()

# Align y with X after dropping rows
y_train = train_df.loc[X_train.index, 'deathcode'].astype(int).values
y_test = test_df.loc[X_test.index, 'deathcode'].astype(int).values

X_train = X_train.values
X_test = X_test.values

# Check for inf/-inf in data
def clean_inf(X):
    X = np.where(np.isinf(X), np.nan, X)
    return np.nan_to_num(X, nan=0.0)

X_train = clean_inf(X_train)
X_test = clean_inf(X_test)

# Convert to torch tensors
X_train_tensor = torch.tensor(X_train.astype('float32'), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.astype('float32'), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define the fully-connected neural network
class FCNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, output_dim=2):
        super(FCNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.net(x)

input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))  # Should be 2 for binary classification

model = FCNN(input_dim, hidden_dim=64, output_dim=output_dim)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Training loop
epochs = 100
batch_size = 64

for epoch in range(epochs):
    model.train()
    permutation = torch.randperm(X_train_tensor.size()[0])
    epoch_loss = 0.0
    batch_count = 0
    for i in range(0, X_train_tensor.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        optimizer.zero_grad()
        outputs = model(batch_x)
        # Check for NaN in outputs or batch_y
        if torch.isnan(outputs).any() or torch.isnan(batch_y.float()).any():
            print("NaN detected in outputs or targets!")
            continue
        loss = criterion(outputs, batch_y)
        if torch.isnan(loss):
            print("NaN loss detected! Skipping batch.")
            continue
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        batch_count += 1
    avg_loss = epoch_loss / max(batch_count, 1)
    if (epoch+1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    logits = model(X_test_tensor)
    preds = torch.argmax(logits, dim=1).numpy()
    acc = accuracy_score(y_test, preds)
    print(f"Test Accuracy: {acc:.4f}")


Epoch 1/100, Loss: 0.6404
Epoch 5/100, Loss: 0.4716
Epoch 10/100, Loss: 0.4572
Epoch 15/100, Loss: 0.4492
Epoch 20/100, Loss: 0.4595
Epoch 25/100, Loss: 0.4446
Epoch 30/100, Loss: 0.4430
Epoch 35/100, Loss: 0.4333
Epoch 40/100, Loss: 0.4267
Epoch 45/100, Loss: 0.4234
Epoch 50/100, Loss: 0.4150
Epoch 55/100, Loss: 0.4066
Epoch 60/100, Loss: 0.4077
Epoch 65/100, Loss: 0.4123
Epoch 70/100, Loss: 0.3851
Epoch 75/100, Loss: 0.3813
Epoch 80/100, Loss: 0.3858
Epoch 85/100, Loss: 0.3846
Epoch 90/100, Loss: 0.3736
Epoch 95/100, Loss: 0.3612
Epoch 100/100, Loss: 0.3539
Test Accuracy: 0.7333
