In [1]:
import warnings
# Silence every warning for the rest of the session. Because this runs before
# the third-party imports below, warnings emitted while importing them are
# suppressed as well.
# NOTE(review): this also hides any warnings raised later by the imputer or
# the models (e.g. solver convergence) — consider narrowing the filter.
warnings.filterwarnings("ignore") 
import os
import numpy as np
import polars as pl
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# Folder holding the train/test CSV splits and the two demographic XPT files.
# The default is the original machine-specific location; set the FYP_DATA_DIR
# environment variable to run the notebook against a copy stored elsewhere.
# (The name DATA_DIR is used instead of `dir` so the builtin is not shadowed.)
DATA_DIR = os.environ.get("FYP_DATA_DIR", r"A:\fyp\data")

# os.path.join inserts exactly one separator, so the base path needs no
# trailing backslash and the file names need no leading one.
train_df = pl.read_csv(os.path.join(DATA_DIR, "training_dataset.csv"))
test_df = pl.read_csv(os.path.join(DATA_DIR, "testing_dataset.csv"))

# The XPT (SAS transport) files are read with pandas and then converted to
# polars so the rest of the notebook works on a single frame type.
demo1112 = pl.from_pandas(pd.read_sas(os.path.join(DATA_DIR, "DEMO_G.XPT")))
demo1314 = pl.from_pandas(pd.read_sas(os.path.join(DATA_DIR, "DEMO_H.XPT")))

In [22]:
def demo_clean(df):
    """Reduce a demographics frame to three columns and blank out special codes.

    Keeps only ``SEQN``, ``DMDEDUC2`` and ``DMDMARTL``. In ``DMDMARTL`` the
    values 77 and 99, and in ``DMDEDUC2`` the values 7 and 9, are replaced by
    ``np.nan``; every other value is passed through unchanged
    (``default=pl.first()`` keeps the original value).

    NOTE(review): 77/99 and 7/9 are presumably the survey's
    "refused" / "don't know" codes — confirm against the codebook.

    Parameters
    ----------
    df : polars.DataFrame
        Frame that contains at least the three columns listed above.

    Returns
    -------
    polars.DataFrame
        The three selected columns, in the order
        ``SEQN``, ``DMDEDUC2``, ``DMDMARTL``.
    """
    # Per-column lookup of the codes that should be treated as missing.
    codes_to_blank = {
        'DMDMARTL': {77: np.nan, 99: np.nan},
        'DMDEDUC2': {7: np.nan, 9: np.nan},
    }

    subset = df.select(['SEQN', 'DMDEDUC2', 'DMDMARTL'])

    # The two replacements touch different columns, so they can be applied
    # together in a single with_columns call.
    return subset.with_columns(
        [
            pl.col(column).map_dict(mapping, default=pl.first())
            for column, mapping in codes_to_blank.items()
        ]
    )

In [48]:
# Clean each demographics file separately, then stack them into one table.
demo1 = demo_clean(demo1112)
demo2 = demo_clean(demo1314)
demo_df = demo1.vstack(demo2)

# Fill the missing entries from the single nearest row.
# NOTE(review): SEQN is one of the columns handed to the imputer, so the
# identifier takes part in the neighbour search — confirm this is intended.
imputer = KNNImputer(n_neighbors=1)
imputed_values = imputer.fit_transform(demo_df)

# fit_transform returns a bare array: restore the column names, then cast
# every column to Int64.
demo_final = (
    pl.DataFrame(imputed_values, schema=demo_df.columns)
    .with_columns(pl.all().cast(pl.Int64))
)

In [50]:
def _join_demographics(frame, demo):
    """Left-join ``demo`` onto ``frame`` by ``SEQN``, safely re-runnable.

    This cell rebinds ``train_df`` / ``test_df`` to the joined result, so
    executing it a second time would otherwise join the demographic columns
    onto frames that already carry them. Any non-key column of ``demo`` that
    is already present in ``frame`` is therefore dropped first; on a first
    run nothing is dropped and the result is a plain left join.

    Parameters
    ----------
    frame : polars.DataFrame
        Frame with a ``SEQN`` column.
    demo : polars.DataFrame
        Demographics frame with a ``SEQN`` column.

    Returns
    -------
    polars.DataFrame
        ``frame`` with the non-key columns of ``demo`` attached.
    """
    already_attached = [
        column
        for column in demo.columns
        if column != "SEQN" and column in frame.columns
    ]
    if already_attached:
        frame = frame.drop(already_attached)
    return frame.join(demo, on="SEQN", how="left")


train_df = _join_demographics(train_df, demo_final)
test_df = _join_demographics(test_df, demo_final)

In [73]:
def preprocess(df, name = 'func'):
    """Split ``df`` into a feature frame and a target frame.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing ``SEQN``, ``func_score`` and the columns whose names
        start with ``impaired``.
    name : str, default 'func'
        ``'func'`` selects ``func_score`` as the target; any other value
        selects all ``impaired*`` columns.

    Returns
    -------
    tuple
        ``(feature_df, target_df)``. ``feature_df`` is ``df`` without
        ``SEQN``, ``func_score`` and the ``impaired*`` columns.
    """
    # Read the label columns from the frame being processed rather than from
    # the notebook-global train_df, so the function depends only on its
    # argument and works for any frame passed in.
    impaired_list = [item for item in df.columns if item.startswith('impaired')]
    feature_df = df.drop(['SEQN', 'func_score'] + impaired_list)
    if name == 'func':
        target_df = df.select(pl.col('func_score'))
    else:
        target_df = df.select(pl.col(impaired_list))
    return feature_df, target_df

In [79]:
# Split both partitions into features and the impaired* target columns
# (any name other than 'func' selects the impaired* targets).
(x_train, y_train), (x_test, y_test) = (
    preprocess(partition, 'imp') for partition in (train_df, test_df)
)

In [96]:
targets = [f'impaired_{i}' for i in range(1, 4)]


def _second_class_scores(fitted, features):
    """Return column 1 of ``fitted.predict_proba(features)``."""
    return fitted.predict_proba(features)[:, 1]


# Fit one unpenalised logistic regression per target and report its AUC on
# the training and testing partitions.
# NOTE(review): warnings are silenced in the first cell, so a solver that
# stops before converging would go unnoticed here — worth checking.
for target in targets:
    # Labels for the current target.
    train_y, test_y = y_train[target], y_test[target]

    model = LogisticRegression(penalty=None)
    model.fit(x_train, train_y)

    train_auc = roc_auc_score(train_y, _second_class_scores(model, x_train))
    test_auc = roc_auc_score(test_y, _second_class_scores(model, x_test))

    # Three report lines followed by one blank separator line.
    print(
        f"Target Variable: {target}\n"
        f"Training AUC: {round(train_auc, 4)}\n"
        f"Testing AUC: {round(test_auc, 4)}\n"
    )

Target Variable: impaired_1
Training AUC: 0.6101
Testing AUC: 0.6125

Target Variable: impaired_2
Training AUC: 0.6563
Testing AUC: 0.6791

Target Variable: impaired_3
Training AUC: 0.7954
Testing AUC: 0.6781

