In [2]:
# pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/4e/ba/ce9bd1cd4953336a0e213b29cb80bb11816f2a93de8c99f88ef0b446ad0c/scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import warnings
warnings.filterwarnings("ignore") 

In [2]:
import os
import numpy as np
import polars as pl
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
data_dir = r"C:\Users\patri\Documents\FYPCapstone\Data"

train_df = pd.read_csv(data_dir + '\\training_dataset.csv')
test_df = pd.read_csv(data_dir + '\\testing_dataset.csv')

In [4]:
def read_sas(fpath, row, max_processors):
    df,_ = pyreadstat.read_file_multiprocessing(pyreadstat.read_xport, fpath, num_processes=max_processors, num_rows = row)
    pldf = pl.from_pandas(df)
    return pldf

In [5]:
def demo_clean(df):
    target_col = ['SEQN', 'DMDEDUC2', 'DMDMARTL']
    df = df.select(target_col)

    replace1 = {77: "refused", 99: "don't know", }
    df = df.with_columns(pl.col(['DMDMARTL']).map_dict(replace1, default=pl.first()))

    replace2 = {7: "refused", 9: "don't know", }
    df = df.with_columns(pl.col(['DMDEDUC2']).map_dict(replace2, default=pl.first()))
 
    df = df.with_columns(pl.col(target_col).cast(pl.Float64, strict=False))
    df = df.drop_nulls()
    
    df = df.with_columns(df[target_col].cast(pl.Int64))

    return df

In [6]:
demo1112 = pl.from_pandas(pd.read_sas(data_dir + "\\DEMO_G.XPT"))
demo1314 = pl.from_pandas(pd.read_sas(data_dir +"\\DEMO_H.XPT"))

In [7]:
demo1 = demo_clean(demo1112)
demo2 = demo_clean(demo1314)
demo_added = demo1.vstack(demo2)
demo_df = demo_added.to_pandas()

In [8]:
train_df = train_df.merge(demo_df, left_on='SEQN', right_on='SEQN', how='left')
test_df = test_df.merge(demo_df, left_on='SEQN', right_on='SEQN', how='left')

In [23]:
train_df.to_csv(data_dir + "\\training_full.csv")
test_df.to_csv(data_dir + "\\testing_full.csv")

In [20]:
# Separate the features (input variables) and the target variables (impaired_1 to impaired_20)
identifier = ['SEQN', 'func_score'] 

X_train = train_df.drop(identifier + [f'impaired_{i}' for i in range(1, 21)], axis=1)
X_test = test_df.drop(identifier + [f'impaired_{i}' for i in range(1, 21)], axis=1)

targets = [f'impaired_{i}' for i in range(1, 4)] 

In [12]:
# Impute missing values using KNN imputation with K=1
imputer = KNNImputer(n_neighbors=1)

X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [21]:
# Iterate over each target variable and train a logistic regression model
for target in targets:
    # Prepare the data for the target variable
    y_train = train_df[target]
    y_test = test_df[target]

    try:
        model = LogisticRegression(penalty=None).fit(X_train, y_train)
        train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
        test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

        # Print the AUC for the current model
        print(f"Target Variable: {target}")
        print(f"Training AUC: {train_auc}")
        print(f"Testing AUC: {test_auc}")
        print()

    except: 
        print(f"{target} only contains one class.")

impaired_1 only contains one class.
impaired_2 only contains one class.
impaired_3 only contains one class.


In [22]:
X_train

Unnamed: 0,RIDAGEYR,RIAGENDR,PAXMTSM_Mean,PAXMXM_Mean,PAXMYM_Mean,PAXMZM_Mean,PAXLXMM_Mean,PAXLXSDM_Mean,PAXMTSM_Std_across_day,PAXMXM_Std_across_day,...,PAXLXMM_Std_across_day,PAXLXSDM_Std_across_day,Sedentary_Activity (minutes)_Mean,Moderate_Vigorous_Activity (minutes)_Mean,Light_Activity (minutes)_Mean,Sedentary_Activity (minutes)_Std,Moderate_Vigorous_Activity (minutes)_Std,Light_Activity (minutes)_Std,DMDEDUC2,DMDMARTL
0,52,1,11.139787,3.650883,3.653730,3.835158,408.414110,112.197932,2.514280,0.796507,...,169.216643,38.754074,904.069048,353.785714,79.928571,91.527379,120.207410,27.264795,2.0,3.0
1,78,1,5.759264,1.864521,1.707580,2.187171,36.001575,17.743938,0.852718,0.241300,...,12.011628,4.147129,1162.638095,121.428571,53.714286,189.912393,10.485818,27.426959,5.0,2.0
2,64,1,6.598757,2.211864,2.122199,2.264686,12.509308,3.390877,0.895302,0.272250,...,14.625536,3.580811,1133.677381,146.857143,60.642857,197.934447,46.424414,14.817541,3.0,1.0
3,45,1,7.396138,2.618598,2.282958,2.494578,50.977368,21.355084,1.631450,0.542346,...,46.795136,18.919624,1096.854762,186.500000,54.428571,109.702440,58.195790,21.637985,2.0,3.0
4,80,1,5.991603,1.894676,1.753518,2.343409,98.149471,32.159262,0.745077,0.229125,...,74.438832,10.711760,1131.567857,140.857143,65.357143,159.269802,24.272265,17.485028,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216,72,1,7.959607,2.623801,2.499310,2.836488,82.721158,37.985314,1.072147,0.389442,...,42.445438,21.654876,1079.639286,166.928571,91.214286,145.600461,40.857913,23.050411,5.0,1.0
2217,55,2,7.477389,2.516066,2.208224,2.753100,151.606082,41.453540,1.973554,0.632689,...,82.787053,22.463072,1082.211905,183.857143,71.714286,198.315350,58.144729,15.480787,4.0,3.0
2218,76,2,6.625667,2.184756,2.131138,2.309771,91.959676,31.777124,1.416653,0.472410,...,87.913462,14.692764,1115.497619,175.928571,46.357143,148.354633,63.933839,15.579672,3.0,1.0
2219,71,1,6.955536,2.416025,2.221246,2.318272,21.596539,10.459557,1.033829,0.374363,...,17.043222,5.394858,1131.854762,133.071429,79.071429,164.996272,25.489260,13.299212,4.0,1.0
