Data source: 2020 National Survey of Children's Health (NSCH) datasets, U.S. Census Bureau — https://www.census.gov/programs-surveys/nsch/data/datasets.2020.html#list-tab-491554181

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset
# `filename` is an empty placeholder: point it at the CSV file before running this cell.
# The parsed table is bound to `data`, which every later cell reads from.
filename = ''
data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Pre-processing: mean-impute missing values in the numeric columns of `data`.
# This cell runs before `data_final` exists (it is only built in the feature-selection
# cell further down), so the imputation is applied to `data` itself; frames sliced
# from `data` afterwards inherit the filled values.
imputer = SimpleImputer(strategy='mean')

# Columns the target is later derived from are left untouched: a mean-filled
# diagnosis/severity code would no longer be a valid class label.
target_source_columns = {'ADD_diagnosis', 'K2Q31A', 'K2Q31C'}

# Only numeric columns can be mean-imputed, and SimpleImputer drops columns that are
# entirely missing, which would break the column alignment when assigning back.
columns_to_impute = [
    column
    for column in data.select_dtypes(include='number').columns
    if column not in target_source_columns and data[column].notna().any()
]

# Guard against a frame with nothing to impute (fit_transform rejects zero features).
if columns_to_impute:
    data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

In [None]:
# Combine diagnosis and severity into a new target variable (intended range: 0 to 3)
def _diagnosis_flag(code):
    # 2 is the 'No ADD' code and becomes 0; every other value (missing included) becomes 1
    if code == 2:
        return 0
    return 1

data['ADD_diagnosis_binary'] = data['ADD_diagnosis'].map(_diagnosis_flag)

# Element-wise product of the diagnosis code and the severity code.
# NOTE(review): ADD_SEVERITY is assigned again in the correlation cell, so that later
# definition is the one the models are trained on -- confirm which formula is intended.
data['ADD_SEVERITY'] = data['K2Q31A'].mul(data['K2Q31C'])
target_variable = 'ADD_SEVERITY'

In [None]:
# Identify features with high correlation with the target

# Combine diagnosis and severity into a new variable ranging from 0 to 3.
# The target is built BEFORE the correlations are computed so that the ranking below
# is measured against the same target the models are trained on.
# NOTE(review): assumes K2Q31A is coded 1 = diagnosed / 2 = not diagnosed (the notebook
# treats 2 as 'No ADD') and that K2Q31C is a 1-3 severity code that is missing when
# there is no diagnosis -- confirm against the NSCH codebook.
target_inputs = ['K2Q31A', 'K2Q31C']
diagnosed = 2 - data['K2Q31A']        # 1 when diagnosed, 0 when not; NaN stays NaN
severity = data['K2Q31C'].fillna(0)   # a missing severity contributes 0 instead of NaN
data['ADD_SEVERITY'] = diagnosed * severity
target_variable = 'ADD_SEVERITY'

# Restrict to numeric columns: DataFrame.corr() raises on non-numeric columns in
# recent pandas versions instead of silently dropping them.
correlation_matrix = data.select_dtypes(include='number').corr()

target_correlation = correlation_matrix[target_variable].sort_values(ascending=False)

# The target itself and the columns it is computed from correlate with it trivially,
# so they are not reported as candidate features.
excluded_columns = {target_variable, *target_inputs}
strong_correlation = target_correlation[target_correlation.abs() > 0.2]
highly_correlated_features = [
    column for column in strong_correlation.index if column not in excluded_columns
]

# Plot heatmap for the target and its highly correlated features only; an annotated
# heatmap over every column of a wide dataset is not legible.
heatmap_columns = [target_variable] + highly_correlated_features
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix.loc[heatmap_columns, heatmap_columns],
    annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

print("Features with high correlation with the target:")
print(highly_correlated_features)

In [None]:
# Select relevant features and target variables
# all demographic and screening variables
screen_features = ['AGEPOS4','C_AGE_YEARS','C_CSHCN','C_ENGLISH','C_HISPANIC_R','C_K2Q10','C_K2Q11','C_K2Q12',
                   'C_K2Q13','C_K2Q14','C_K2Q15','C_K2Q16','C_K2Q17','C_K2Q18','C_K2Q19','C_K2Q20','C_K2Q21',
                   'C_K2Q22','C_K2Q23', 'C_RACE_R','C_SEX','HHLANGUAGE','RACER', "TOTNONSHCN",
                   "SC_AGE_LT10", "SC_AGE_LT4", "SC_AGE_LT6", "SC_AGE_LT9", "SC_AGE_YEARS", "SC_AIAN",
                   "SC_ASIAN", "SC_CSHCN", "SC_ENGLISH", "SC_HISPANIC_R","SC_K2Q10", "SC_K2Q11", "SC_K2Q12",
                   "SC_K2Q13", "SC_K2Q14", "SC_K2Q15", "SC_K2Q16", "SC_K2Q17", "SC_K2Q18", "SC_K2Q19","SC_K2Q20",
                   "SC_K2Q21", "SC_K2Q22", "SC_K2Q23", "SC_NHPI", "SC_RACER", "SC_RACE_R", "SC_SEX", "TENURE",
                   "TOTAGE_0_5", "TOTAGE_12_17", "TOTAGE_6_11", "TOTCSHCN", "TOTFEMALE", "TOTKIDS_R", "TOTMALE"]
additional_features = ['CONCUSSION', 'DISTRACTED', 'ENGAGE_PICKY', 'K10Q12', 'ACE6', 'HIGRADE', 'FPL_I1', 'HOMEEVIC']

# Full feature list used for modelling (also read by the feature-importance plot later on)
selected_features = screen_features + additional_features

# Filter the dataset for all selected features and target variable.
# Rows without a target value are dropped: they cannot be used to train or score a classifier.
data_final = data[selected_features + [target_variable]].dropna(subset=[target_variable]).copy()

X = data_final[selected_features]
y = data_final[target_variable]

# Split the dataset into train, validation, and test sets (fractions of the full dataset)
train_size = 0.5
validation_size = 0.4
test_size = 0.1

# First split: keep `train_size` of the rows for training, hold out the rest.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(1 - train_size), random_state = 42)
# Second split: divide the held-out rows so that validation and test end up with
# `validation_size` and `test_size` of the full dataset respectively.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_size/(test_size + validation_size), random_state = 42)


In [None]:
# Standardize the features
# The scaler is fitted on the training split only and then applied unchanged to the
# validation and test splits, so no statistics from held-out rows reach the models.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define and train models
# The estimators that accept a seed are seeded (same value as the data split) so that a
# fresh Restart & Run All reproduces the same fitted models and metrics.
model_seed = 42
models = {
    'Random Forest': RandomForestClassifier(random_state=model_seed),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(),
    # NOTE(review): Gaussian process training cost grows steeply with the number of
    # training rows -- confirm it is feasible on the full training split.
    'Gaussian Process Classifier': GaussianProcessClassifier(random_state=model_seed)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    # `y_pred` is overwritten on every pass; after the loop it only holds the
    # validation predictions of the last model in `models`.
    y_pred = model.predict(X_val)

In [None]:
# Evaluate each model
validation_accuracy = {}  # model name -> accuracy on the validation split

for name, model in models.items():
    # Predict with the model being evaluated; reusing a `y_pred` left over from the
    # training loop would score every model with the last model's predictions.
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    validation_accuracy[name] = accuracy
    print(f'{name} - Validation Accuracy: {accuracy:.2f}')
    print(classification_report(y_val, y_pred))
    print(confusion_matrix(y_val, y_pred))

    # Feature importance for Random Forest
    if name == 'Random Forest':
        feature_importance = model.feature_importances_
        # Scale the figure height with the number of features so the labels stay readable.
        plt.figure(figsize=(8, max(4, 0.25 * len(selected_features))))
        plt.barh(selected_features, feature_importance)
        plt.xlabel('Feature Importance')
        plt.title('Random Forest - Feature Importance')
        plt.show()

# Choose the best model and evaluate on the test set
# The model with the highest validation accuracy is selected. It was already fitted on
# the training split in the training loop, so it is not refitted here: the model that
# is tested is exactly the one that was validated.
best_model_name = max(validation_accuracy, key=validation_accuracy.get)
best_model = models[best_model_name]
print(f'Selected model: {best_model_name}')
y_test_pred = best_model.predict(X_test)

# Evaluate the best model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Best Model - Test Accuracy: {accuracy_test:.2f}')
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))