Classify if Demented, Nondemented

In [None]:
#set up a ColumnTransformer with StandardScaler for numerical features and OneHotEncoder for categorical features.
#set up and training a LinearRegression model using scikit-learn, including data preprocessing steps within a Pipeline.
#implement polynomial regression
#perform hyperparameter tuning for a polynomial regression model
#evaluate the performance of a regression model on test data
#use OneHotEncoder with handle_unknown='ignore' within a preprocessing pipeline to handle unseen categories during model training and evaluation
#set up and execute cross_val_score or GridSearchCV to perform cross-validation

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [10]:
# Load dataset
df = pd.read_csv('dementia.csv')

# View columns
print(df.columns)


Index(['Subject ID', 'MRI ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand',
       'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
      dtype='object')


In [11]:
# Define which columns are numerical and which are categorical
numerical_features = ['Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']
categorical_features = ['M/F', 'Hand']

# Pipelines
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),    # Fill missing numbers with mean
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Fill missing categories with 'missing'
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])


In [12]:
# Features and target
X = df.drop(columns=['Subject ID', 'MRI ID', 'MR Delay', 'Group'])
y = df['Group']

# Encode target labels (Demented -> 1, Nondemented -> 0)
y = y.map({'Demented': 1, 'Nondemented': 0})

# Drop invalid rows where y is NaN
valid_idx = y.dropna().index
X = X.loc[valid_idx]
y = y.loc[valid_idx]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)


In [13]:

# Set up pipeline
classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the pipeline
classifier.fit(X_train, y_train)




In [14]:
# Pipeline with Polynomial Features + Logistic Regression
poly_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train
poly_model.fit(X_train, y_train)


In [15]:
# Set up pipeline again for Grid Search
poly_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define parameter grid
param_grid = {
    'poly__degree': [1, 2, 3],
    'classifier__C': [0.01, 0.1, 1, 10]
}

# GridSearchCV
grid_search = GridSearchCV(poly_pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)




Best Parameters: {'classifier__C': 10, 'poly__degree': 2}


In [16]:
# Predict
y_pred = classifier.predict(X_test)

# Evaluate
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy Score: 0.8382352941176471

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.84      0.85        38
         1.0       0.81      0.83      0.82        30

    accuracy                           0.84        68
   macro avg       0.84      0.84      0.84        68
weighted avg       0.84      0.84      0.84        68



In [17]:
# Simulate an unseen category in test data
X_test_copy = X_test.copy()
X_test_copy.iloc[0, X_test_copy.columns.get_loc('Hand')] = 'UnknownHand'

# Predict successfully
y_pred_unseen = classifier.predict(X_test_copy)
print("Prediction successful even with unseen 'Hand' category!")


Prediction successful even with unseen 'Hand' category!


In [18]:
# Cross-validation
cv_scores = cross_val_score(classifier, X, y, cv=5, scoring='accuracy')

print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))


Cross-validated Accuracy Scores: [0.91176471 0.8358209  0.80597015 0.80597015 0.85074627]
Mean Accuracy: 0.8420544337137841
