In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [5]:
df=pd.read_csv("/content/alzheimers.csv")

In [6]:
df.shape

(2149, 35)

In [7]:
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [8]:
pd.set_option('display.max_columns', None)

In [9]:
df.isnull().sum()

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,0
BMI,0
Smoking,0
AlcoholConsumption,0
PhysicalActivity,0
DietQuality,0


In [10]:
df.nunique()

Unnamed: 0,0
PatientID,2149
Age,31
Gender,2
Ethnicity,4
EducationLevel,4
BMI,2149
Smoking,2
AlcoholConsumption,2149
PhysicalActivity,2149
DietQuality,2149


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [12]:
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,HeadInjury,Hypertension,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,0,0,1,1,0,0,142,72,242.36684,56.150897,33.682563,162.189143,21.463532,6.518877,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,0,0,0,0,0,0,115,64,231.162595,193.407996,79.028477,294.630909,20.613267,7.118696,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,1,0,0,0,0,0,99,116,284.181858,153.322762,69.772292,83.638324,7.356249,5.895077,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,0,0,0,0,0,0,118,115,159.58224,65.366637,68.457491,277.577358,13.991127,8.965106,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,0,0,0,0,0,0,94,117,237.602184,92.8697,56.874305,291.19878,13.517609,6.045039,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [13]:
df = df.drop(columns=["DoctorInCharge"])

In [14]:
df['Diagnosis'].value_counts()

Unnamed: 0_level_0,count
Diagnosis,Unnamed: 1_level_1
0,1389
1,760


In [15]:
#doing label encoding
object_class=df.select_dtypes(include='object').columns

encoders={}
le=LabelEncoder()
for i in object_class:
  df[i]=le.fit_transform(df[i])
  encoders[i]=le

In [16]:
# outliers
def outlier_remove(df, col):
    Q1 = df[col].quantile(0.25)  # Calculate Q1 for the specific column
    Q3 = df[col].quantile(0.75)  # Calculate Q3 for the specific column
    IQR = Q3 - Q1

    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)


    median=df[col].median()


    df[col]=df[col].apply(lambda x:median if x<lower_bound or x>upper_bound else x)

    return df

In [17]:
numerical_col=df.select_dtypes(exclude='object').columns

In [18]:
numerical_col

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis'],
      dtype='object')

In [19]:
df['Diagnosis'].value_counts()

Unnamed: 0_level_0,count
Diagnosis,Unnamed: 1_level_1
0,1389
1,760


In [20]:
x=df.drop('Diagnosis',axis=1)
y=df['Diagnosis']

In [21]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [22]:
smote=SMOTE(random_state=42)
x_new,y_new=smote.fit_resample(x_train,y_train)

In [23]:
models={
    "decison tree":DecisionTreeClassifier(),
    "random forest":RandomForestClassifier(),
    "logistic regression":LogisticRegression(),
    "svm":SVC()
}

In [24]:
cv_scores={}

#perform 5-fold cross validation for each model
for model_name,model in models.items():
  scores=cross_val_score(model,x_new,y_new,cv=8)
  cv_scores[model_name]=scores
  print(f"{model_name} cross validation accuracy:{np.mean(scores):.2f}")
  print("-"*50 )

decison tree cross validation accuracy:0.86
--------------------------------------------------
random forest cross validation accuracy:0.92
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic regression cross validation accuracy:0.71
--------------------------------------------------
svm cross validation accuracy:0.52
--------------------------------------------------


In [25]:
decison_tree=DecisionTreeClassifier(random_state=42)
random_forest=RandomForestClassifier(random_state=42)
logistic_regression=LogisticRegression(random_state=42)
svm=SVC(random_state=42)

In [26]:
#hyperparameter grids for RandomizedSearchCV

param_grid_dt={
    "criterion":["gini","entropy"],
    "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}
param_grid_rf={
    "n_estimators":[100,200,300],
    "criterion":["gini","entropy"],
     "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}
param_grid_lr={
    "C":[0.001,0.01,0.1,1,10,100],
    "penalty":["l1","l2"],
    "solver":["liblinear"],
}

param_grid_svc={
    "C":[0.001,0.01,0.1,1,10,100],
    "kernel":["linear","rbf","poly"],
}

In [27]:
#hyperparameter tuning for both models
#the below steps can be autyomated by using a for loop or by using a pipeine
#perform randomizedsearchcv for each model

random_search_dt=RandomizedSearchCV(estimator=decison_tree,param_distributions=param_grid_dt,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_rf=RandomizedSearchCV(estimator=random_forest,param_distributions=param_grid_rf,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_lr=RandomizedSearchCV(estimator=logistic_regression,param_distributions=param_grid_lr,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_svc=RandomizedSearchCV(estimator=svm,param_distributions=param_grid_svc,n_iter=10,cv=5,scoring="accuracy",random_state=42)


In [28]:
random_search_dt.fit(x_new,y_new)
random_search_rf.fit(x_new,y_new)
random_search_lr.fit(x_new,y_new)
random_search_svc.fit(x_new,y_new)



In [29]:
print(random_search_dt.best_params_)
print(random_search_rf.best_params_)

{'min_samples_split': 2, 'max_depth': 5, 'criterion': 'entropy'}
{'n_estimators': 300, 'min_samples_split': 5, 'max_depth': 20, 'criterion': 'entropy'}


In [30]:
print(random_search_dt.best_score_)
print(random_search_rf.best_score_)

0.8884593582346392
0.9073651179269155


In [31]:
#get the model with best score

best_model=None
best_score=0

if random_search_dt.best_score_>best_score:
  best_score=random_search_dt.best_score_
  best_model=random_search_dt.best_estimator_

if random_search_rf.best_score_>best_score:
  best_score=random_search_rf.best_score_
  best_model=random_search_rf.best_estimator_

if random_search_svc.best_score_>best_score:
  best_score=random_search_svc.best_score_
  best_model=random_search_svc.best_estimator_

In [32]:
print(best_model)
print(best_score)

RandomForestClassifier(criterion='entropy', max_depth=20, min_samples_split=5,
                       n_estimators=300, random_state=42)
0.9073651179269155


In [33]:
#save the best model
with open("alzheimers_model.sav","wb") as f:
  pickle.dump(best_model,f)

In [34]:
random_forest.fit(x_new,y_new)

In [35]:
#evaluate on test data
y_test_pred=best_model.predict(x_test)
print("accuracy score\n",accuracy_score(y_test,y_test_pred))
print("confusion matrix\n",confusion_matrix(y_test,y_test_pred))
print("classification report\n",classification_report(y_test,y_test_pred))

accuracy score
 0.9046511627906977
confusion matrix
 [[268   9]
 [ 32 121]]
classification report
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       277
           1       0.93      0.79      0.86       153

    accuracy                           0.90       430
   macro avg       0.91      0.88      0.89       430
weighted avg       0.91      0.90      0.90       430



In [39]:
#building predictive model
# Get the feature names used during training
feature_names = x_new.columns

# Ensure input_data has the same features and order as the training data
# Convert input_data to a dictionary if it's not already
if not isinstance(input_data, dict):
    input_data_dict = dict(zip(feature_names, input_data))
else:
    input_data_dict = input_data

# Create input DataFrame with columns matching training data
input_data_df = pd.DataFrame([input_data_dict], columns=feature_names)

prediction = best_model.predict(input_data_df) # Use the DataFrame for prediction
print(prediction)

if (prediction[0]== 0):
  print('The Person do not have alzheimers')
else:
  print('The Person have alzheimers')

[1]
The Person have alzheimers
