In [1]:
## import libraries

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
# Load the dementia dataset from a CSV file resolved relative to the working directory.
csv_path = 'dementia_dataset.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [5]:
# Drop the identifier columns so they are not carried into the feature set.
# Rebinding `df` (instead of mutating it with inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['Subject ID', 'MRI ID'], errors='ignore')

In [6]:
# Confirm that the identifier columns are no longer present.
df.head()

Unnamed: 0,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [7]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Group     373 non-null    object 
 1   Visit     373 non-null    int64  
 2   MR Delay  373 non-null    int64  
 3   M/F       373 non-null    object 
 4   Hand      373 non-null    object 
 5   Age       373 non-null    int64  
 6   EDUC      373 non-null    int64  
 7   SES       354 non-null    float64
 8   MMSE      371 non-null    float64
 9   CDR       373 non-null    float64
 10  eTIV      373 non-null    int64  
 11  nWBV      373 non-null    float64
 12  ASF       373 non-null    float64
dtypes: float64(5), int64(5), object(3)
memory usage: 38.0+ KB


In [8]:
# Count the missing entries in every column.
df.isna().sum()

Group        0
Visit        0
MR Delay     0
M/F          0
Hand         0
Age          0
EDUC         0
SES         19
MMSE         2
CDR          0
eTIV         0
nWBV         0
ASF          0
dtype: int64

In [9]:
# Impute the two incomplete columns with their respective column means.
# NOTE(review): the means are computed over the full frame before the
# train/test split, so test rows influence the imputed values — confirm
# this is acceptable.
for col in ('SES', 'MMSE'):
    df[col] = df[col].fillna(df[col].mean())

In [10]:
# Verify that the previously missing SES entries now hold the imputed mean.
df.head()

Unnamed: 0,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,1,0,M,R,75,12,2.460452,23.0,0.5,1678,0.736,1.046
3,Demented,2,560,M,R,76,12,2.460452,28.0,0.5,1738,0.713,1.01
4,Demented,3,1895,M,R,80,12,2.460452,22.0,0.5,1698,0.701,1.034


In [11]:
# Integer-encode the categorical columns, keeping one fitted encoder per
# column. Re-fitting a single shared encoder discards the earlier mappings,
# which makes it impossible to translate the encoded `Group` labels back to
# their names (via encoders['Group'].classes_ or .inverse_transform).
encoders = {}
for col in ('Group', 'M/F', 'Hand'):
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward compatibility: `l` previously ended up fitted on the last column.
l = encoders['Hand']

In [12]:
# Confirm that every column is now numeric and free of missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Group     373 non-null    int32  
 1   Visit     373 non-null    int64  
 2   MR Delay  373 non-null    int64  
 3   M/F       373 non-null    int32  
 4   Hand      373 non-null    int32  
 5   Age       373 non-null    int64  
 6   EDUC      373 non-null    int64  
 7   SES       373 non-null    float64
 8   MMSE      373 non-null    float64
 9   CDR       373 non-null    float64
 10  eTIV      373 non-null    int64  
 11  nWBV      373 non-null    float64
 12  ASF       373 non-null    float64
dtypes: float64(5), int32(3), int64(5)
memory usage: 33.6 KB


In [13]:
# Split the frame into the feature matrix `x` and the target vector `y`.
# Selecting by column name instead of position keeps the target correct
# even if the column order of the frame changes.
# NOTE(review): CDR remains among the features; presumably it is closely
# tied to the Group diagnosis — confirm it is not leaking the label.
x = df.drop(columns='Group')
y = df['Group']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is neither stratified nor grouped by subject, and
# the raw data has several visits per Subject ID, so the same subject may
# end up on both sides — confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# create prediction model

In [16]:
# Seed the tree so repeated runs build the same model and report the same
# metrics; without a seed, scikit-learn's random feature permutation at each
# split can change the fitted tree from run to run.
d = DecisionTreeClassifier(random_state=0)

In [17]:
# Train the decision tree on the training split.
d.fit(x_train,y_train)

DecisionTreeClassifier()

In [18]:
# Predict class labels for the held-out test rows.
p=d.predict(x_test)

In [20]:
# Print the confusion matrix to check the predictions (rows = true class, columns = predicted class).
print(confusion_matrix(y_test,p))

[[ 1  4  2]
 [ 1 29  0]
 [ 4  0 34]]


In [21]:
# Print per-class precision, recall and F1-score together with the overall accuracy.
print(classification_report(y_test,p))

              precision    recall  f1-score   support

           0       0.17      0.14      0.15         7
           1       0.88      0.97      0.92        30
           2       0.94      0.89      0.92        38

    accuracy                           0.85        75
   macro avg       0.66      0.67      0.66        75
weighted avg       0.85      0.85      0.85        75



In [22]:
# Print the overall accuracy of the predictions on the test split.
print(accuracy_score(y_test,p))

0.8533333333333334
