In [38]:
# importing important libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics as metrics    

In [3]:
# reading the dataset
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [4]:
# checking the columns present in the dataset.
df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

In [5]:
# checking datatype of all columns of the dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [6]:
# dropping the unnecessary column PatientID.
df.drop('PatientID',axis = 1,inplace=True)

In [7]:
# checking if there is NA data in any attribute of the datset.
df.isna().sum()

Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfulness                0
Diagnosi

In [8]:
# extracting the categorical columns from the dataset.
object_columns = df.select_dtypes(['object']).columns
object_columns

Index(['DoctorInCharge'], dtype='object')

In [9]:
# checking wheather i should hot encode this column or not.as there is only one value in this column so this column need not to be hot encoded.
df.DoctorInCharge.value_counts()

DoctorInCharge
XXXConfid    2149
Name: count, dtype: int64

In [10]:
df.drop(['DoctorInCharge'],axis = 1,inplace=True)

In [11]:
df.columns.value_counts()

Age                          1
DiastolicBP                  1
Forgetfulness                1
DifficultyCompletingTasks    1
PersonalityChanges           1
Disorientation               1
Confusion                    1
ADL                          1
BehavioralProblems           1
MemoryComplaints             1
FunctionalAssessment         1
MMSE                         1
CholesterolTriglycerides     1
CholesterolHDL               1
CholesterolLDL               1
CholesterolTotal             1
SystolicBP                   1
Gender                       1
Hypertension                 1
HeadInjury                   1
Depression                   1
Diabetes                     1
CardiovascularDisease        1
FamilyHistoryAlzheimers      1
SleepQuality                 1
DietQuality                  1
PhysicalActivity             1
AlcoholConsumption           1
Smoking                      1
BMI                          1
EducationLevel               1
Ethnicity                    1
Diagnosi

In [12]:
df.corr()['Diagnosis']

Age                         -0.005488
Gender                      -0.020975
Ethnicity                   -0.014782
EducationLevel              -0.043966
BMI                          0.026343
Smoking                     -0.004865
AlcoholConsumption          -0.007618
PhysicalActivity             0.005945
DietQuality                  0.008506
SleepQuality                -0.056548
FamilyHistoryAlzheimers     -0.032900
CardiovascularDisease        0.031490
Diabetes                    -0.031508
Depression                  -0.005893
HeadInjury                  -0.021411
Hypertension                 0.035080
SystolicBP                  -0.015615
DiastolicBP                  0.005293
CholesterolTotal             0.006394
CholesterolLDL              -0.031976
CholesterolHDL               0.042584
CholesterolTriglycerides     0.022672
MMSE                        -0.237126
FunctionalAssessment        -0.364898
MemoryComplaints             0.306742
BehavioralProblems           0.224350
ADL         

In [15]:
# using chi square for the feature selection.
data = df.copy()
x = data.loc[:,data.columns != 'Diagnosis']
y = data.loc[:,'Diagnosis']
bestFeatures = SelectKBest(score_func=chi2, k='all')
fit = bestFeatures.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']
# printing the top8 best features of the dataset.
print(featureScores.nlargest(10,'Score'))

                       Specs       Score
22                      MMSE  607.256596
23      FunctionalAssessment  471.117684
26                       ADL  414.290935
24          MemoryComplaints  160.142705
25        BehavioralProblems   91.203581
21  CholesterolTriglycerides   50.306694
20            CholesterolHDL   35.073008
19            CholesterolLDL   33.219382
9               SleepQuality    3.029711
4                        BMI    2.807631


In [16]:
x = data[featureScores.sort_values(by='Score', ascending=False).iloc[:8,0]]
x

Unnamed: 0,MMSE,FunctionalAssessment,ADL,MemoryComplaints,BehavioralProblems,CholesterolTriglycerides,CholesterolHDL,CholesterolLDL
0,21.463532,6.518877,1.725883,0,0,162.189143,33.682563,56.150897
1,20.613267,7.118696,2.592424,0,0,294.630909,79.028477,193.407996
2,7.356249,5.895077,7.119548,0,0,83.638324,69.772292,153.322762
3,13.991127,8.965106,6.481226,0,1,277.577358,68.457491,65.366637
4,13.517609,6.045039,0.014691,0,0,291.198780,56.874305,92.869700
...,...,...,...,...,...,...,...,...
2144,1.201190,0.238667,4.492838,0,0,234.520123,60.943092,94.870490
2145,6.458060,8.687480,9.204952,0,1,367.986877,93.649735,95.410700
2146,17.011003,1.972137,5.036334,0,0,294.802338,99.678209,156.267294
2147,4.030491,5.173891,3.785399,0,0,145.253746,81.281111,52.482961


In [23]:
# df.corr()

In [25]:
y = data['Diagnosis']
y

0       0
1       0
2       0
3       0
4       0
       ..
2144    1
2145    1
2146    1
2147    1
2148    0
Name: Diagnosis, Length: 2149, dtype: int64

In [31]:
sc = StandardScaler()

In [32]:
x  = sc.fit_transform(x)
# x

array([[ 0.77903679,  0.49750588, -1.10443449, ..., -0.64819945,
        -1.11442916, -1.57266058],
       [ 0.68029675,  0.70490696, -0.81060109, ...,  0.65072056,
         0.84573019,  1.59311897],
       [-0.85922158,  0.28181278,  0.72449145, ..., -1.41858505,
         0.44561479,  0.66856925],
       ...,
       [ 0.26197104, -1.07463444,  0.01809901, ...,  0.65240184,
         1.73835228,  0.73648368],
       [-1.24543699,  0.03244596, -0.40607792, ..., -0.81429302,
         0.94310436, -1.65726005],
       [-0.42274909,  0.42443239,  1.13411473, ..., -0.10675073,
         0.97072337, -0.74120109]])

In [40]:
skf = StratifiedKFold(n_splits=5)
model = LinearRegression()


In [41]:
accuracy = 0
for train_index,test_index in skf.split(x,y):
    xTrain = x[train_index]
    xTest = x[test_index]
    yTrain = y[train_index]
    yTest = y[test_index]
    model.fit(xTrain,yTrain)
    yPred = model.predict(xTest)
    testAccuracy = metrics.accuracy_score(yTest,yPred)
    accuracy += testAccuracy
print((accuracy/5)*100)

ValueError: Classification metrics can't handle a mix of binary and continuous targets