In [1]:
import os

import joblib
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw records; the path is resolved relative to the current working directory.
dataset = pd.read_csv(filepath_or_buffer='dataset/alzheimer.csv')
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,M,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,M,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,M,75,12,,23.0,0.5,1678,0.736,1.046
3,Demented,M,76,12,,28.0,0.5,1738,0.713,1.01
4,Demented,M,80,12,,22.0,0.5,1698,0.701,1.034


In [3]:
# y - dependent variable: the diagnosis label stored in the 'Group' column.
y = dataset['Group']
# X - independent variables: every column except the label.
# Selecting by name (instead of by position) does not rely on 'Group' being the
# first column, and drop() returns a new DataFrame, so the imputation and
# encoding cells below modify X without writing into a slice of `dataset`.
X = dataset.drop(columns=['Group'])

In [4]:
# Count the missing values in each feature column before any imputation.
X.isna().sum()

M/F      0
Age      0
EDUC     0
SES     19
MMSE     2
CDR      0
eTIV     0
nWBV     0
ASF      0
dtype: int64

In [5]:
# Median of the observed SES values (missing entries are skipped); used below to impute.
# NOTE(review): computed over all rows, before the train/test split further down.
SESMedian = X.loc[:, 'SES'].median()
SESMedian

2.0

In [6]:
# Fill the missing SES entries with the median computed above.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on X['SES']: the chained in-place form acts on an intermediate Series and
# leaves X unchanged when pandas Copy-on-Write is active.
X['SES'] = X['SES'].fillna(SESMedian)

In [7]:
# Re-check the per-column missing counts now that SES has been imputed.
X.isna().sum()

M/F     0
Age     0
EDUC    0
SES     0
MMSE    2
CDR     0
eTIV    0
nWBV    0
ASF     0
dtype: int64

In [8]:
# Median of the observed MMSE values (missing entries are skipped); used below to impute.
# NOTE(review): computed over all rows, before the train/test split further down.
MMSEMedian = X.loc[:, 'MMSE'].median()
MMSEMedian

29.0

In [9]:
# Fill the missing MMSE entries with the median computed above.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on X['MMSE']: the chained in-place form acts on an intermediate Series and
# leaves X unchanged when pandas Copy-on-Write is active.
X['MMSE'] = X['MMSE'].fillna(MMSEMedian)

In [10]:
# Final check: every column should now report zero missing values.
X.isna().sum()

M/F     0
Age     0
EDUC    0
SES     0
MMSE    0
CDR     0
eTIV    0
nWBV    0
ASF     0
dtype: int64

In [11]:
# Replace the categorical 'M/F' values with integer codes so the column is numeric.
labelEncoder = LabelEncoder()
labelEncoder.fit(X['M/F'])
X['M/F'] = labelEncoder.transform(X['M/F'])
# Preview the frame with the encoded column.
X.head()

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,1,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,1,75,12,2.0,23.0,0.5,1678,0.736,1.046
3,1,76,12,2.0,28.0,0.5,1738,0.713,1.01
4,1,80,12,2.0,22.0,0.5,1698,0.701,1.034


In [12]:
# Columns to standardize. Naming them explicitly (rather than slicing by
# position) keeps the selection correct if the column order ever changes;
# 'M/F' is excluded because it was already integer-encoded above.
numericColumns = ['Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down.
standardScaler = StandardScaler()
# fit_transform learns the per-column statistics and returns the scaled array in
# one call; the fitted scaler stays in `standardScaler` so it can be saved later.
standardizedData = standardScaler.fit_transform(X[numericColumns])
standardizedData

array([[ 1.30873772, -0.20813199, -0.3944662 , ...,  2.83605891,
        -0.90516867, -2.2657425 ],
       [ 1.43978716, -0.20813199, -0.3944662 , ...,  2.93270307,
        -1.30964265, -2.31650145],
       [-0.26385558, -0.90439416, -0.3944662 , ...,  1.07940909,
         0.17342861, -1.08378418],
       ...,
       [-2.09854775, -0.55626308, -0.3944662 , ..., -0.96148827,
         1.92614919,  0.98283006],
       [-1.83644887, -0.55626308, -0.3944662 , ..., -0.91600866,
         1.79132453,  0.92481984],
       [-1.57434999, -0.55626308, -0.3944662 , ..., -0.88189896,
         1.92614919,  0.88131217]])

In [13]:
# Swap the raw numeric columns for their standardized '<name>_N' counterparts.
rawColumns = ['Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
scaledColumns = [f'{column}_N' for column in rawColumns]

# Reuse X's index: concat(axis=1) aligns rows by index label, so a default
# 0..n-1 index on the scaled frame would misalign (and introduce NaNs) whenever
# X's index is anything else. The rows of standardizedData are in X's row order.
scaledFrame = pd.DataFrame(standardizedData, columns=scaledColumns, index=X.index)

# errors='ignore' plus dropping any previously added '_N' columns makes this cell
# safe to re-run: it always yields the remaining columns followed by one set of
# scaled columns.
X = pd.concat(
    [X.drop(columns=rawColumns + scaledColumns, errors='ignore'), scaledFrame],
    axis=1,
)
X.head()

Unnamed: 0,M/F,Age_N,EDUC_N,SES_N,MMSE_N,CDR_N,eTIV_N,nWBV_N,ASF_N
0,1,1.308738,-0.208132,-0.394466,-0.095686,-0.777653,2.836059,-0.905169,-2.265742
1,1,1.439787,-0.208132,-0.394466,0.721664,-0.777653,2.932703,-1.309643,-2.316501
2,1,-0.263856,-0.904394,-0.394466,-1.185486,0.55905,1.079409,0.173429,-1.083784
3,1,-0.132806,-0.904394,-0.394466,0.176764,0.55905,1.420506,-0.446765,-1.34483
4,1,0.391392,-0.904394,-0.394466,-1.457936,0.55905,1.193108,-0.770344,-1.1708


In [14]:
# Split features (X) and labels (y) into training and evaluation subsets:
# 40% of the rows are held out, and the fixed seed makes the split repeatable.
X_training, X_test, y_training, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [15]:
# Gaussian Naive Bayes classifier, trained on the training subset only.
NaiveBayes_model = GaussianNB()
NaiveBayes_model.fit(X=X_training, y=y_training)

In [16]:
# Predicted labels for the held-out rows.
NaiveBayes_prediction = NaiveBayes_model.predict(X=X_test)

In [17]:
# Naive Bayes model metrics on the held-out rows.
NaiveBayesAccuracy = accuracy_score(y_test, NaiveBayes_prediction)
# Precision, recall and F1 are averaged across classes, weighted by class support.
NaiveBayesPrecision, NaiveBayesRecall, NaiveBayesF1 = (
    scorer(y_test, NaiveBayes_prediction, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
print(f'Naive bayes Accuracy: {NaiveBayesAccuracy}, Naive Bayes Precision: {NaiveBayesPrecision}, Naive Bayes Recall: {NaiveBayesRecall}, Naive Bayes F1: {NaiveBayesF1}')

Naive bayes Accuracy: 0.9133333333333333, Naive Bayes Precision: 0.9211008939974458, Naive Bayes Recall: 0.9133333333333333, Naive Bayes F1: 0.8822311108363472


In [18]:
# Per-class precision / recall / F1 table for the same predictions.
report = classification_report(y_true=y_test, y_pred=NaiveBayes_prediction)
print(report)

              precision    recall  f1-score   support

   Converted       1.00      0.13      0.24        15
    Demented       0.90      1.00      0.95        52
 Nondemented       0.92      1.00      0.96        83

    accuracy                           0.91       150
   macro avg       0.94      0.71      0.71       150
weighted avg       0.92      0.91      0.88       150



In [19]:
# Directory that receives the serialized artifacts. Set the MODEL_OUTPUT_DIR
# environment variable to run the notebook on another machine; when it is unset
# the original location is used, so existing behavior is unchanged.
modelDirectory = os.environ.get(
    'MODEL_OUTPUT_DIR',
    'C:/Users/mathe/Desktop/Workspace/Projetos/Java/alzheimer-prediction/model/compiled_model',
)
# Destination file for the trained Naive Bayes model.
modelName = f'{modelDirectory}/NaiveBayesModel.pkl'

In [20]:
# Persist the trained classifier to the path held in modelName.
joblib.dump(value=NaiveBayes_model, filename=modelName)

['C:/Users/mathe/Desktop/Workspace/Projetos/Java/alzheimer-prediction/model/compiled_model/NaiveBayesModel.pkl']

In [21]:
# Persist the fitted scaler next to the model file. The directory is taken from
# modelName rather than repeated as a second hard-coded absolute path, so both
# artifacts always land in the same place.
scalerName = os.path.join(os.path.dirname(modelName), 'StandardizedData.pkl')
joblib.dump(standardScaler, scalerName)

['C:/Users/mathe/Desktop/Workspace/Projetos/Java/alzheimer-prediction/model/compiled_model/StandardizedData.pkl']