# Machine Learning Assignment
## Depression classification model

Import all the modules and libraries

In [34]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import pickle as pk
import xgboost

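The code below looks up the available GPU and enables TensorFlow memory growth on it, so GPU memory is allocated on demand rather than all at once (GPU training itself is requested later through XGBoost's `tree_method='gpu_hist'`).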

In [35]:
# Ask TensorFlow which GPUs it can see; the list is empty on a CPU-only machine.
physical_devices = tf.config.list_physical_devices('GPU')

# Enable memory growth on the first GPU only when one exists, so this cell
# does not raise IndexError on machines without a visible GPU.
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Read data in the csv file

In [36]:
# Load the dataset from the CSV file (path is relative to the working directory).
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

Pre-process the data: separate the target column `Label` from the feature columns. The data-type conversion is done in a later cell.

In [37]:
# Separate the target ('Label') from the feature columns.
# NOTE: rows with missing values are not removed here (the dropna step was left disabled).
#
# The split is guarded so that re-running this cell after 'Label' has already
# been removed is a no-op instead of a KeyError, and `data` is rebound to a new
# frame rather than mutated in place.
if 'Label' in data.columns:
    y = data['Label']
    data = data.drop(columns='Label')
elif 'y' not in globals():
    # Neither the column nor a previously extracted target exists.
    raise KeyError("'Label' column not found in data")

Inspect the shape and column names of the feature data, then reformat the type of all the data to float32

In [38]:
# Quick sanity check of the feature frame: its dimensions first, then the column labels.
for summary in (data.shape, data.columns.tolist()):
    print(summary)

(320, 28)
['Jantina (0:Perempuan. 1:Lelaki)', 'Bangsa (0:Melayu, 1:Cina, 2:India, 3:Lain-lain)', 'Umur', 'Status perkahwinan (0:Bujang, 1:Berkahwin, 2:Bercerai, 3:Kematian pasangan)', 'Kategori Jumlah anak (0:0, 1:1-2, 2:3-4, 3:5+)', 'Tahap pendidikan (0:Sekolah Menengah, 1:Diploma atau Sijil, 2:Ijazah Sarjana Muda, 3:Ijazah Sarjana atau Doktor Falsafah)', 'Status kerja (0:Sepenuh masa atau tetap, 1:Separuh masa, 2:Sambung belajar, 3:Suri rumah, 4:Tidak bekerja)', 'Tahap kesihatan', '1 Kesedihan', '2 Pesimis', '3 Kegagalan Lalu', '4 Hilang Kepuasan', '5 Rasa Bersalah', '6 Rasa Dihukum', '7 Tidak suka diri sendiri', '8 Kritik diri sendiri', '9 Fikir untuk bunuh diri', '10 Menangis', '11 Sakit hati', '12 Hilang minat', '13 Sukar buat keputusan', '14 Tak berguna', '15 Hilang tenaga', '16 Perubahan tidur', '17 Terganggu', '18 Perubahan selera', '19 Masalah berat badan', '20 Risau keadaan fizikal']


In [39]:
# Keep the column labels available under the same name as before.
column_names = data.columns

# Cast every feature column to float32 in one vectorized call instead of
# looping over the columns and reassigning them one by one.
data = data.astype('float32')

Split the data into training set and testing set

In [40]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffled split the same from run to run.
split_options = dict(test_size=0.2, random_state=20, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(data, y, **split_options)

Train the model: run a randomized hyper-parameter search over XGBoost classifiers with 5-fold cross-validation

In [41]:
# Base estimator for the search; 'gpu_hist' requests XGBoost's GPU histogram tree method.
classifier = xgboost.XGBClassifier(tree_method='gpu_hist')

# Candidate values for each hyper-parameter; the search samples combinations from these lists.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [2, 3, 4, 5, 6, 7, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

# Try 5 sampled combinations, each scored by accuracy over 5 cross-validation folds.
# random_state pins which combinations are sampled, so a Restart & Run All
# evaluates the same candidates every time (previously they changed per run).
clf = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=20,
)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.25, max_depth=3, min_child_weight=3;, score=0.712 total time=   0.8s
[CV 2/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.25, max_depth=3, min_child_weight=3;, score=0.686 total time=   0.7s
[CV 3/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.25, max_depth=3, min_child_weight=3;, score=0.843 total time=   0.7s
[CV 4/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.25, max_depth=3, min_child_weight=3;, score=0.686 total time=   0.6s
[CV 5/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.25, max_depth=3, min_child_weight=3;, score=0.784 total time=   0.5s
[CV 1/5] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.2, max_depth=6, min_child_weight=7;, score=0.712 total time=   0.5s
[CV 2/5] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.2, max_depth=6, min_child_weight=7;, score=0.686 total time=   0.4s
[CV 3/5] END colsample_by

Fit the final model with the chosen hyper-parameters, predict on the test set, and print the classification report and accuracy

In [42]:
# Hyper-parameters of the final classifier. They are written out explicitly
# here; this cell does not read them from the search object `clf`.
final_params = {
    # tuned values
    "colsample_bytree": 0.3,
    "gamma": 0.2,
    "learning_rate": 0.15,
    "max_depth": 3,
    "min_child_weight": 7,
    # remaining settings
    "base_score": 0.5,
    "booster": 'gbtree',
    "callbacks": None,
    "colsample_bylevel": 1,
    "colsample_bynode": 1,
    "early_stopping_rounds": None,
    "enable_categorical": False,
    "eval_metric": None,
    "gpu_id": 0,
    "grow_policy": 'depthwise',
    "importance_type": None,
    "interaction_constraints": '',
    "max_bin": 256,
    "max_cat_to_onehot": 4,
    "max_delta_step": 0,
    "max_leaves": 0,
    "monotone_constraints": '()',
    "n_estimators": 100,
    "n_jobs": 0,
    "num_parallel_tree": 1,
    "objective": 'multi:softprob',
    "predictor": 'auto',
    "random_state": 0,
    "reg_alpha": 0,
}

final_model = xgboost.XGBClassifier(**final_params)

# Train on the training split, then predict labels for the held-out test split.
final_model.fit(X_train, y_train)
pred_xgboost = final_model.predict(X_test)


In [43]:
# Reuse the test-set predictions computed right after fitting (`pred_xgboost`)
# instead of calling predict() two more times.
print(classification_report(y_test, pred_xgboost))

# `accuracy` now holds the accuracy score itself (it used to hold the raw
# predictions), and the arguments follow scikit-learn's (y_true, y_pred) order.
accuracy = metrics.accuracy_score(y_test, pred_xgboost)
accuracy

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        24
           1       0.88      0.78      0.82        18
           2       0.50      0.71      0.59         7
           3       0.92      0.80      0.86        15

    accuracy                           0.84        64
   macro avg       0.80      0.81      0.80        64
weighted avg       0.86      0.84      0.85        64



0.84375

Export the trained model to a `.sav` file (pickle format)

In [44]:
# Serialize the trained model with pickle.
filename = "depression_model.sav"

# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the bare open() call previously leaked the handle).
with open(filename, 'wb') as model_file:
    pk.dump(final_model, model_file)