# **This notebook is for building a classification model**

In [2]:
# Call libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality warnings
# from pandas/scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset from the CSV file (path is relative to the working directory)
# and preview the first rows.
dataset_path = "Anxiety_data.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,age,gender,bmi,who_bmi,phq_score,depression_severity,depressiveness,suicidal,depression_diagnosis,depression_treatment,anxiety_severity,anxiousness,anxiety_diagnosis,anxiety_treatment,epworth_score,sleepiness
0,19,male,33.333333,Class I Obesity,9,Mild,False,False,False,False,Moderate,True,False,False,7.0,False
1,18,male,19.84127,Normal,8,Mild,False,False,False,False,Mild,False,False,False,14.0,True
2,19,male,25.102391,Overweight,8,Mild,False,False,False,False,Mild,False,False,False,6.0,False
3,18,female,23.738662,Normal,19,Moderately severe,True,True,False,False,Severe,True,False,False,11.0,True
4,18,male,25.617284,Overweight,6,Mild,False,False,False,False,Moderate,True,False,False,3.0,False


In [None]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
# Columns whose non-null count is below the row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1566 non-null   int64  
 1   gender                1566 non-null   object 
 2   bmi                   1566 non-null   float64
 3   who_bmi               1566 non-null   object 
 4   phq_score             1566 non-null   int64  
 5   depression_severity   1558 non-null   object 
 6   depressiveness        1560 non-null   object 
 7   suicidal              1564 non-null   object 
 8   depression_diagnosis  1564 non-null   object 
 9   depression_treatment  1558 non-null   object 
 10  anxiety_severity      1566 non-null   object 
 11  anxiousness           1554 non-null   object 
 12  anxiety_diagnosis     1558 non-null   object 
 13  anxiety_treatment     1562 non-null   object 
 14  epworth_score         1550 non-null   float64
 15  sleepiness           

In [33]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

(1566, 16)

In [None]:
# For each column, report whether it contains at least one missing value
df.isna().any()

age                     False
gender                  False
bmi                     False
who_bmi                 False
phq_score               False
depression_severity      True
depressiveness           True
suicidal                 True
depression_diagnosis     True
depression_treatment     True
anxiety_severity        False
anxiousness              True
anxiety_diagnosis        True
anxiety_treatment        True
epworth_score            True
sleepiness               True
dtype: bool

In [None]:
# Discard every row that has a missing value in any column.
# The original index labels are kept (the index is not reset), so labels
# are no longer contiguous after this step.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1530 entries, 0 to 1565
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1530 non-null   int64  
 1   gender                1530 non-null   object 
 2   bmi                   1530 non-null   float64
 3   who_bmi               1530 non-null   object 
 4   phq_score             1530 non-null   int64  
 5   depression_severity   1530 non-null   object 
 6   depressiveness        1530 non-null   object 
 7   suicidal              1530 non-null   object 
 8   depression_diagnosis  1530 non-null   object 
 9   depression_treatment  1530 non-null   object 
 10  anxiety_severity      1530 non-null   object 
 11  anxiousness           1530 non-null   object 
 12  anxiety_diagnosis     1530 non-null   object 
 13  anxiety_treatment     1530 non-null   object 
 14  epworth_score         1530 non-null   float64
 15  sleepiness            1530

In [36]:
# Dimensions of the dataset as (rows, columns) after dropping incomplete rows
df.shape

(1530, 16)

In [None]:
# Summarise the object-dtype (categorical) columns: count, number of unique
# values, most frequent value and its frequency, one row per column.
df.select_dtypes(include="object").describe().T

Unnamed: 0,count,unique,top,freq
gender,1530,2,female,788
who_bmi,1530,7,Normal,994
depression_severity,1530,6,Mild,678
depressiveness,1530,2,False,1120
suicidal,1530,2,False,1400
depression_diagnosis,1530,2,False,1398
depression_treatment,1530,2,False,1416
anxiety_severity,1530,4,Mild,586
anxiousness,1530,2,False,1150
anxiety_diagnosis,1530,2,False,1408


In [None]:
# Summary statistics of the numeric columns after removing rows with missing values.
# NOTE(review): the recorded output shows a minimum bmi of 0.0, which looks like an
# invalid entry rather than a real measurement — confirm and clean if so.
df.describe()

Unnamed: 0,age,bmi,phq_score,epworth_score
count,1530.0,1530.0,1530.0,1530.0
mean,20.257516,23.410511,7.16732,6.396078
std,1.771908,4.588703,4.418965,3.99689
min,18.0,0.0,0.0,0.0
25%,19.0,20.957274,4.0,4.0
50%,20.0,23.148148,6.0,6.0
75%,21.0,25.510204,9.0,9.0
max,31.0,54.552668,24.0,32.0


In [None]:
# Class distribution of the target column before encoding (classes are imbalanced)
df["anxiety_severity"].value_counts()

anxiety_severity
Mild            586
None-minimal    564
Moderate        248
Severe          132
Name: count, dtype: int64

In [None]:
# Remove the columns that are excluded from the model's inputs.
# Reassigning the result (rather than inplace=True) avoids mutating the existing
# frame object in place, which is the source of hidden-state surprises in notebooks.
df = df.drop(
    columns=["depression_severity", "suicidal", "epworth_score", "phq_score"]
)

In [None]:
# Integer-encode every object-dtype column (features and the target alike) and
# keep each fitted encoder, keyed by column name, so codes can be mapped back to
# the original labels via `label_encoders[name].classes_`.
# Codes follow the sorted order of the labels, not any domain ordering.
# NOTE: this overwrites the columns of `df`; re-running the cell afterwards finds
# no object columns and leaves `label_encoders` empty.
label_encoders = {}
object_columns = df.select_dtypes(include=["object"]).columns
for name in object_columns:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

In [None]:
# Split the frame into the target (`y`) and the remaining feature columns (`X`).
# NOTE(review): `anxiousness` appears to be derived from the target — in the
# previewed rows it is True exactly for Moderate/Severe, and the confusion matrices
# below never mix those two groups. Confirm whether it leaks the label.
target_column = "anxiety_severity"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider `stratify=y` if per-class proportions should match in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [44]:
# Baseline: a random forest with default hyper-parameters and a fixed seed,
# fitted on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the encoded anxiety_severity class for every row of the held-out test set
y_pred = model.predict(X_test)

In [46]:
# Fraction of test rows whose predicted class matches the true class,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"accuracy: {accuracy * 100:.2f}%")

accuracy: 87.91%


In [None]:
# Confusion matrix (rows: true class, columns: predicted class) followed by
# per-class precision/recall/F1 for the baseline model. Class ids are the
# integer codes assigned by the label encoder.
baseline_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(baseline_confusion)
baseline_report = classification_report(y_true=y_test, y_pred=y_pred)
print(baseline_report)

[[112   0  17   0]
 [  0  36   0   0]
 [ 18   0  95   0]
 [  0   2   0  26]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       129
           1       0.95      1.00      0.97        36
           2       0.85      0.84      0.84       113
           3       1.00      0.93      0.96        28

    accuracy                           0.88       306
   macro avg       0.91      0.91      0.91       306
weighted avg       0.88      0.88      0.88       306



In [None]:
# Hyper-parameter search space for the random forest: 3 x 4 x 3 = 36 combinations.
# `max_depth=None` lets trees grow until leaves are pure or too small to split.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

In [49]:
# Exhaustively evaluate every combination in `param_grid` with 3-fold
# cross-validation on the training split, using a seeded random forest.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
)
grid_search.fit(X_train, y_train)

In [50]:
# Keep the estimator built with the best-scoring parameter combination and
# show which combination won the search.
best_model = grid_search.best_estimator_
print("best parmter:", grid_search.best_params_)

best parmter: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# Predict the encoded anxiety_severity class for the held-out test set
# using the estimator selected by the grid search
y_pred_best = best_model.predict(X_test)

In [52]:
# Test-set accuracy of the tuned model (this rebinds `accuracy` from the baseline value).
# The recorded output equals the baseline's 87.91%, i.e. tuning did not change
# the test accuracy in that run.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f"accuracy: {accuracy * 100:.2f}%")

accuracy: 87.91%


In [53]:
# Confusion matrix (rows: true class, columns: predicted class) and per-class
# precision/recall/F1 for the tuned model, for comparison with the baseline.
tuned_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_best)
print(tuned_confusion)
tuned_report = classification_report(y_true=y_test, y_pred=y_pred_best)
print(tuned_report)

[[112   0  17   0]
 [  0  36   0   0]
 [ 18   0  95   0]
 [  0   2   0  26]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       129
           1       0.95      1.00      0.97        36
           2       0.85      0.84      0.84       113
           3       1.00      0.93      0.96        28

    accuracy                           0.88       306
   macro avg       0.91      0.91      0.91       306
weighted avg       0.88      0.88      0.88       306



In [None]:
import pickle

# Persist the tuned estimator to disk. The context manager guarantees the file
# handle is flushed and closed even if serialization raises.
# NOTE(review): the fitted `label_encoders` are not saved, so a consumer of
# model.pkl cannot reproduce the feature encoding or decode predicted classes;
# consider persisting them alongside the model.
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Reload the saved estimator to verify the round trip. This rebinds `model`
# from the baseline forest to the tuned one. The context manager closes the
# file handle once loading finishes.
# Security: unpickling executes arbitrary code — only load files you trust.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Class distribution of the target after label encoding. Matching the counts with
# the pre-encoding distribution gives the mapping:
# 0 = Mild, 1 = Moderate, 2 = None-minimal, 3 = Severe
df["anxiety_severity"].value_counts()

anxiety_severity
0    586
2    564
1    248
3    132
Name: count, dtype: int64