In [15]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#importing data file
df = pd.read_csv(r'C:\Users\nihca\OneDrive\Documents\vscode\edaproject\Final_Augmented_dataset_Diseases_and_Symptoms.csv')
df=df[df['diseases'].notnull()]
df

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246940,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246941,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246942,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246943,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#assigning numerical value for every disease
le=LabelEncoder()
df["diseases_encoded"]=le.fit_transform(df["diseases"])
df["diseases_encoded"]

0         531
1         531
2         531
3         531
4         531
         ... 
246940    506
246941    506
246942    506
246943    506
246944    506
Name: diseases_encoded, Length: 246945, dtype: int32

In [4]:
#applying scaler to normalize numerical value
scaler=StandardScaler()
scaled_data=scaler.fit_transform(df.drop(columns=["diseases","diseases_encoded"]))
scaled_data

array([[ 4.94844929, -0.21131779,  3.25095032, ...,  0.        ,
        -0.00853792,  0.        ],
       [-0.20208351, -0.21131779,  3.25095032, ...,  0.        ,
        -0.00853792,  0.        ],
       [ 4.94844929,  4.73220926,  3.25095032, ...,  0.        ,
        -0.00853792,  0.        ],
       ...,
       [-0.20208351, -0.21131779, -0.30760236, ...,  0.        ,
        -0.00853792,  0.        ],
       [-0.20208351, -0.21131779, -0.30760236, ...,  0.        ,
        -0.00853792,  0.        ],
       [-0.20208351, -0.21131779, -0.30760236, ...,  0.        ,
        -0.00853792,  0.        ]])

In [5]:
X = scaled_data
y = df["diseases_encoded"]

In [6]:
#splitting training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
#applying class weights to train model according to the frequency 
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [8]:
#model definition
rf_model = RandomForestClassifier(
    n_estimators=100,  
    max_depth=60,    
    min_samples_split=2,
    class_weight=class_weight_dict,  
    random_state=42
)
rf_model.fit(X_train, y_train)

In [9]:
y_pred = rf_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8342


In [None]:
#joblib.dump(rf_model, 'random_forest_model.joblib')

['____random_forest_model.joblib']

In [None]:
#joblib.dump(scaler, 'scaler.joblib')
#joblib.dump(le, '____label_encoder.joblib')

['____label_encoder.joblib']