In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Load the raw dataset from the Excel workbook.
df = pd.read_excel('schizophrenia.xlsx')

# A column is kept only if at least 60% of its rows are non-missing,
# i.e. columns with more than 40% missing values are dropped.
min_non_missing = len(df) * 0.6
df = df.dropna(axis=1, thresh=min_non_missing)

In [2]:

# Impute missing values column by column:
#   - object (text) columns get the placeholder 'UNKNOWN'
#   - every other column gets its own median
# The loop covers every column of df.
for column in df.columns:
    series = df[column]
    filler = 'UNKNOWN' if series.dtype == 'object' else series.median()
    df[column] = series.fillna(filler)

In [3]:
# Canonicalise the CLASS labels: trim surrounding whitespace, upper-case,
# then map the known misspellings onto "SCHIZ".
class_aliases = {"SHIZ": "SCHIZ", "SHICZ": "SCHIZ"}
df["CLASS"] = (
    df["CLASS"]
    .str.strip()
    .str.upper()
    .replace(class_aliases)
)

In [4]:
# Drop columns that are not used as model inputs:
#   - YEAR:  dropped because this is not a time-series analysis.
#   - DIAGN: treated as redundant by the original notebook
#            (presumably it overlaps with CLASS -- TODO confirm).
# errors='ignore' makes both drops tolerant of a column that is already
# absent (e.g. removed earlier by the >40%-missing filter), so YEAR no longer
# raises KeyError while DIAGN is silently skipped.
df = df.drop(columns=['YEAR', 'DIAGN'], errors='ignore')


In [5]:
# Integer-encode every object (text) column with its own LabelEncoder and
# keep the fitted encoders, keyed by column name, for later reuse.
label_encoders = {}

object_columns = [name for name in df.columns if df[name].dtype == 'object']
for col in object_columns:
    le = LabelEncoder()
    # Cast to str first so mixed-type entries are encoded uniformly.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [6]:
# Separate the target (CLASS) from the feature matrix (all other columns).
target_column = 'CLASS'
y = df[target_column]
X = df.drop(columns=[target_column])


In [7]:
# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed for a repeatable split.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [8]:
# Random forest with default hyper-parameters; the seed is fixed so repeated
# runs build the same forest.
forest_params = {"random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [9]:
# Persist the fitted classifier together with the per-column LabelEncoders
# in a single file so both can be loaded back as one dictionary.
artifact = {
    "model": model,
    "encoders": label_encoders,
}
joblib.dump(artifact, "schizo_model.pkl")

['schizo_model.pkl']

In [10]:
# Predict CLASS for the held-out test features produced by train_test_split.
y_pred = model.predict(X_test)

In [11]:
# Compute the evaluation metrics on the held-out test split, then print them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

Accuracy: 0.9172932330827067

Confusion Matrix:
 [[26 10]
 [ 1 96]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.72      0.83        36
           1       0.91      0.99      0.95        97

    accuracy                           0.92       133
   macro avg       0.93      0.86      0.89       133
weighted avg       0.92      0.92      0.91       133

