<a href="https://colab.research.google.com/github/mollah2022/Neural-Network/blob/main/CSV_Disorder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [91]:
# ===============================================
# ✅ XGBoost for Genetic Disorder Prediction
# ===============================================

!pip install -q xgboost scikit-learn imbalanced-learn category_encoders

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# === File Paths ===
TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/datasetDisorder/train.csv"
TEST_PATH  = "/content/drive/MyDrive/Colab Notebooks/datasetDisorder/test.csv"

# === Load Dataset ===
df = pd.read_csv(TRAIN_PATH)
TARGET = "Genetic Disorder"
y = df[TARGET].astype(str)

# === Drop unnecessary columns ===
drop_cols = [
    "Patient Id", "Patient First Name", "Family Name", "Father's name",
    "Institute Name", "Location of Institute"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')

X = df.drop(columns=[TARGET])

# === Identify numeric & categorical columns ===
num_cols = [c for c in X.columns if X[c].dtype != 'object' and X[c].nunique() > 2]
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or X[c].nunique() <= 20]

# === Impute missing values ===
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# === Encode categorical columns ===
encoder = ce.TargetEncoder(cols=cat_cols, smoothing=0.3)
X[cat_cols] = encoder.fit_transform(X[cat_cols], y)

# === Scale numeric columns ===
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# === Encode target labels ===
le = LabelEncoder()
y_enc = le.fit_transform(y)

# === Balance dataset with SMOTE ===
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y_enc)
print("After SMOTE:", X_res.shape, "Class counts:", np.bincount(y_res))

# === Split Train/Validation ===
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# === Build XGBoost Model ===
model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_res)),
    learning_rate=0.05,
    n_estimators=500,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# === Train Model ===
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=50)

# === Evaluate on Validation Set ===
val_preds = model.predict(X_val)
print("\n✅ Validation Accuracy:", accuracy_score(y_val, val_preds))
print("\nClassification Report (Validation):\n", classification_report(y_val, val_preds, target_names=le.classes_))


After SMOTE: (40808, 38) Class counts: [10202 10202 10202 10202]
[0]	validation_0-mlogloss:1.33634
[50]	validation_0-mlogloss:0.50365
[100]	validation_0-mlogloss:0.39204
[150]	validation_0-mlogloss:0.36159
[200]	validation_0-mlogloss:0.34722
[250]	validation_0-mlogloss:0.33654
[300]	validation_0-mlogloss:0.32711
[350]	validation_0-mlogloss:0.31875
[400]	validation_0-mlogloss:0.31191
[450]	validation_0-mlogloss:0.30541
[499]	validation_0-mlogloss:0.30107

✅ Validation Accuracy: 0.9016172506738545

Classification Report (Validation):
                                               precision    recall  f1-score   support

 Mitochondrial genetic inheritance disorders       0.83      1.00      0.91      2043
Multifactorial genetic inheritance disorders       0.96      0.98      0.97      2072
            Single-gene inheritance diseases       0.91      0.88      0.89      2047
                                         nan       0.92      0.75      0.83      2000

                             

In [93]:
# ===============================================
# ✅ Simple ANN Model for Genetic Disorder Prediction
# ===============================================

!pip install -q tensorflow scikit-learn imbalanced-learn category_encoders

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# === File Paths ===
TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/datasetDisorder/train.csv"
TEST_PATH  = "/content/drive/MyDrive/Colab Notebooks/datasetDisorder/test.csv"

# === Load Dataset ===
df = pd.read_csv(TRAIN_PATH)
TARGET = "Genetic Disorder"
y = df[TARGET].astype(str)

# === Drop unnecessary columns ===
drop_cols = [
    "Patient Id", "Patient First Name", "Family Name", "Father's name",
    "Institute Name", "Location of Institute"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')

X = df.drop(columns=[TARGET])

# === Identify numeric & categorical columns ===
num_cols = [c for c in X.columns if X[c].dtype != 'object' and X[c].nunique() > 2]
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or X[c].nunique() <= 20]

# === Impute missing values ===
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# === Encode categorical columns ===
encoder = ce.TargetEncoder(cols=cat_cols, smoothing=0.3)
X[cat_cols] = encoder.fit_transform(X[cat_cols], y)

# === Scale numeric columns ===
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# === Encode target labels ===
le = LabelEncoder()
y_enc = le.fit_transform(y)
y_cat = to_categorical(y_enc)  # One-hot encoding for ANN

# === Balance dataset with SMOTE ===
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y_cat)
print("After SMOTE:", X_res.shape, "Class counts:", np.sum(y_res, axis=0))

# === Split Train/Validation ===
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# === Build ANN Model ===
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# === Train ANN ===
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=64, verbose=2)

# === Evaluate on Validation Set ===
val_preds = model.predict(X_val)
val_preds_labels = np.argmax(val_preds, axis=1)
y_val_labels = np.argmax(y_val, axis=1)

print("\n✅ Validation Accuracy:", accuracy_score(y_val_labels, val_preds_labels))
print("\nClassification Report (Validation):\n", classification_report(y_val_labels, val_preds_labels, target_names=le.classes_))


After SMOTE: (40808, 38) Class counts: [10202 10202 10202 10202]


Epoch 1/30
511/511 - 3s - 7ms/step - accuracy: 0.6142 - loss: 0.9261 - val_accuracy: 0.7208 - val_loss: 0.6536
Epoch 2/30
511/511 - 2s - 3ms/step - accuracy: 0.6999 - loss: 0.6933 - val_accuracy: 0.7193 - val_loss: 0.6418
Epoch 3/30
511/511 - 2s - 4ms/step - accuracy: 0.7061 - loss: 0.6769 - val_accuracy: 0.7214 - val_loss: 0.6366
Epoch 4/30
511/511 - 3s - 6ms/step - accuracy: 0.7079 - loss: 0.6705 - val_accuracy: 0.7218 - val_loss: 0.6310
Epoch 5/30
511/511 - 2s - 4ms/step - accuracy: 0.7067 - loss: 0.6671 - val_accuracy: 0.7209 - val_loss: 0.6286
Epoch 6/30
511/511 - 2s - 4ms/step - accuracy: 0.7076 - loss: 0.6633 - val_accuracy: 0.7120 - val_loss: 0.6336
Epoch 7/30
511/511 - 2s - 4ms/step - accuracy: 0.7092 - loss: 0.6591 - val_accuracy: 0.7215 - val_loss: 0.6221
Epoch 8/30
511/511 - 2s - 4ms/step - accuracy: 0.7084 - loss: 0.6582 - val_accuracy: 0.7224 - val_loss: 0.6266
Epoch 9/30
511/511 - 2s - 4ms/step - accuracy: 0.7088 - loss: 0.6552 - val_accuracy: 0.7200 - val_loss: 0.6267
E

In [95]:
# ===============================================
# ✅ ML Models Comparison for Genetic Disorder Prediction
# Using: Logistic Regression, Random Forest, XGBoost
# ===============================================

!pip install -q xgboost scikit-learn imbalanced-learn category_encoders

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# === File Paths ===
TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/datasetDisorder/train.csv"

# === Load Dataset ===
df = pd.read_csv(TRAIN_PATH)
TARGET = "Genetic Disorder"
y = df[TARGET].astype(str)

# === Drop unnecessary columns ===
drop_cols = [
    "Patient Id", "Patient First Name", "Family Name", "Father's name",
    "Institute Name", "Location of Institute"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')
X = df.drop(columns=[TARGET])

# === Identify numeric & categorical columns ===
num_cols = [c for c in X.columns if X[c].dtype != 'object' and X[c].nunique() > 2]
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or X[c].nunique() <= 20]

# === Impute missing values ===
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# === Encode categorical columns ===
encoder = ce.TargetEncoder(cols=cat_cols, smoothing=0.3)
X[cat_cols] = encoder.fit_transform(X[cat_cols], y)

# === Scale numeric columns ===
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# === Encode target labels ===
le = LabelEncoder()
y_enc = le.fit_transform(y)

# === Balance dataset with SMOTE ===
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y_enc)
print("After SMOTE:", X_res.shape, "Class counts:", np.bincount(y_res))

# === Split Train/Validation ===
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# === Models Dictionary ===
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'),
    "Random Forest": RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42),
    "XGBoost": XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_res)), learning_rate=0.05, n_estimators=500, max_depth=6, eval_metric='mlogloss', use_label_encoder=False, random_state=42)
}

# === Train & Evaluate All Models ===
for name, model in models.items():
    print(f"\n=== Training {name} ===")
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    print(f"✅ Validation Accuracy: {acc:.4f}")
    print(f"Classification Report (Validation):\n{classification_report(y_val, preds, target_names=le.classes_)}")


After SMOTE: (40808, 38) Class counts: [10202 10202 10202 10202]

=== Training Logistic Regression ===
✅ Validation Accuracy: 0.6774
Classification Report (Validation):
                                              precision    recall  f1-score   support

 Mitochondrial genetic inheritance disorders       0.60      0.92      0.72      2043
Multifactorial genetic inheritance disorders       0.92      0.79      0.85      2072
            Single-gene inheritance diseases       0.74      0.91      0.82      2047
                                         nan       0.22      0.08      0.12      2000

                                    accuracy                           0.68      8162
                                   macro avg       0.62      0.67      0.63      8162
                                weighted avg       0.62      0.68      0.63      8162


=== Training Random Forest ===
✅ Validation Accuracy: 0.8714
Classification Report (Validation):
                                          