<a href="https://colab.research.google.com/github/mahnoormirjat11/alpha-Thalassemia-disease-prediction-/blob/main/training_train_model_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# ============================================
#   ALPHA-THALASSEMIA TRAINING PIPELINE
# ============================================

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

# --- Load CSV
file_path = "/content/alphanorm.csv"
df = pd.read_csv(file_path)

print("Dataset loaded:", file_path)
print("Shape:", df.shape)
print(df.head())

# --- Target inference
candidate_cols = []
for col in df.columns:
    if col.lower() in ["id","index","patient_id"]:
        continue
    uniq = df[col].nunique()
    if 2 <= uniq <= 10:
        candidate_cols.append(col)

preferred = ["target","label","thal","alpha","class","result","diagnosis"]
target = None
for p in preferred:
    for c in df.columns:
        if p in c.lower():
            target = c
            break
    if target:
        break

if not target:
    target = candidate_cols[0]

print("Detected target column:", target)
print(df[target].value_counts())

# --- Split
X = df.drop(columns=[target])
y = df[target]

# Encode labels if categorical
if y.dtype == object:
    y_encoded, uniques = pd.factorize(y)
    y = pd.Series(y_encoded)
    label_map = dict(enumerate(uniques))
    print("Label mapping:", label_map)
else:
    label_map = None

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Identify numeric & categorical columns
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

# Preprocessor
numeric = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric, num_cols),
    ("cat", categorical, cat_cols)
])

# ============================================
#       RANDOM FOREST MODEL
# ============================================
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_pipeline.fit(X_train, y_train)

# Evaluation
pred = rf_pipeline.predict(X_test)
print("\nRandomForest Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))

# Save RF model
os.makedirs("/content/artifacts", exist_ok=True)
joblib.dump(rf_pipeline, "/content/artifacts/alpha_rf.pkl")
print("\nSaved RandomForest â†’ /content/artifacts/alpha_rf.pkl")



# Save metadata
meta = {
    "target": target,
    "label_map": label_map,
    "numeric_columns": num_cols,
    "categorical_columns": cat_cols
}
joblib.dump(meta, "/content/artifacts/meta.pkl")
print("Metadata saved â†’ /content/artifacts/meta.pkl")

print("\nðŸŽ‰ Training complete!")

Dataset loaded: /content/alphanorm.csv
Shape: (203, 16)
      sex    hb   pcv   rbc   mcv   mch  mchc   rdw   wbc  neut  lymph    plt  \
0  female  10.8  35.2  5.12  68.7  21.2  30.8  13.4   9.6  53.0   33.0  309.0   
1    male  10.8  26.6  4.28  62.1  25.3  40.8  19.8  10.3  49.4   43.1  687.0   
2  female  10.8  35.2  5.12  68.7  21.2  30.8  13.4   9.6  53.0   33.0  309.0   
3    male  14.5  43.5  5.17  84.0  28.0  33.4  12.1  11.9  31.0   50.0  334.0   
4    male  11.5  34.4  5.02  68.7  22.9  33.4  15.7  20.4  67.0   30.0  596.0   

    hba  hba2   hbf      phenotype  
0  88.5   2.6  0.11  alpha carrier  
1  87.8   2.4  0.90  alpha carrier  
2  88.5   2.6  0.10  alpha carrier  
3  86.8   2.8  0.30  alpha carrier  
4  86.3   2.4  1.30  alpha carrier  
Detected target column: sex
sex
male      112
female     91
Name: count, dtype: int64
Label mapping: {0: 'female', 1: 'male'}

RandomForest Accuracy: 0.7317073170731707

Classification Report:
               precision    recall  f1-sco

In [8]:
from google.colab import files
files.download("/content/artifacts/alpha_rf.pkl")

files.download("/content/artifacts/meta.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>