<a href="https://colab.research.google.com/github/meetmehedi/datas/blob/main/Female_revision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
"""
validate_and_generate_figures.py

Run: python validate_and_generate_figures.py
(Or open in Google Colab and run cells; adjust paths accordingly.)

Outputs (in ./validation_results):
 - cv_summary.csv           : table of mean/std/95%CI for Accuracy / F1 / AUC (no-resample & with-resample)
 - confusion_matrices.json  : confusion matrices on holdout for key models
 - perm_<model>.csv         : permutation importance (top features)
 - shap_<model>_summary.png : SHAP summary (if shap available) or permutation bar plot
 - policy_simulation.json   : toy policy simulation findings
 - figures/*.png            : confusion matrices and importance plots
"""

import os, json, math
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

# Optional imports
try:
    import xgboost as xgb; HAS_XGB=True
except: HAS_XGB=False
try:
    import lightgbm as lgb; HAS_LGB=True
except: HAS_LGB=False
try:
    import catboost as cb; HAS_CAT=True
except: HAS_CAT=False
try:
    from imblearn.over_sampling import SMOTE
    HAS_IMBLEARN=True
except:
    HAS_IMBLEARN=False
try:
    import shap; HAS_SHAP=True
except:
    HAS_SHAP=False

# ---------- CONFIG ----------
INPUT_CSV = "/content/dataf n.csv"          # update path if needed
OUT_DIR = "validation_results"
FOLDS = 5                          # 5-fold CV for reasonable runtime
BOOTSTRAP_ITERS = 500              # bootstrap iterations for 95% CI (reduce if you need faster)
RANDOM_STATE = 0
# -----------------------------

os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUT_DIR, "figures"), exist_ok=True)

# Load
df = pd.read_csv(INPUT_CSV)
# Auto-detect target
target_col = None
for c in df.columns:
    low = c.lower()
    if "intend" in low and "technology" in low:
        target_col = c; break
if target_col is None:
    # fallback - pick candidate with "intend" in name
    for c in df.columns:
        if "intend" in c.lower():
            target_col = c; break
if target_col is None:
    raise ValueError("Target column not found. Columns: " + ", ".join(df.columns[:30]))

# Drop obvious identifiers
drop_cols = [c for c in df.columns if "timestamp" in c.lower() or "email" in c.lower() or "name" in c.lower() or "enter your" in c.lower()]
df_clean = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Binarize the target: Yes -> 1, others -> 0
y = df_clean[target_col].astype(str).str.lower().str.contains("yes").astype(int)

# Feature selection heuristic (same as earlier): pick informative survey columns
candidates = []
for col in df_clean.columns:
    low = col.lower()
    if col == target_col:
        continue
    if any(k in low for k in ["age","gender","living","residence","income","cgpa","prior","program","family encourage","parent","hours","proficiency","interest in technology"]):
        candidates.append(col)
# fallback: take first 12 non-free-text columns
if len(candidates) < 6:
    candidates = [c for c in df_clean.columns if c != target_col][:12]

X = df_clean[candidates].copy()
# Clean textual values for categorical columns
for c in X.select_dtypes(include='object').columns:
    X[c] = X[c].fillna("missing").astype(str).str.strip().str.replace(r"\s+", "_", regex=True).str.lower()

# Identify feature types
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()

# Preprocessor
cat_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                            ('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median')),
                            ('scaler', StandardScaler())])
preprocessor = ColumnTransformer([('num', num_transformer, num_cols),
                                  ('cat', cat_transformer, cat_cols)])

# Models (attempt to include everything reviewer mentioned)
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear', random_state=RANDOM_STATE),
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, class_weight='balanced'),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=RANDOM_STATE)
}
if HAS_XGB:
    models['XGBoost'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
if HAS_LGB:
    models['LightGBM'] = lgb.LGBMClassifier(random_state=RANDOM_STATE)
if HAS_CAT:
    models['CatBoost'] = cb.CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)

# Utility functions
def stratified_cv_metrics(pipe, X, y, folds=5):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    accs, f1s, aucs = [], [], []
    for tr, te in skf.split(X, y):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y.iloc[tr], y.iloc[te]

        # Check if the classifier is GaussianNB and handle sparse data
        if isinstance(pipe.named_steps['clf'], GaussianNB):
            pre = pipe.named_steps['pre']
            Xtr_t = pre.transform(Xtr).toarray()
            Xte_t = pre.transform(Xte).toarray()
            clf = pipe.named_steps['clf']
            clf.fit(Xtr_t, ytr)
            yp = clf.predict(Xte_t)
            prob = clf.predict_proba(Xte_t)[:,1] if hasattr(clf, 'predict_proba') else np.full(len(yte), np.nan)

        else:
            pipe.fit(Xtr, ytr)
            yp = pipe.predict(Xte)
            prob = pipe.predict_proba(Xte)[:,1] if hasattr(pipe, 'predict_proba') else np.full(len(yte), np.nan)

        accs.append(accuracy_score(yte, yp))
        f1s.append(f1_score(yte, yp, zero_division=0))
        try:
             aucs.append(roc_auc_score(yte, prob))
        except ValueError:
             aucs.append(float('nan')) # Handle case where only one class is present

    return np.array(accs), np.array(f1s), np.array(aucs)

def bootstrap_ci(arr, iters=500):
    arr = np.asarray(arr, dtype=np.float64)  # Convert to NumPy array, coerce errors to NaN
    arr = arr[~np.isnan(arr)]
    if len(arr)==0:
        return (float('nan'), float('nan'))
    bs = [np.mean(np.random.choice(arr, size=len(arr), replace=True)) for _ in range(iters)]
    return (np.percentile(bs, 2.5), np.percentile(bs, 97.5))

# Run experiments: No-resampling and Resampling (SMOTE if available; otherwise simple upsampling)
summary_rows = []
trained_holdout_models = {}
X_full = X.copy()
y_full = y.copy()

for scenario in ['no_resample', 'resample']:
    for mname, m in models.items():
        pipe = Pipeline([('pre', preprocessor), ('clf', m)])
        if scenario == 'no_resample':
            accs, f1s, aucs = stratified_cv_metrics(pipe, X_full, y_full, folds=FOLDS)
        else:
            # manual resampling within CV folds (safe, explicit)
            skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
            accs, f1s, aucs = [], [], []
            for tr, te in skf.split(X_full, y_full):
                Xtr, Xte = X_full.iloc[tr].copy(), X_full.iloc[te].copy()
                ytr, yte = y_full.iloc[tr].copy(), y_full.iloc[te].copy()
                train = pd.concat([Xtr, ytr.rename('target')], axis=1)
                maj = train[train['target']==train['target'].mode()[0]]
                minr = train[train['target']!=train['target'].mode()[0]]
                if len(minr)==0:
                    Xres, yres = Xtr, ytr
                else:
                    if HAS_IMBLEARN:
                        from imblearn.over_sampling import SMOTE
                        sm = SMOTE(random_state=RANDOM_STATE)
                        # apply SMOTE to numeric encoding only; but easiest here is to one-hot transform first
                        # we will fit the pipeline's preprocessor on Xtr, transform, SMOTE the array, then fit the classifier
                        pre = preprocessor.fit(Xtr, ytr)
                        Xtr_t = pre.transform(Xtr)
                        X_res_t, y_res = sm.fit_resample(Xtr_t, ytr)
                        # Now create a pipeline that uses the already-fitted preprocessor
                        # We'll fit classifier directly on transformed features by bypassing pipeline for CV folds
                        clf = m
                        try:
                            # Handle sparse data for GaussianNB after SMOTE
                            if isinstance(clf, GaussianNB):
                                clf.fit(X_res_t.toarray(), y_res)
                            else:
                                clf.fit(X_res_t, y_res)

                            # evaluate: transform Xte
                            Xte_t = pre.transform(Xte)
                            # Handle sparse data for GaussianNB prediction
                            if isinstance(clf, GaussianNB):
                                yp = clf.predict(Xte_t.toarray())
                                prob = clf.predict_proba(Xte_t.toarray())[:,1] if hasattr(clf, 'predict_proba') else np.full(len(yte), np.nan)
                            else:
                                yp = clf.predict(Xte_t)
                                prob = clf.predict_proba(Xte_t)[:,1] if hasattr(clf, 'predict_proba') else np.full(len(yte), np.nan)

                            accs.append(accuracy_score(yte, yp))
                            f1s.append(f1_score(yte, yp, zero_division=0))
                            try:
                                 aucs.append(roc_auc_score(yte, prob))
                            except ValueError:
                                 aucs.append(float('nan')) # Handle case where only one class is present
                            continue
                        except Exception:
                            # fall back to simple upsample below
                            pass
                    # simple upsampling fallback
                    from sklearn.utils import resample
                    minr_up = resample(minr, replace=True, n_samples=len(maj), random_state=RANDOM_STATE)
                    train_res = pd.concat([maj, minr_up])
                    yres = train_res['target']
                    Xres = train_res.drop(columns=['target'])
                    pipe.fit(Xres, yres)
                    yp = pipe.predict(Xte)
                    prob = pipe.predict_proba(Xte)[:,1] if hasattr(pipe, 'predict_proba') else np.full(len(yte), np.nan)
                    accs.append(accuracy_score(yte, yp))
                    f1s.append(f1_score(yte, yp, zero_division=0))
                    try:
                        aucs.append(roc_auc_score(yte, prob))
                    except ValueError:
                        aucs.append(float('nan')) # Handle case where only one class is present

                    continue
        # metrics summary
        acc_mean, acc_std = float(np.nanmean(accs)), float(np.nanstd(accs))
        f1_mean, f1_std = float(np.nanmean(f1s)), float(np.nanstd(f1s))
        auc_mean, auc_std = float(np.nanmean(aucs)), float(np.nanstd(aucs))
        acc_ci = bootstrap_ci(accs, iters=BOOTSTRAP_ITERS)
        f1_ci = bootstrap_ci(f1s, iters=BOOTSTRAP_ITERS)
        auc_ci = bootstrap_ci(aucs[~np.isnan(aucs)], iters=BOOTSTRAP_ITERS) if not np.all(np.isnan(aucs)) else (float('nan'), float('nan'))
        summary_rows.append({
            'scenario': scenario,
            'model': mname,
            'accuracy_mean': acc_mean, 'accuracy_std': acc_std, 'accuracy_ci95': acc_ci,
            'f1_mean': f1_mean, 'f1_std': f1_std, 'f1_ci95': f1_ci,
            'auc_mean': auc_mean, 'auc_std': auc_std, 'auc_ci95': auc_ci
        })

# Save summary CSV
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(os.path.join(OUT_DIR, "cv_summary.csv"), index=False)

# Simple train-test split to create example confusion matrices (and permutation importance)
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, stratify=y_full, test_size=0.2, random_state=RANDOM_STATE)

confusion_dict = {}
perm_files = []
for mname, m in models.items():
    pipe = Pipeline([('pre', preprocessor), ('clf', m)])
    # fit with simple upsampling to help minority capture
    train = pd.concat([X_train, y_train.rename('target')], axis=1)
    maj = train[train['target']==train['target'].mode()[0]]
    minr = train[train['target']!=train['target'].mode()[0]]
    if len(minr)>0:
        from sklearn.utils import resample
        minr_up = resample(minr, replace=True, n_samples=len(maj), random_state=RANDOM_STATE)
        train_res = pd.concat([maj, minr_up])
        yres = train_res['target']; Xres = train_res.drop(columns=['target'])

        # Handle sparse data for GaussianNB during holdout training
        if isinstance(pipe.named_steps['clf'], GaussianNB):
            pre = pipe.named_steps['pre']
            Xres_t = pre.transform(Xres).toarray()
            clf = pipe.named_steps['clf']
            clf.fit(Xres_t, yres)
        else:
            pipe.fit(Xres, yres)
    else:
        # Handle sparse data for GaussianNB during holdout training (no resampling)
        if isinstance(pipe.named_steps['clf'], GaussianNB):
            pre = pipe.named_steps['pre']
            X_train_t = pre.transform(X_train).toarray()
            clf = pipe.named_steps['clf']
            clf.fit(X_train_t, y_train)
        else:
            pipe.fit(X_train, y_train)

    # Handle sparse data for GaussianNB prediction on holdout
    if isinstance(pipe.named_steps['clf'], GaussianNB):
        pre = pipe.named_steps['pre']
        X_test_t = pre.transform(X_test).toarray()
        clf = pipe.named_steps['clf']
        yp = clf.predict(X_test_t)
    else:
        yp = pipe.predict(X_test)

    cm = confusion_matrix(y_test, yp)
    confusion_dict[mname] = cm.tolist()
    # permutation importance
    try:
        Xt = pipe.named_steps['pre'].transform(X_test)
        # Handle sparse data for permutation importance
        if isinstance(pipe.named_steps['clf'], GaussianNB):
             Xt = Xt.toarray()
        clf = pipe.named_steps['clf']

        imp = permutation_importance(clf, Xt, y_test, n_repeats=10, random_state=RANDOM_STATE)
        try:
            feat_names = pipe.named_steps['pre'].get_feature_names_out()
        except:
            feat_names = [f"f{i}" for i in range(len(imp.importances_mean))]
        imp_df = pd.DataFrame({'feature': feat_names, 'importance_mean': imp.importances_mean, 'importance_std': imp.importances_std})
        imp_df = imp_df.sort_values('importance_mean', ascending=False).head(30)
        fname = os.path.join(OUT_DIR, f"perm_{mname}.csv")
        imp_df.to_csv(fname, index=False)
        perm_files.append(fname)
        # plot top 10
        top = imp_df.head(10).iloc[::-1]
        plt.figure(figsize=(6,4))
        plt.barh(top['feature'], top['importance_mean'])
        plt.xlabel("Permutation importance (mean)")
        plt.title(f"Permutation importance - {mname}")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR, "figures", f"perm_{mname}.png"), dpi=300)
        plt.close()
    except Exception as e:
        pass

with open(os.path.join(OUT_DIR, "confusion_matrices.json"), "w") as f:
    json.dump(confusion_dict, f, indent=2)

# POLICY SIMULATION (toy): increase rural->urban for 10% of rural respondents in test set and measure predicted mean prob change using LogisticRegression
living_col = None
for c in X.columns:
    if "living" in c.lower() or "residence" in c.lower():
        living_col = c; break
policy = None
if living_col and 'LogisticRegression' in models:
    pipe_lr = Pipeline([('pre', preprocessor), ('clf', models['LogisticRegression'])])
    pipe_lr.fit(X_train, y_train)
    Xtest_copy = X_test.copy()
    rural_mask = Xtest_copy[living_col].astype(str).str.contains("rural")
    n_rural = int(rural_mask.sum())
    if n_rural>0:
        idx = Xtest_copy[rural_mask].sample(frac=0.1, random_state=RANDOM_STATE).index
        mean_before = float(pipe_lr.predict_proba(Xtest_copy)[:,1].mean())
        X_alt = Xtest_copy.copy()
        X_alt.loc[idx, living_col] = "urban"
        mean_after = float(pipe_lr.predict_proba(X_alt)[:,1].mean())
        policy = {'living_col': living_col, 'n_rural': n_rural, 'changed': int(len(idx)), 'mean_prob_before': mean_before, 'mean_prob_after': mean_after, 'delta': mean_after-mean_before}
        with open(os.path.join(OUT_DIR, "policy_simulation.json"), "w") as f:
            json.dump(policy, f, indent=2)

print("Done. Outputs saved in:", OUT_DIR)
print("If shap is installed, you can rerun with SHAP explainers for richer feature explainability.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 120, number of negative: 30
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.800000 -> initscore=1.386294
[LightGBM] [Info] Start training from score 1.386294
[LightGBM] [Info] Number of positive: 120, number of negative: 30
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.800000 -> initscore=1.386294
[LightGBM] [Info] Sta



[LightGBM] [Info] Number of positive: 120, number of negative: 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 104
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.794702 -> initscore=1.353505
[LightGBM] [Info] Start training from score 1.353505




TypeError: only integer scalar arrays can be converted to a scalar index