In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw table once; the cleaned working frame is derived from it below.
raw_df = pd.read_csv("adult.csv")


# choosing the discrete features
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

target_col = "income"

selected_cols = categorical_features + [target_col]

# Take an explicit copy so the column assignments below act on an independent
# frame instead of on a selection of raw_df.
df = raw_df[selected_cols].copy()


# making ? a separate category to include it in calculations.
# This has to happen BEFORE the string cast: astype(str) can turn a missing
# cell into the literal text "nan", after which fillna has nothing left to fill
# and missing values would end up in a "nan" category instead of "?".
df = df.fillna("?")

# stripping whitespace from categorical features and target column
for col in selected_cols:
    df[col] = df[col].astype(str).str.strip()


In [None]:
# Column name -> {category string: integer code}; one entry is added per
# categorical column by the encoding loop further down in this cell.
feature_mappings = {}

def encode_column(series):
    """
    Integer-encode a pandas Series of category strings.

    The distinct values are put in sorted order and each one is replaced by
    its position in that order (0, 1, 2, ...), so the same set of categories
    always produces the same codes.

    Returns a tuple ``(encoded, mapping)`` where ``encoded`` is the Series of
    integer codes and ``mapping`` is the ``{category: code}`` dict used.
    """
    categories = list(series.unique())
    categories.sort()  # fixed ordering -> reproducible codes
    mapping = dict(zip(categories, range(len(categories))))
    return series.map(mapping), mapping

# Encode all categorical features.
# NOTE: this overwrites df in place, so running the cell a second time would
# re-encode the integer codes and replace the string -> code mappings.
for col in categorical_features:
    encoded_col, mapping = encode_column(df[col])
    df[col] = encoded_col
    feature_mappings[col] = mapping

# Encode target (income) to 0/1
# Example: '<=50K' -> 0, '>50K' -> 1
target_values = sorted(df[target_col].unique())
target_mapping = {value: idx for idx, value in enumerate(target_values)}
df[target_col] = df[target_col].map(target_mapping)

# Number of target classes. The Naive Bayes training cells further down pass
# `n_classes` to the trainer, so it has to exist before they run.
n_classes = len(target_mapping)

print("Target mapping:", target_mapping)
print("\nFeature mappings example (first feature):")
first_feat = categorical_features[0]
print(first_feat, ":", feature_mappings[first_feat])


#  Split into 70% / 15% / 15% (stratified)

X = df[categorical_features].values
y = df[target_col].values

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
# Stratifying on y keeps the class proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: cut the held-out 30% in half (0.15 / 0.30 = 0.5), giving a
# validation set and a test set of 15% each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("\nShapes:")
for split_label, X_part, y_part in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(split_label, X_part.shape, y_part.shape)


# 6) Analyze class distribution
def describe_class_distribution(y, name, label_mapping=None):
    """
    Print the number and percentage of samples per class in ``y``.

    Parameters
    ----------
    y : array-like
        Encoded class labels of one data split.
    name : str
        Name of the split; used only in the printed header.
    label_mapping : dict, optional
        ``{original label: encoded label}``. It is inverted to show the
        original label next to each encoded class. When omitted, the
        module-level ``target_mapping`` is used.

    Returns
    -------
    None. The report is written to stdout.
    """
    if label_mapping is None:
        label_mapping = target_mapping

    # Invert once, outside the loop: encoded label -> original label.
    decode = {code: label for label, code in label_mapping.items()}

    # One pass over y; sort_index() lists the classes in ascending order.
    class_counts = pd.Series(y).value_counts().sort_index()
    total = len(y)

    print(f"\nClass distribution in {name}:")
    for cls, cnt in class_counts.items():
        pct = cnt / total * 100
        print(f"  Class {cls} ({decode[cls]}): {cnt} samples ({pct:.2f}%)")

# Report the class balance of each of the three splits in turn.
for split_labels, split_name in (
    (y_train, "TRAIN"),
    (y_val, "VALIDATION"),
    (y_test, "TEST"),
):
    describe_class_distribution(split_labels, split_name)


#  Analyze feature–target relationships (computed on the full dataset, not only the training split)


# Work on a copy whose target column holds the original income labels again,
# so the crosstab columns are readable.
income_labels = {code: label for label, code in target_mapping.items()}
full_df = df.copy()
full_df[target_col] = full_df[target_col].map(income_labels)

print("\nFeature–target relationships (examples):")

for col in ("education", "marital-status", "workclass"):
    print(f"\n=== {col} vs income ===")
    # Translate the integer codes back to category names for display.
    code_to_category = dict(
        (code, category) for category, code in feature_mappings[col].items()
    )
    readable = full_df[col].map(code_to_category)

    # normalize="index" makes every row sum to 1; scale to percentages.
    row_pct = pd.crosstab(readable, full_df[target_col], normalize="index").mul(100)
    print(row_pct.round(2))

In [None]:
#B2 

import numpy as np

def train_naive_bayes_discrete(X_train, y_train, n_classes, alpha=1.0, n_values_per_feature=None):
    """
    Fit a Naive Bayes classifier on integer-coded categorical features.

    Parameters
    ----------
    X_train : array-like of shape (N, d)
        Non-negative integer category codes, one column per feature.
    y_train : array-like of shape (N,)
        Integer class labels in ``0 .. n_classes - 1``.
    n_classes : int
        Number of classes.
    alpha : float, default 1.0
        Additive (Laplace) smoothing constant, applied to both the class
        priors and the per-feature likelihoods.
    n_values_per_feature : sequence of int, optional
        Vocabulary size V_j of every feature. When omitted, V_j is taken as
        ``max(column j) + 1`` of the training data, which means a code larger
        than any seen during training has no row in the likelihood table.
        Pass the full vocabulary sizes to reserve rows for such codes.

    Returns
    -------
    dict with keys
        ``"class_priors"``          ndarray (n_classes,)
        ``"feature_likelihoods"``   list of d ndarrays, entry j of shape
                                    (V_j, n_classes) holding P(x_j = v | C_k)
        ``"n_values_per_feature"``  list of d ints

    Raises
    ------
    ValueError
        If the shapes of X_train / y_train disagree, a feature code is
        negative, a label is >= n_classes, or n_values_per_feature is too
        short/long or too small for the data.
    """
    # Same integer cast as predict_log_proba, so lists and other integer-like
    # inputs are accepted and can be used for fancy indexing.
    X_train = np.asarray(X_train, dtype=np.int64)
    y_train = np.asarray(y_train, dtype=np.int64)

    N, d = X_train.shape
    if y_train.shape != (N,):
        raise ValueError(
            f"y_train must have shape ({N},) to match X_train, got {y_train.shape}"
        )
    if X_train.size and X_train.min() < 0:
        # A negative code would silently index the tables from the end.
        raise ValueError("feature codes in X_train must be non-negative integers")

    # Class counts and smoothed priors P(C_k)
    class_counts = np.bincount(y_train, minlength=n_classes)  # shape (n_classes,)
    if class_counts.shape[0] != n_classes:
        raise ValueError("y_train contains a label >= n_classes")
    class_priors = (class_counts + alpha) / (N + alpha * n_classes)

    # Number of possible values V_j of each feature (codes start at 0)
    if n_values_per_feature is None:
        n_values_per_feature = [int(X_train[:, j].max()) + 1 for j in range(d)]
    else:
        n_values_per_feature = [int(v) for v in n_values_per_feature]
        if len(n_values_per_feature) != d:
            raise ValueError(
                f"n_values_per_feature has {len(n_values_per_feature)} entries, expected {d}"
            )
        for j, V_j in enumerate(n_values_per_feature):
            if N and X_train[:, j].max() >= V_j:
                raise ValueError(
                    f"feature {j} contains code {int(X_train[:, j].max())} "
                    f"but n_values_per_feature[{j}] is {V_j}"
                )

    # P(x_j = v | C_k) = (count[v, k] + alpha) / (class_counts[k] + alpha * V_j)
    feature_likelihoods = []
    for j, V_j in enumerate(n_values_per_feature):
        counts_j = np.zeros((V_j, n_classes), dtype=np.float64)
        # Unbuffered scatter-add: every (value, class) pair adds 1 to its
        # cell, repeated pairs included.
        np.add.at(counts_j, (X_train[:, j], y_train), 1.0)
        # class_counts broadcasts across the rows, one denominator per class.
        feature_likelihoods.append((counts_j + alpha) / (class_counts + alpha * V_j))

    model = {
        "class_priors": class_priors,
        "feature_likelihoods": feature_likelihoods,
        "n_values_per_feature": n_values_per_feature,
    }
    return model


In [None]:
def predict_log_proba(model, X):
    """
    Compute the unnormalised joint log-score of every class for every row.

    For row x and class k the score is
    ``log P(C_k) + sum_j log P(x_j | C_k)``, read from the tables in ``model``.
    The scores are not normalised across classes.

    Parameters
    ----------
    model : dict
        Must provide ``"class_priors"`` (array of shape (n_classes,)) and
        ``"feature_likelihoods"`` (list with one (V_j, n_classes) array per
        feature).
    X : array-like of shape (N, d)
        Integer category codes; cast to int64.

    Returns
    -------
    ndarray of shape (N, n_classes), dtype float64.

    Raises
    ------
    ValueError
        If X does not have one column per likelihood table.
    IndexError
        If a code in X is negative or has no row in its feature's table.
    """
    class_priors = np.asarray(model["class_priors"], dtype=np.float64)
    feature_likelihoods = model["feature_likelihoods"]

    X = np.asarray(X, dtype=np.int64)
    N, d = X.shape
    if d != len(feature_likelihoods):
        raise ValueError(
            f"X has {d} feature columns but the model has {len(feature_likelihoods)}"
        )

    # Every row starts from the log prior; feature terms are added column-wise.
    log_probs = np.tile(np.log(class_priors), (N, 1))

    for j, probs_j in enumerate(feature_likelihoods):
        probs_j = np.asarray(probs_j)
        codes = X[:, j]
        V_j = probs_j.shape[0]
        # Checked explicitly: a negative code would otherwise wrap around and
        # silently read a row from the end of the table.
        if N and (codes.min() < 0 or codes.max() >= V_j):
            raise IndexError(
                f"feature {j} contains a code outside 0..{V_j - 1}; "
                "the model has no likelihood row for it"
            )
        # The log of the table is taken once per feature, then one row is
        # gathered per sample.
        log_probs += np.log(probs_j)[codes, :]

    return log_probs

def predict(model, X):
    """Return, for every row of X, the index of the class with the highest log-score."""
    return predict_log_proba(model, X).argmax(axis=1)


In [None]:
from sklearn.metrics import accuracy_score

# Candidate smoothing strengths; each one is fitted on the training split and
# scored on the validation split.
# `n_classes` must be defined before this cell runs (it should equal
# len(target_mapping)).
alphas = [0.1, 0.5, 1.0, 2.0, 5.0]
val_results = []

for alpha in alphas:
    model = train_naive_bayes_discrete(X_train, y_train, n_classes, alpha=alpha)
    y_val_pred = predict(model, X_val)
    val_acc = accuracy_score(y_val, y_val_pred)
    print(f"alpha = {alpha}, validation accuracy = {val_acc:.4f}")
    val_results.append((alpha, val_acc))

# max() keeps the first maximal entry, so a tie goes to the alpha listed first.
best_alpha, best_val_acc = max(val_results, key=lambda pair: pair[1])
print("\nBest alpha:", best_alpha, "with validation accuracy:", best_val_acc)


In [None]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# Merge train + val: with alpha chosen, the validation rows are added back so
# the final model is fitted on both splits.
X_train_full = np.concatenate([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val])

# Train final model with best alpha, then predict the held-out test split.
best_model = train_naive_bayes_discrete(
    X_train_full,
    y_train_full,
    n_classes,
    alpha=best_alpha,
)
y_test_pred = predict(best_model, X_test)

def evaluate_all_metrics(y_true, y_pred, name=""):
    """
    Print a classification report and return the headline metrics.

    Printed, in order: accuracy, per-class precision/recall/F1/support,
    macro-averaged precision/recall/F1, the confusion matrix, and
    scikit-learn's text classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    name : str, default ""
        Title printed in the section header.

    Returns
    -------
    dict with keys ``"accuracy"``, ``"precision_macro"``, ``"recall_macro"``,
    ``"f1_macro"`` and ``"confusion_matrix"``.
    """
    print(f"\n===== {name} =====")

    # Sorted set of labels occurring in either array. Passing it explicitly
    # ties each per-class row and each confusion-matrix row/column to a known
    # label, instead of assuming the labels are exactly 0, 1, 2, ...
    labels = np.union1d(y_true, y_pred)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}")

    # Per-class precision/recall/F1
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None
    )
    print("\nPer-class metrics:")
    for label, p, r, f, s in zip(labels, precision, recall, f1, support):
        print(f"Class {label}: precision={p:.4f}, recall={r:.4f}, f1={f:.4f}, support={s}")

    # Macro-averaged metrics (unweighted mean over classes)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro"
    )
    print("\nMacro-averaged:")
    print(f"Precision_macro={precision_macro:.4f}")
    print(f"Recall_macro   ={recall_macro:.4f}")
    print(f"F1_macro       ={f1_macro:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print("\nConfusion matrix (rows=true, cols=pred):")
    print(cm)

    # Optional pretty text report
    print("\nClassification report:")
    print(classification_report(y_true, y_pred))

    return {
        "accuracy": acc,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "confusion_matrix": cm,
    }

# Print the evaluation report for the from-scratch model's test predictions
# and keep the returned summary dict.
metrics_test = evaluate_all_metrics(y_test, y_test_pred, name="Our Naive Bayes (Test)")


In [None]:
from sklearn.naive_bayes import MultinomialNB

# The features are integer *codes* of unordered categories. MultinomialNB
# would read each code as an occurrence count (code 7 = "seven occurrences"),
# which is a different model from the one implemented above. CategoricalNB
# estimates one smoothed probability per (feature, category, class), which is
# the like-for-like scikit-learn reference.
from sklearn.naive_bayes import CategoricalNB

sk_model = CategoricalNB(
    alpha=best_alpha,
    # Size each feature's table by its full vocabulary so that a category
    # absent from the training rows can still be scored at predict time.
    min_categories=[len(feature_mappings[col]) for col in categorical_features],
)
sk_model.fit(X_train_full, y_train_full)
y_test_pred_sk = sk_model.predict(X_test)

metrics_test_sk = evaluate_all_metrics(y_test, y_test_pred_sk, name="sklearn CategoricalNB (Test)")


In [None]:
def log_probs_to_probs(log_probs):
    """
    Convert per-row log-scores into probabilities that sum to 1 per row.

    ``log_probs`` has shape (N, n_classes). Each row's maximum is subtracted
    before exponentiating, so the largest exponent is exp(0) = 1 and very
    negative scores do not underflow the whole row to zero.

    Returns an array of the same shape.
    """
    log_probs = np.asarray(log_probs)
    shifted = log_probs - log_probs.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

# Normalised class probabilities of the final model on the test split.
log_probs_test = predict_log_proba(best_model, X_test)
probs_test = log_probs_to_probs(log_probs_test)  # shape (N, n_classes)

# probability of class 1 (e.g., >50K) for each sample
p_class1 = probs_test[:, 1]

print("\nProbability analysis for class 1 (>50K):")
for stat_label, stat_value in (
    ("Mean probability:", p_class1.mean()),
    ("Min probability :", p_class1.min()),
    ("Max probability :", p_class1.max()),
):
    print(stat_label, stat_value)

# simple bins: how confident is the model?
# np.histogram closes the last bin on the right, so a probability of exactly
# 1.0 is counted in the final row.
bins = [0.0, 0.5, 0.7, 0.9, 1.0]
counts, bin_edges = np.histogram(p_class1, bins=bins)
total = len(p_class1)
for b1, b2, c in zip(bins, bins[1:], counts):
    print(f"P(class=1) in [{b1:.1f}, {b2:.1f}): {c} samples ({c/total*100:.2f}%)")
