In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# --- Step 1: Load and Preprocess the Data ---
# Reads the raw CSV, one-hot encodes every column, and writes the encoded
# table to 'DATA_processed.csv' in the current working directory.

print("Step 1: Loading and Preprocessing the Data")
print("="*40)

# Load the CSV file
# NOTE(review): the path looks like a mounted Google Drive in Colab --
# presumably Drive must be mounted before this cell runs; confirm.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA.csv')

print(f"Original data loaded. It has {df.shape[0]} rows and {df.shape[1]} columns.")

# Initialize the encoder
# sparse_output=False means it will return a regular NumPy array
# handle_unknown='ignore' will prevent errors if new/unseen data appears
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

print("Starting one-hot encoding...")

# Fit and transform the data
# The whole frame is passed in, so every column is treated as categorical.
df_processed_array = encoder.fit_transform(df)

# Get the new column names
# Names are built from the original column names passed in here; the later
# steps split these names on '_' to recover the original feature.
processed_columns = encoder.get_feature_names_out(df.columns)

# Convert back to a DataFrame
# No index is passed, so the result gets a fresh default index.
df_processed = pd.DataFrame(df_processed_array, columns=processed_columns)

print(f"Data has been preprocessed. New shape: {df_processed.shape}")
print("\nHead of the new, processed data:")
print(df_processed.head())

# Save the processed data to a new CSV file for future use
# Only the encoded table is written; the fitted encoder is not saved by this cell.
df_processed.to_csv('DATA_processed.csv', index=False)

print("\nProcessed data saved to 'DATA_processed.csv'.")
print("--- End of Step 1 ---")

Step 1: Loading and Preprocessing the Data
Original data loaded. It has 19900 rows and 16 columns.
Starting one-hot encoding...
Data has been preprocessed. New shape: (19900, 256)

Head of the new, processed data:
   f1_A  f1_B  f1_C  f1_D  f1_E  f1_F  f1_G  f1_H  f1_I  f1_J  ...  f16_G  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
1   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
2   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
3   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
4   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   

   f16_H  f16_I  f16_J  f16_K  f16_L  f16_M  f16_N  f16_O  f16_P  
0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    1.0  
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
3    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split  # <-- MOVED THIS LINE TO THE TOP

# --- Step 2: Split the Data ---
# Reads the one-hot encoded table written by Step 1, splits it 80/20 into
# train and test sets, and writes both to CSV in the working directory.

print("Step 2: Splitting the Data")
print("="*40)

# Load the processed data from Step 1
try:
    df_processed = pd.read_csv('DATA_processed.csv')
    # If using Google Drive, you might need:
    # df_processed = pd.read_csv('/content/drive/MyDrive/DATA_processed.csv')
except FileNotFoundError:
    print("Error: DATA_processed.csv not found.")
    print("Please make sure you successfully ran Step 1.")
    print("If you are using Google Drive, you may need to update the path.")
    # Stop here: continuing would either fail with a NameError on the next
    # line or silently split a stale `df_processed` left in the kernel.
    raise

print(f"Loaded processed data with shape: {df_processed.shape}")

# Split the data
# We'll use an 80/20 split.
# test_size=0.2 means 20% of data goes to the test set
# random_state=42 ensures you get the same "random" split every time
# you run this. This is good for reproducible results.

train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)

print("\nData has been split.")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape    : {test_df.shape}")

# Save these new dataframes to CSV files
# index=False drops the shuffled row labels, so the files reload with a
# default 0..n-1 index.
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

# If using Google Drive, you might want to save to your Drive:
# train_df.to_csv('/content/drive/MyDrive/train_data.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/test_data.csv', index=False)

print("\nTraining and test data saved to 'train_data.csv' and 'test_data.csv'.")
print("--- End of Step 2 ---")

Step 2: Splitting the Data
Loaded processed data with shape: (19900, 256)

Data has been split.
Training data shape: (15920, 256)
Test data shape    : (3980, 256)

Training and test data saved to 'train_data.csv' and 'test_data.csv'.
--- End of Step 2 ---


In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from tqdm import tqdm
import warnings

# --- New Step 3: INTRAIN for Independent MLPs (with Hyperparameter Tuning) ---

print("Step 3 (Improved): Training Independent MLPs with Hyperparameter Tuning")
print("="*70)

def INTRAIN_independent(train_df, param_grid=None, cv=3):
    """
    Train one tuned MLP per original feature ("independent" models).

    For every original feature, the model's targets are that feature's
    one-hot columns and its inputs are all remaining columns. Each model is
    tuned with GridSearchCV and the refitted best estimator is kept.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One-hot encoded training data whose column names look like
        '<feature>_<category>'. The feature is taken as the text before the
        first underscore.
    param_grid : dict, optional
        Grid handed to GridSearchCV. Defaults to comparing a two-layer
        (50, 50) network against a one-layer (100,) network, each with
        alpha in {0.0001, 0.001}.
    cv : int, optional
        Number of cross-validation folds (default 3).

    Returns
    -------
    dict
        Maps each original feature name to its best fitted estimator, in
        first-seen column order.

    Raises
    ------
    ValueError
        If a column name contains no underscore, because no target columns
        can be derived for it.
    """

    print(f"Loaded training data with shape: {train_df.shape}")

    # Get all column names
    all_cols = train_df.columns.tolist()

    # Fail early with a clear message instead of an obscure error inside fit().
    malformed = [col for col in all_cols if '_' not in str(col)]
    if malformed:
        raise ValueError(
            "Column names must look like '<feature>_<category>'; "
            f"got columns without an underscore: {malformed}"
        )

    # Get the original feature names (e.g., 'f1', 'f2', ..., 'f16').
    # dict.fromkeys de-duplicates while keeping first-seen order.
    original_features = list(dict.fromkeys(col.split('_')[0] for col in all_cols))
    num_features = len(original_features)

    print(f"Found {num_features} original features to tune.")

    # --- Hyperparameter Tuning Setup ---
    if param_grid is None:
        param_grid = {
            'hidden_layer_sizes': [(50, 50), (100,)], # Test a 2-layer vs 1-layer network
            'alpha': [0.0001, 0.001] # Test two different regularization strengths
        }

    # This dictionary will hold the *best* trained model for each feature
    independent_models = {}

    # Report the detected feature count rather than a hard-coded number.
    print(f"\nStarting to train and tune {num_features} separate MLP models...")
    print(f"Parameter grid: {param_grid}")
    print("This will take a long time...")

    # Use tqdm for a progress bar
    for feature in tqdm(original_features, desc="Tuning models"):

        # 1. Select the target columns (Y): this feature's one-hot group
        target_cols = [col for col in all_cols if col.startswith(feature + '_')]

        # 2. Select the input columns (X): everything else, in original order
        target_set = set(target_cols)
        input_cols = [col for col in all_cols if col not in target_set]

        # 3. Create the X_train and y_train
        X_train = train_df[input_cols]
        y_train = train_df[target_cols]

        # 4. Define the base MLP Model
        # We leave out the parameters that we are tuning (hidden_layer_sizes, alpha)
        mlp = MLPClassifier(max_iter=200,
                            random_state=42,
                            early_stopping=True, # Use early stopping for speed
                            n_iter_no_change=10)

        # 5. Define the GridSearchCV
        # n_jobs=-1 uses all your CPU cores to speed up the grid search
        grid_search = GridSearchCV(mlp,
                                   param_grid,
                                   cv=cv,
                                   n_jobs=-1,
                                   verbose=0) # Set verbose=1 for more details

        # 6. Train the model (this runs the search)
        with warnings.catch_warnings():
            # Intended to hide convergence warnings during the search; note
            # that this filter silences every warning category in this block.
            warnings.simplefilter("ignore")
            grid_search.fit(X_train, y_train)

        # 7. Store the *best* trained model
        independent_models[feature] = grid_search.best_estimator_

        # print(f"  Best params for {feature}: {grid_search.best_params_}") # Uncomment for details

    print(f"\nAll {num_features} models have been tuned and trained.")

    # The function returns the dictionary of models
    return independent_models

# --- Main script execution ---
# Driver for Step 3: read the training split, train/tune one model per
# original feature, and persist the resulting dictionary with joblib.

# 1. Load the clean, processed training data
train_data_path = 'train_data.csv'
try:
    train_df = pd.read_csv(train_data_path)
except FileNotFoundError:
    print(f"Error: '{train_data_path}' not found.")
    print("Please make sure you successfully ran Step 2.")
    # Re-raise so the cell stops instead of training on a stale `train_df`.
    raise

# 2. Call our new function to do all the work
# This is the expensive call (a grid search per feature).
independent_models = INTRAIN_independent(train_df)

# 3. Save all trained models to a single file
# The whole {feature: model} dictionary is written as one joblib file.
independent_model_path = 'independent_model_tuned.joblib' # New file name
joblib.dump(independent_models, independent_model_path)

print(f"\nAll tuned models saved to '{independent_model_path}'")
print("--- End of New Step 3 ---")


Step 3 (Improved): Training Independent MLPs with Hyperparameter Tuning
Loaded training data with shape: (15920, 256)
Found 16 original features to tune.

Starting to train and tune 16 separate MLP models...
Parameter grid: {'hidden_layer_sizes': [(50, 50), (100,)], 'alpha': [0.0001, 0.001]}
This will take a long time...


Tuning models: 100%|██████████| 16/16 [34:18<00:00, 128.64s/it]


All 16 models have been tuned and trained.

All tuned models saved to 'independent_model_tuned.joblib'
--- End of New Step 3 ---





In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
import joblib
from tqdm import tqdm

# --- New Step 4: INTRAIN for Conjunct MLP (in a Function) ---

print("Step 4 (Improved): Training the Conjunct MLP Model (in a Function)")
print("="*65)

def INTRAIN_conjunct(train_df, corruption_level=0.2, random_state=None):
    """
    Train the single "conjunct" MLP as a denoising model.

    A copy of the training matrix has randomly chosen cells set to 0; the
    MLP is then fitted to map the corrupted matrix back to the clean one.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One-hot encoded training data.
    corruption_level : float, optional
        Fraction of the matrix size used as the number of random (row,
        column) draws to zero out. Must lie in [0, 1]. Default 0.2.
    random_state : int, optional
        Seed for the corruption draws. When None (default) the global
        NumPy random state is used, so results follow `np.random.seed`.

    Returns
    -------
    MLPClassifier
        The fitted model.

    Raises
    ------
    ValueError
        If `corruption_level` is outside [0, 1].
    """
    if not 0 <= corruption_level <= 1:
        raise ValueError(
            f"corruption_level must be between 0 and 1, got {corruption_level!r}"
        )

    print(f"Loaded training data with shape: {train_df.shape}")

    # --- Create Corrupted Data ---
    print("Creating corrupted (denoising) training data...")
    train_array_clean = train_df.values
    # Work on a copy so the clean targets stay untouched.
    train_array_corrupted = train_array_clean.copy()

    total_cells = train_array_corrupted.size
    num_to_corrupt = int(total_cells * corruption_level)

    # Fall back to the module-level generator so existing callers that rely
    # on np.random.seed() keep working; use a private generator when seeded.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Rows and columns are drawn independently and with replacement, so the
    # same cell can be picked more than once and cells that are already 0
    # can be picked too; `num_to_corrupt` counts draws, not distinct cells.
    rows = rng.randint(0, train_array_corrupted.shape[0], size=num_to_corrupt)
    cols = rng.randint(0, train_array_corrupted.shape[1], size=num_to_corrupt)

    train_array_corrupted[rows, cols] = 0
    # Report the level actually used instead of a fixed "20%".
    print(f"Created corrupted data. {num_to_corrupt} cells ({corruption_level * 100:g}%) were set to 0.")

    X_train_corrupted = train_array_corrupted
    y_train_clean = train_array_clean

    # --- Define and Train the MLP ---
    print("\nStarting to train the single Conjunct MLP...")

    # We use our strong baseline model from the first run.
    # random_state=42 fixes the network initialisation only; the corruption
    # above is controlled separately by `random_state`.
    conjunct_mlp = MLPClassifier(hidden_layer_sizes=(150,),
                                 max_iter=200,
                                 random_state=42,
                                 verbose=True) # Set to True to see training progress

    conjunct_mlp.fit(X_train_corrupted, y_train_clean)

    print("Conjunct MLP has been trained.")

    return conjunct_mlp

# --- Main script execution ---
# Driver for Step 4: read the training split, train the single conjunct
# (denoising) model, and persist it with joblib.

# 1. Load the clean, processed training data
train_data_path = 'train_data.csv'
try:
    train_df = pd.read_csv(train_data_path)
except FileNotFoundError:
    print(f"Error: '{train_data_path}' not found.")
    print("Please make sure you successfully ran Step 2.")
    # Re-raise so the cell stops instead of training on a stale `train_df`.
    raise

# 2. Call our new function to do all the work
# The training corruption is drawn from NumPy's global random state, so seed
# it first; otherwise every re-run trains on different noise and the saved
# model is not reproducible even though the MLP itself uses random_state=42.
np.random.seed(42)
conjunct_model = INTRAIN_conjunct(train_df)

# 3. Save the trained model
conjunct_model_path = 'conjunct_model.joblib' # We can reuse the old name
joblib.dump(conjunct_model, conjunct_model_path)

print(f"Model saved to '{conjunct_model_path}'")
print("--- End of New Step 4 ---")


Step 4 (Improved): Training the Conjunct MLP Model (in a Function)
Loaded training data with shape: (15920, 256)
Creating corrupted (denoising) training data...
Created corrupted data. 815104 cells (20%) were set to 0.

Starting to train the single Conjunct MLP...
Iteration 1, loss = 82.95710611
Iteration 2, loss = 43.78410946
Iteration 3, loss = 36.37634401
Iteration 4, loss = 29.99633228
Iteration 5, loss = 24.83921322
Iteration 6, loss = 20.84660906
Iteration 7, loss = 17.87050956
Iteration 8, loss = 15.69210711
Iteration 9, loss = 14.09324560
Iteration 10, loss = 12.91043338
Iteration 11, loss = 12.00891321
Iteration 12, loss = 11.30851340
Iteration 13, loss = 10.75470058
Iteration 14, loss = 10.30570878
Iteration 15, loss = 9.93443329
Iteration 16, loss = 9.61258242
Iteration 17, loss = 9.34227858
Iteration 18, loss = 9.11465437
Iteration 19, loss = 8.90907418
Iteration 20, loss = 8.72604685
Iteration 21, loss = 8.56620665
Iteration 22, loss = 8.42060261
Iteration 23, loss = 8.284



In [None]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm

# --- New Step 5: Define the QUERY Functions ---

print("Step 5 (Improved): Defining the QUERY functions")
print("="*55)

# --- 1. Helper Function to Create "Broken" Data ---
# This helper pokes holes in a clean dataset to create our `In?ut`:
# the query input in which some features are missing.

def create_broken_data(df, corruption_level=0.2, random_state=None):
    """
    Return a copy of `df` in which randomly chosen features are blanked out.

    A feature is "missing" for a row when all of its one-hot columns are 0.
    (row, feature) pairs are drawn with replacement, so the number of
    distinct blanked pairs can be lower than the number of draws.

    Parameters
    ----------
    df : pandas.DataFrame
        One-hot encoded data whose column names look like
        '<feature>_<category>'. It is not modified.
    corruption_level : float, optional
        Fraction of (rows x original features) used as the number of draws.
        Default 0.2.
    random_state : int, optional
        Seed for the draws. When None (default) the global NumPy random
        state is used, so results follow `np.random.seed`.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as `df`, with the drawn feature groups
        set to 0.
    """
    print(f"\nCreating 'broken' data with {corruption_level*100}% missing values...")

    all_cols = df.columns.tolist()

    # Original feature = text before the first underscore, first-seen order.
    original_features = list(dict.fromkeys(col.split('_')[0] for col in all_cols))

    df_corrupted = df.copy()

    num_samples = df.shape[0]
    num_features = len(original_features)
    num_to_corrupt = int(num_samples * num_features * corruption_level)

    print(f"Total original features to corrupt: {num_to_corrupt}")

    if num_to_corrupt > 0:
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Draw every (row, feature) pair up front instead of one per loop turn.
        row_draws = rng.randint(0, num_samples, size=num_to_corrupt)
        feature_draws = rng.randint(0, num_features, size=num_to_corrupt)

        for feature_idx, feature in enumerate(tqdm(original_features, desc="Poking holes")):
            # Column positions of this feature's one-hot group (computed once
            # per feature rather than once per draw).
            col_positions = [pos for pos, col in enumerate(all_cols)
                             if col.startswith(feature + '_')]
            row_positions = np.unique(row_draws[feature_draws == feature_idx])
            if len(col_positions) == 0 or len(row_positions) == 0:
                continue
            # Positional indexing: the drawn numbers are row positions, not
            # index labels. Label-based .loc would hit the wrong rows (or
            # append new ones) whenever the index is not 0..n-1.
            df_corrupted.iloc[row_positions, col_positions] = 0

    print("Finished creating 'broken' data.")
    return df_corrupted

# --- 2. Define the QUERY Function for Independent MLPs ---

def QUERY_independent(model_dict, input_data):
    """
    Imputes missing values using the (tuned) INDEPENDENT model.

    A feature counts as missing in a row when all of its one-hot columns
    sum to 0. For each feature, every row missing it is predicted in a single
    batched call, using the remaining columns of the *original* input (never
    previously imputed values) as predictors.

    Parameters
    ----------
    model_dict : dict
        Maps each original feature name to a fitted model exposing
        `predict`. Features with no matching columns are skipped.
    input_data : pandas.DataFrame
        The "broken" dataframe (with 0s for missing features). Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `input_data` with the missing feature groups filled in.
    """
    print("\n[Independent] Starting imputation...")

    imputed_data = input_data.copy()
    all_cols = imputed_data.columns.tolist()
    original_features = list(model_dict.keys())

    # The progress bar advances once per feature (one batched predict each).
    for feature in tqdm(original_features, desc="[Independent] Imputing"):
        feature_cols = [col for col in all_cols if col.startswith(feature + '_')]
        if not feature_cols:
            continue

        # Rows where this feature is missing (all its one-hot columns are 0).
        # A plain boolean array avoids any index-alignment surprises.
        missing_mask = (input_data[feature_cols].sum(axis=1) == 0).to_numpy()
        num_missing = int(missing_mask.sum())
        if num_missing == 0:
            continue

        feature_set = set(feature_cols)
        input_cols = [col for col in all_cols if col not in feature_set]

        # Pass a DataFrame slice so the column names travel with the data;
        # feeding bare arrays to a model fitted on a DataFrame makes
        # scikit-learn emit a feature-name warning on every single call.
        x_query = input_data.loc[missing_mask, input_cols]

        # Predict the missing feature for all affected rows at once
        predicted = np.asarray(model_dict[feature].predict(x_query))

        # Fill in the imputed data; reshape guards against a 1-D result when
        # the feature has a single one-hot column.
        imputed_data.loc[missing_mask, feature_cols] = predicted.reshape(
            num_missing, len(feature_cols))

    print("[Independent] Imputation complete.")
    return imputed_data

# --- 3. Define the QUERY Function for Conjunct MLP ---

def QUERY_conjunct(model_path, input_data):
    """
    Imputes missing values using the CONJUNCT model.

    The model predicts the entire matrix in one call, so every cell of the
    result comes from the model, including cells that were not missing.

    Parameters
    ----------
    model_path : str or fitted model
        The filename of the single trained model. An already-loaded object
        exposing `predict` is also accepted and used as-is.
    input_data : pandas.DataFrame
        The "broken" dataframe.

    Returns
    -------
    pandas.DataFrame or None
        Predictions with the same index and columns as `input_data`, or
        None when the model file does not exist.
    """
    print("\n[Conjunct] Starting imputation...")

    if hasattr(model_path, "predict"):
        # Caller handed over a model object; nothing to load.
        model = model_path
    else:
        try:
            # NOTE: joblib files are pickle-based and run arbitrary code on
            # load -- only load model files you created yourself.
            model = joblib.load(model_path)
        except FileNotFoundError:
            print(f"Error: Model file '{model_path}' not found.")
            return None

    print("[Conjunct] Loaded model.")
    input_array = input_data.values

    print("[Conjunct] Predicting/imputing all values...")
    imputed_array = model.predict(input_array)

    # Keep the caller's index so the result lines up row-for-row with
    # `input_data` in later .loc / boolean-mask comparisons.
    imputed_data = pd.DataFrame(imputed_array,
                                index=input_data.index,
                                columns=input_data.columns)

    print("[Conjunct] Imputation complete.")
    return imputed_data

# --- Main script execution (Preparation) ---
# Driver for Step 5/6: read the clean test split, blank out random features
# to build the benchmark input, and save it for the final experiment.

# 1. Load the clean test data
test_data_path = 'test_data.csv'
try:
    test_df_clean = pd.read_csv(test_data_path)
except FileNotFoundError:
    print(f"Error: '{test_data_path}' not found. Please run Step 2.")
    raise

# 2. Create the "broken" test data
#    (We'll save it to use the *exact* same broken file for both models)
# The holes are drawn from NumPy's global random state, so seed it first:
# without this every re-run of the cell produces a different benchmark and
# the reported accuracies cannot be reproduced.
np.random.seed(42)
broken_test_data_path = 'test_data_broken.csv'
test_df_broken = create_broken_data(test_df_clean, corruption_level=0.2)
test_df_broken.to_csv(broken_test_data_path, index=False)

print(f"\nSaved 'broken' test set to '{broken_test_data_path}'.")
print("All QUERY functions are defined.")
print("--- End of New Step 5 & 6 ---")


Step 5 (Improved): Defining the QUERY functions

Creating 'broken' data with 20.0% missing values...
Total original features to corrupt: 12736


Poking holes: 100%|██████████| 12736/12736 [00:14<00:00, 850.82it/s]


Finished creating 'broken' data.

Saved 'broken' test set to 'test_data_broken.csv'.
All QUERY functions are defined.
--- End of New Step 5 & 6 ---


In [None]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm

# --- New Step 7: Run Experiment and Analyze Results (Per-Feature) ---

print("Step 7 (Improved): Running Final Experiment with Per-Feature Analysis")
print("="*70)

# --- 1. Redefine QUERY Functions (to be safe) ---
# (We include them here just in case the notebook was restarted)

def QUERY_independent(model_dict, input_data):
    """
    Imputes missing values using the (tuned) INDEPENDENT model.

    A feature counts as missing in a row when all of its one-hot columns
    sum to 0. For each feature, every row missing it is predicted in a single
    batched call, using the remaining columns of the *original* input (never
    previously imputed values) as predictors.

    Parameters
    ----------
    model_dict : dict
        Maps each original feature name to a fitted model exposing
        `predict`. Features with no matching columns are skipped.
    input_data : pandas.DataFrame
        The "broken" dataframe (with 0s for missing features). Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `input_data` with the missing feature groups filled in.
    """
    print("\n[Independent] Starting imputation...")
    imputed_data = input_data.copy()
    all_cols = imputed_data.columns.tolist()
    original_features = list(model_dict.keys())

    # The progress bar advances once per feature (one batched predict each).
    for feature in tqdm(original_features, desc="[Independent] Imputing"):
        feature_cols = [col for col in all_cols if col.startswith(feature + '_')]
        if not feature_cols:
            continue

        # Rows where this feature is missing (all its one-hot columns are 0).
        # A plain boolean array avoids any index-alignment surprises.
        missing_mask = (input_data[feature_cols].sum(axis=1) == 0).to_numpy()
        num_missing = int(missing_mask.sum())
        if num_missing == 0:
            continue

        feature_set = set(feature_cols)
        input_cols = [col for col in all_cols if col not in feature_set]

        # Pass a DataFrame slice so the column names travel with the data;
        # feeding bare arrays to a model fitted on a DataFrame makes
        # scikit-learn emit a feature-name warning on every single call,
        # which floods the cell output.
        x_query = input_data.loc[missing_mask, input_cols]
        predicted = np.asarray(model_dict[feature].predict(x_query))

        # reshape guards against a 1-D result when the feature has a single
        # one-hot column.
        imputed_data.loc[missing_mask, feature_cols] = predicted.reshape(
            num_missing, len(feature_cols))
    print("[Independent] Imputation complete.")
    return imputed_data

def QUERY_conjunct(model_path, input_data):
    """
    Imputes missing values using the CONJUNCT model.

    The model predicts the entire matrix in one call, so every cell of the
    result comes from the model, including cells that were not missing.

    Parameters
    ----------
    model_path : str or fitted model
        The filename of the single trained model. An already-loaded object
        exposing `predict` is also accepted and used as-is.
    input_data : pandas.DataFrame
        The "broken" dataframe.

    Returns
    -------
    pandas.DataFrame or None
        Predictions with the same index and columns as `input_data`, or
        None when the model file does not exist.
    """
    print("\n[Conjunct] Starting imputation...")
    if hasattr(model_path, "predict"):
        # Caller handed over a model object; nothing to load.
        model = model_path
    else:
        try:
            # NOTE: joblib files are pickle-based and run arbitrary code on
            # load -- only load model files you created yourself.
            model = joblib.load(model_path)
        except FileNotFoundError:
            print(f"Error: Model file '{model_path}' not found.")
            return None

    print("[Conjunct] Loaded model.")
    input_array = input_data.values
    print("[Conjunct] Predicting/imputing all values...")
    imputed_array = model.predict(input_array)
    # Keep the caller's index so the result lines up row-for-row with
    # `input_data` in later .loc / boolean-mask comparisons.
    imputed_data = pd.DataFrame(imputed_array,
                                index=input_data.index,
                                columns=input_data.columns)
    print("[Conjunct] Imputation complete.")
    return imputed_data

# --- 2. Load All Data and Models ---
# Reads the clean and "broken" test sets plus the tuned independent models.
# The conjunct model is loaded later, inside QUERY_conjunct, from its path.

print("\nLoading all data and models for the experiment...")

# Plain assignment: it cannot fail, so it does not belong inside the try.
conjunct_model_path = 'conjunct_model.joblib'

try:
    test_df_clean = pd.read_csv('test_data.csv')
    test_df_broken = pd.read_csv('test_data_broken.csv')
    # NOTE: joblib files are pickle-based -- only load files you created.
    independent_model_dict = joblib.load('independent_model_tuned.joblib')
except FileNotFoundError as e:
    print(f"Error loading file: {e}")
    print("Please ensure all previous steps have been run successfully.")
    # Stop here: carrying on would print "All files loaded." and then fail
    # with a NameError (or silently reuse stale kernel variables).
    raise

print("All files loaded.")

# --- 3. Run QUERY for Both Models ---

# Run Independent Model
imputed_independent_df = QUERY_independent(independent_model_dict, test_df_broken)

# Run Conjunct Model
imputed_conjunct_df = QUERY_conjunct(conjunct_model_path, test_df_broken)

# QUERY_conjunct signals a missing model file by returning None; fail with a
# clear message instead of an AttributeError deep inside the analysis.
if imputed_conjunct_df is None:
    raise FileNotFoundError(
        f"Conjunct model file '{conjunct_model_path}' not found. "
        "Please run Step 4 first."
    )

# --- 4. NEW Detailed Per-Feature Analysis ---
# Scores both imputed tables against the clean test set, looking only at the
# (row, feature) pairs that were blanked out in the broken test set.

print("\n--- Final Analysis (Per-Feature Breakdown) ---")
print("="*70)

# Get the list of original features
# Original feature = text before the first underscore, in first-seen order.
all_cols = test_df_clean.columns.tolist()
original_features = []
for col in all_cols:
    feature_prefix = col.split('_')[0]
    if feature_prefix not in original_features:
        original_features.append(feature_prefix)

# We need to find the *exact* rows/features that were broken
# We do this by checking the sum of the one-hot vectors.
# If the sum is 0 in the broken set, it was a missing feature.
# NOTE(review): assumes every clean row has exactly one 1 per feature, so an
# all-zero group can only come from the corruption step -- confirm.
original_feature_accuracies = []

for feature in original_features:
    feature_cols = [col for col in all_cols if col.startswith(feature + '_')]

    # 1. Find the rows where this feature was missing
    # We get the sum of the one-hot columns for this feature
    broken_sums = test_df_broken[feature_cols].sum(axis=1)
    # A sum of 0 means this feature was missing
    missing_mask = (broken_sums == 0)

    # 2. Get the number of rows we need to check
    total_missing = missing_mask.sum()
    if total_missing == 0:
        # No rows were missing this feature, skip
        # (also avoids dividing by zero below)
        continue

    # 3. Get the "Answer Key" for these rows
    clean_answers = test_df_clean.loc[missing_mask, feature_cols]

    # 4. Get the models' predictions for these same rows
    # The mask is indexed like test_df_broken, so all four frames must share
    # the same row index for these selections to line up.
    independent_preds = imputed_independent_df.loc[missing_mask, feature_cols]
    conjunct_preds = imputed_conjunct_df.loc[missing_mask, feature_cols]

    # 5. Check for correct matches
    # We must compare the DataFrames element by element
    # A row counts as correct only if every one-hot column of the feature
    # matches the clean answer.
    independent_correct = (independent_preds.values == clean_answers.values).all(axis=1).sum()
    conjunct_correct = (conjunct_preds.values == clean_answers.values).all(axis=1).sum()

    # 6. Calculate accuracy for this feature
    acc_ind = independent_correct / total_missing
    acc_con = conjunct_correct / total_missing

    original_feature_accuracies.append({
        "Feature": feature,
        "Ind_Acc": acc_ind,
        "Con_Acc": acc_con,
        "Total_Missing": total_missing
    })

# --- 5. Print the Final Report Table ---

# Convert the results to a DataFrame for easy viewing
results_df = pd.DataFrame(original_feature_accuracies)

print("Per-Feature Imputation Accuracy:")
# NOTE: DataFrame.to_markdown relies on the optional `tabulate` package.
print(results_df.to_markdown(index=False, floatfmt=".2%"))

# --- 6. Print Overall Accuracy (like before) ---

# We can also calculate the total accuracy by averaging the feature accuracies
# (weighted by how many were missing)
# accuracy * count recovers the number of correct rows per feature, so the
# ratio below is total correct / total missing across all features.
total_missing_all = results_df['Total_Missing'].sum()
total_ind_correct = (results_df['Ind_Acc'] * results_df['Total_Missing']).sum()
total_con_correct = (results_df['Con_Acc'] * results_df['Total_Missing']).sum()

overall_acc_ind = total_ind_correct / total_missing_all
overall_acc_con = total_con_correct / total_missing_all

print("\n--- OVERALL RESULTS ---")
print("="*70)
print(f"Independent MLP Accuracy (Tuned): {overall_acc_ind * 100:.2f}%")
print(f"Conjunct MLP Accuracy (Baseline): {overall_acc_con * 100:.2f}%")
print("="*70)

print("\nThis concludes the experiment!")
print("You now have the detailed, per-feature breakdown for your report.")


Step 7 (Improved): Running Final Experiment with Per-Feature Analysis

Loading all data and models for the experiment...
All files loaded.

[Independent] Starting imputation...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Independent] Imputing: 100%|██████████| 3980/3980 [01:24<00:00, 47.21it/s]


[Independent] Imputation complete.

[Conjunct] Starting imputation...
[Conjunct] Loaded model.
[Conjunct] Predicting/imputing all values...
[Conjunct] Imputation complete.

--- Final Analysis (Per-Feature Breakdown) ---
Per-Feature Imputation Accuracy:
| Feature   |   Ind_Acc |   Con_Acc |   Total_Missing |
|:----------|----------:|----------:|----------------:|
| f1        |    51.30% |    31.65% |             692 |
| f2        |    35.58% |    25.74% |             742 |
| f3        |    44.43% |    25.49% |             718 |
| f4        |    54.39% |    42.25% |             684 |
| f5        |    43.65% |    25.00% |             724 |
| f6        |    37.11% |    26.33% |             733 |
| f7        |    38.78% |    26.94% |             735 |
| f8        |    29.06% |    15.66% |             709 |
| f9        |    30.17% |    14.80% |             696 |
| f10       |    37.66% |    22.57% |             709 |
| f11       |    39.34% |    26.04% |             722 |
| f12       |    34