### Inference with saved model

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.manifold import TSNE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Competition test metadata. low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
test_metadata = pd.read_csv(
    "/kaggle/input/isic-2024-challenge/test-metadata.csv",
    low_memory=False,
)

# Artifacts saved by the hyperparameter-tuning run: the fitted feature
# preprocessor and the best MLP classifier.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
preprocessor = joblib.load(
    '/kaggle/input/isic-metadata-pkl-files/preprocessor.pkl'
)
best_mlp = joblib.load(
    '/kaggle/input/isic-metadata-pkl-files/best_mlp_model.pkl'
)

# Preprocess the test data
def preprocess_metadata(metadata, preprocessor=None, is_train=True):
    """Turn a raw metadata frame into a dense numeric feature matrix.

    Columns listed in ``drop_cols`` are removed when present. The removal
    works on a copy, so the caller's frame is never modified.

    Args:
        metadata: pandas DataFrame holding the raw metadata rows.
        preprocessor: ``None`` to build and fit a new ``ColumnTransformer``
            on ``metadata``. Otherwise an already fitted transformer whose
            ``named_transformers_`` contains ``'num_imputer'``,
            ``'num_scaler'`` and ``'cat'``.
        is_train: Unused; kept so existing callers keep working.

    Returns:
        ``(fitted_preprocessor, transformed)`` when ``preprocessor`` is
        ``None``. Otherwise a 2-D array laid out as
        ``[scaled numeric columns | encoded categorical columns]``.

    Raises:
        ValueError: If ``metadata`` lacks a column the fitted preprocessor
            recorded at fit time.
    """
    # Columns specific to training data
    drop_cols = ['mel_mitotic_index', 'mel_thick_mm', 'iddx_1', 'lesion_id', 'iddx_2', 'iddx_full', 'iddx_4', 'iddx_5', 'iddx_3', 'target', 'tbp_lv_dnn_lesion_confidence']

    metadata = metadata.drop(columns=[col for col in drop_cols if col in metadata.columns])

    # Column roles inferred from dtypes. Authoritative only when fitting;
    # at inference they are just a fallback (see below).
    categorical_cols = metadata.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = metadata.select_dtypes(include=['number']).columns.tolist()

    if preprocessor is None:
        # NOTE(review): the imputer and the scaler are separate entries over
        # the same columns, so the fitted ColumnTransformer concatenates
        # their outputs instead of chaining them, while the inference branch
        # below chains them. Layout kept unchanged for compatibility with
        # saved artifacts -- confirm against the training code.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num_imputer', SimpleImputer(strategy='mean'), numerical_cols),
                ('num_scaler', StandardScaler(), numerical_cols),
                ('cat', _dense_one_hot_encoder(), categorical_cols)])

        transformed = preprocessor.fit_transform(metadata)
        return preprocessor, transformed

    # Prefer the column lists recorded at fit time: dtype inference on a new
    # frame can disagree with training (e.g. an all-NaN categorical column is
    # read as float), which would silently misalign the features.
    numerical_cols = _fitted_columns(preprocessor, 'num_imputer', numerical_cols)
    categorical_cols = _fitted_columns(preprocessor, 'cat', categorical_cols)

    missing = [col for col in numerical_cols + categorical_cols
               if col not in metadata.columns]
    if missing:
        raise ValueError(
            f"metadata is missing columns the preprocessor was fitted on: {missing}"
        )

    fitted = preprocessor.named_transformers_

    # Impute, then scale. The imputed values are wrapped back into a frame so
    # the scaler still receives named columns.
    imputed = pd.DataFrame(
        fitted['num_imputer'].transform(metadata[numerical_cols]),
        columns=numerical_cols,
        index=metadata.index,
    )
    metadata_scaled = fitted['num_scaler'].transform(imputed)

    metadata_encoded = fitted['cat'].transform(metadata[categorical_cols])
    if hasattr(metadata_encoded, 'toarray'):
        # An encoder saved with sparse output cannot be hstacked directly.
        metadata_encoded = metadata_encoded.toarray()

    return np.hstack((metadata_scaled, metadata_encoded))


def _fitted_columns(preprocessor, step_name, fallback):
    """Return the column names recorded for ``step_name`` at fit time, else ``fallback``."""
    for entry in getattr(preprocessor, 'transformers_', ()):
        name, columns = entry[0], entry[-1]
        if name != step_name:
            continue
        if isinstance(columns, (list, tuple, pd.Index, np.ndarray)):
            columns = list(columns)
            if all(isinstance(col, str) for col in columns):
                return columns
        # Positional, slice or callable selectors cannot be replayed by name.
        break
    return fallback


def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder on both old and new scikit-learn."""
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')

# Run the saved preprocessing over the test rows, then replace any NaN still
# left in the matrix (np.nan_to_num also maps +/-inf to large finite values).
test_metadata_processed = np.nan_to_num(
    preprocess_metadata(
        test_metadata,
        preprocessor=preprocessor,
        is_train=False,
    )
)

# Apply t-SNE for dimensionality reduction
def apply_tsne(X, n_components=2, random_state=None):
    """Embed ``X`` into ``n_components`` dimensions with t-SNE.

    A new embedding is fitted on ``X`` each time this is called.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``.
        n_components: Dimensionality of the embedding.
        random_state: Seed forwarded to ``TSNE``. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for reproducible
            coordinates.

    Returns:
        The embedded coordinates returned by ``TSNE.fit_transform``.

    Raises:
        ValueError: If ``X`` has fewer than two rows, for which no valid
            perplexity exists.
    """
    n_samples = X.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 samples, got {n_samples}"
        )

    tsne = TSNE(
        n_components=n_components,
        # Perplexity must stay below the sample count, so cap it for tiny inputs.
        perplexity=min(30, n_samples - 1),
        random_state=random_state,
    )
    return tsne.fit_transform(X)

# Project the processed test features down to 2-D with t-SNE.
# NOTE(review): apply_tsne fits a new embedding on these rows; confirm that
# best_mlp was trained on coordinates comparable to it.
test_X_reduced = apply_tsne(test_metadata_processed)

# Keep column 1 of predict_proba as the per-row score.
test_predictions = best_mlp.predict_proba(test_X_reduced)[:, 1]

# Quick visual sanity check of the scores.
print(test_predictions)

# One row per test image: its id and the predicted score.
submission = pd.DataFrame(
    {
        'isic_id': test_metadata['isic_id'],
        'target': test_predictions,
    }
)

# Written to the working directory without the index column.
submission.to_csv("submission.csv", index=False)
submission

[0.9017287 0.0043425 0.9868462]
