In [3]:
import pandas as pd
import joblib

def load_and_prepare_data(features_path: str, include_labels: bool = False) -> pd.DataFrame:
    """
    Read the features CSV and keep only the columns used for modelling.

    Parameters:
        features_path (str): Location of the features CSV file.
        include_labels (bool): When true, the 'cancer' column is appended
            after the feature columns in the returned DataFrame.

    Returns:
        pd.DataFrame: 'img_id' followed by the feature columns, plus 'cancer'
        as the last column when include_labels is set.

    Raises:
        KeyError: If any of the selected columns is absent from the CSV.
    """
    # Feature columns fed to the model; this list has to stay in sync with
    # the columns the model was trained on.
    feature_columns = [
        'hue',
        'saturation',
        'value',
        'hsv_uniformity',
        'compactness_score',
        'Vertical Asymmetry_mean',
        'Horizontal Asymmetry_mean',
    ]

    # Identifier first, then features, then (optionally) the label.
    selected_columns = ['img_id', *feature_columns]
    if include_labels:
        selected_columns.append('cancer')

    frame = pd.read_csv(features_path)
    return frame[selected_columns]

def predict_cancer_likelihood(model_path: str, data_path: str) -> pd.DataFrame:
    """
    Score every row of a features CSV with a previously saved model.

    Parameters:
        model_path (str): Location of the serialized model, read with joblib.load.
        data_path (str): Location of the features CSV; it must also carry the
            'cancer' label column because labels are requested from the loader.

    Returns:
        pd.DataFrame: Columns 'img_id', 'cancer_likelihood' and 'cancer',
        one row per row of the input data.
    """
    # NOTE(review): joblib.load unpickles the file, so only point this at
    # model files from a trusted source.
    classifier = joblib.load(model_path)

    labeled = load_and_prepare_data(data_path, include_labels=True)

    # Everything except the identifier and the label is a model input.
    model_inputs = labeled.drop(columns=['img_id', 'cancer'])

    # Take column index 1 of the probability matrix as the cancer score.
    # NOTE(review): this assumes the model's classes are ordered [0, 1] — confirm.
    positive_scores = classifier.predict_proba(model_inputs)[:, 1]

    return pd.DataFrame({
        'img_id': labeled['img_id'],
        'cancer_likelihood': positive_scores,
        'cancer': labeled['cancer'],
    })

# Example usage: score every row of the features CSV with the saved model.
# NOTE(review): both paths are relative, so they resolve against the kernel's
# current working directory.
model_path = 'features/training/final_model'
data_path = 'features/features.csv'
results = predict_cancer_likelihood(model_path, data_path)
print(results)


                     img_id  cancer_likelihood  cancer
0      PAT_104_1754_276.png           0.662003       0
1      PAT_104_1755_320.png           0.652769       0
2       PAT_108_162_526.png           0.609628       0
3       PAT_108_162_660.png           0.585026       0
4       PAT_108_424_944.png           0.539339       0
...                     ...                ...     ...
1267  PAT_2066_4417_201.png           0.524420       0
1268  PAT_2085_4506_726.png           0.491896       0
1269  PAT_2092_4537_438.png           0.528571       0
1270  PAT_2152_4781_332.png           0.248778       0
1271  PAT_2155_4784_483.png           0.254867       0

[1272 rows x 3 columns]


In [4]:
def calculate_and_print_accuracy(results: pd.DataFrame, threshold: float = 0.50) -> None:
    """
    Calculates and prints the overall accuracy of thresholded predictions.

    A row is predicted as cancer when its likelihood is strictly greater than
    `threshold`; accuracy is the fraction of rows whose prediction equals the
    actual 'cancer' label.

    Side effect: a 'predicted_cancer' column (0/1) is added to, or overwritten
    in, the `results` DataFrame that was passed in.

    Parameters:
        results (pd.DataFrame): A DataFrame with columns 'cancer_likelihood' and
            'cancer' indicating predicted probabilities and actual outcomes.
        threshold (float): Likelihood cut-off for a positive prediction.
            Defaults to 0.50.

    Raises:
        KeyError: If 'cancer_likelihood' or 'cancer' is missing. In that case
            `results` is left unmodified.
    """
    # Convert probabilities to binary predictions based on the threshold
    predicted = (results['cancer_likelihood'] > threshold).astype(int)

    # Calculate the accuracy before touching `results`, so a missing 'cancer'
    # column cannot leave the caller's frame half-updated.
    accuracy = (predicted == results['cancer']).mean()

    results['predicted_cancer'] = predicted

    print(f"Overall Accuracy (Threshold > {threshold:.2f}): {accuracy:.2%}")

# Example usage: report accuracy for the `results` DataFrame returned by
# predict_cancer_likelihood. Note that the call also adds a
# 'predicted_cancer' column to `results`.
calculate_and_print_accuracy(results)

Overall Accuracy (Threshold > 0.50): 67.06%
