# PISA 2022 Amazon SageMaker KNN

More info on SageMaker Immersion Day: [Workshop Link](https://catalog.us-east-1.prod.workshops.aws/workshops/63069e26-921c-4ce1-9cc7-dd882ff62575/en-US/lab2-model-training/pro-code)


### ***Change country name below!***

In [1]:
country_name = 'United_States'

In [2]:
country_name_edited = country_name.replace("_", "-")

In [3]:
# cell 02
import sagemaker
bucket=sagemaker.Session().default_bucket()
prefix = 'sagemaker/knn-'+country_name_edited
 
# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


Now let's bring in the Python libraries that we'll use throughout the analysis

In [4]:
# cell 03
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker 
import zipfile     # Amazon SageMaker's Python SDK provides many helper functions

#### Download PISA 2022 Prepared Dataset

This is our dataset output from our cleaned notebook [here](https://7z4vtvpqcoxouiu.studio.us-west-2.sagemaker.aws/jupyterlab/default/lab/tree/RTC%3Amids-capstone/notebooks/eda/Data_merging.ipynb)


In [None]:
%%time 

# cell 06

# Define local file path
local_file_path = "PISA_cleaned_dataset.csv"  # Change as needed

# Define S3 details
bucket_name = "sagemaker-us-west-2-986030204467"
file_key = "capstone/testfiles/PISA_cleaned_dataset.csv"

# Check if the file exists locally
if os.path.exists(local_file_path):
    print("📂 Loading data from local file...")
    data = pd.read_csv(local_file_path, usecols=None)
    
else:
    print("☁️ Downloading data from S3...")
    
    # Create S3 client
    s3_client = boto3.client("s3")

    # Download the file from S3
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)

    # Read the file into pandas DataFrame
    data = pd.read_csv(response["Body"], usecols=None)

    # Save a local copy for future use
    data.to_csv(local_file_path, index=False)
    print(f"✅ File saved locally as {local_file_path}")

# Display first few rows
#data.head()

pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 20)         # Keep the output on one page
data

☁️ Downloading data from S3...


#### Download dictionary for the variable names

In [None]:
# Download the file from S3
s3_client = boto3.client("s3")
dictionary_file = s3_client.get_object(Bucket=bucket_name, Key="capstone/testfiles/Variable_dictionary.csv")

# Read the file into pandas DataFrame
dictionary = pd.read_csv(dictionary_file["Body"], usecols=None)

#### Subset the data to a specific COUNTRY

In [None]:
model_data = data[data['CNT'] == country_name]
print(model_data.shape)
model_data.head()

#### Take out additional variables

In [None]:
# Define the list of columns to drop
columns_to_remove = ["CNT", "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR", "CURIOAGR", 
    "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR", "INFOSEEK", "FAMSUP", 
    "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA", "EXPO21ST", "MATHEFF", "MATHEF21", 
    "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF", "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", 
    "CREATOP", "OPENART", "IMAGINE", "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", 
    "SDLEFF", "ICTRES", "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", 
    "ACCESSFA", "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ", 
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC", "BODYIMA", 
    "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP", "PQMIMP", "PQMCAR", 
    "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME", "CREATACT", "CREATOPN", 
    "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD", "ENCOURPG", "DIGDVPOL", "TEAFDBK", 
    "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT", "EDUSHORT", "STUBEHA", "TEACHBEHA", 
    "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC", "CREENVSC", "ACTCRESC", "OPENCUL", 
    "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP", 
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH", "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]

# Drop the columns above
model_data = model_data.drop(columns=columns_to_remove, errors='ignore')  # `errors='ignore'` prevents errors if a column isn't found


In [None]:
print(model_data.shape)
model_data.head()

Amazon SageMaker's built-in k-NN algorithm accepts training data in the CSV or RecordIO-protobuf data format.  **Note that for CSV the first column must be the target variable and the file should not include headers.**  Although repetitive, it's easiest to do this after the train|validation|test split rather than before.  This avoids any misalignment issues due to random reordering.
* `MATH_Proficient`: Is the student falling behind in Math? (Average of 10 Math plausible values < 420.07)

In [None]:
# Get percent of students not proficient in Math
proficient_n = (model_data['MATH_Proficient'] == 1).sum()
not_proficient_n = (model_data['MATH_Proficient'] == 0).sum()
not_proficient_p = round( not_proficient_n / (not_proficient_n + proficient_n) * 100, 1)
print("Students who are NOT proficient in Math: ", not_proficient_n, "(", not_proficient_p, "%)")

In [None]:
# Get imbalance ratio
# Fraction (0-1) of students who are NOT proficient in Math.
not_proficient_pp = not_proficient_n / (not_proficient_n + proficient_n)

# The ratio is always majority share / minority share, so it is >= 1
# whichever class happens to be the larger one.
proficient_pp = 1 - not_proficient_pp
imbalance_ratio = (
    proficient_pp / not_proficient_pp
    if not_proficient_pp < 0.5
    else not_proficient_pp / proficient_pp
)

print("Imbalance ratio:", round(imbalance_ratio,1))

In [None]:
# Reorder columns to bring 'MATH_Proficient' first
new_order = ['MATH_Proficient'] + [col for col in model_data.columns if col != 'MATH_Proficient']
model_data = model_data[new_order]

# Get number of features
n_features_original = model_data.shape[1]-1

# Check the shape after dropping
print(model_data.shape)

model_data.head()

#### Drop columns with more than 20% missing values

***The code below is commented out because KNN might be able to work with datasets that contain missing values (as XGBoost can). If training fails because the algorithm cannot handle missing values (which is what happened with linear learner), uncomment and run the cells below.***

In [None]:
#model_data.dropna(thresh=int(0.8 * len(model_data)), axis=1, inplace=True)
#print(model_data.shape)

In [None]:
#n_features_final = model_data.shape[1]-1
#print("Number of features (before dropping features with more than 20% missing):", n_features_original)
#print("Number of features (after dropping features with more than 20% missing):", n_features_final)
#print("Number of features with more than 20% missing:", n_features_original - n_features_final)

#### For columns with less than 20% missing values, fill missing values with the median value of the column

In [None]:
#model_data.fillna(model_data.median(), inplace=True)

We'll randomly split the data into 3 uneven groups.  **The model will be trained on 70% of data, it will then be evaluated on 15% of data to give us an estimate of the accuracy we hope to have on "new" data, and 15% will be held back as a final testing dataset which will be used later on.**

A seed is included in the code so the splits can be replicated!

In [None]:
# cell 12
# Shuffle the rows once with a fixed seed, then cut the shuffled frame into
# consecutive slices: first 70% (train), next 15% (validation), last 15% (test).
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.85 * len(shuffled_data))

# Positional slicing is used instead of np.split(): np.split() on a DataFrame
# goes through DataFrame.swapaxes(), which pandas has deprecated. The cut
# points are the same ones np.split() was given, so the three frames are
# unchanged.
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [None]:
print("Number of rows in FULL dataset:", model_data.shape[0])

train_data_percent = round(train_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in TRAINING dataset:", train_data.shape[0], "(", train_data_percent, "% )")

validation_data_percent = round(validation_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in VALIDATION dataset:", validation_data.shape[0], "(", validation_data_percent, "% )")

test_data_percent = round(test_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in TEST dataset:", test_data.shape[0], "(", test_data_percent, "% )")

In [None]:
# Save train dataset 
train_data.to_csv('train.csv', index=False, header=False)

# Save validation dataset 
validation_data.to_csv('validation.csv', index=False, header=False)


In [None]:
# Training data - Saved later to S3 as CSV
print(train_data.shape)
train_data.head()

In [None]:
# Validation data - Saved later to S3 as CSV
print(validation_data.shape)
validation_data.head()

In [None]:
# Test data - NOT SAVED TO S3
print(test_data.shape)
test_data.head()

Now we'll copy the file to S3 for Amazon SageMaker's managed training to pickup.

In [None]:
# cell 14
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

## Training 

***In the code below, you should change "xgboost" to something that works for knn***

In [None]:

import sagemaker
from sagemaker import get_execution_role

# Set up SageMaker role and session
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Retrieve the built-in SageMaker KNN image
container = sagemaker.image_uris.retrieve(
    framework='knn', 
    region=boto3.Session().region_name
)


Then, because we're training with the CSV file format, we'll create `s3_input`s that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV.

***In the code below, you might have to change "text/csv" to something else, depending on how knn works***

In [None]:

s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket, prefix), 
    content_type='text/csv'
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation'.format(bucket, prefix), 
    content_type='text/csv'
)


***In the code below, you should change "linear-learner" to something that works for knn***

In [None]:

import sagemaker
from sagemaker import get_execution_role

# Set up SageMaker role and session
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Retrieve the built-in SageMaker KNN image
container = sagemaker.image_uris.retrieve(
    framework='knn', 
    region=boto3.Session().region_name
)

# Build the estimator that the following cells configure
# (`knn_estimator.set_hyperparameters(...)`) and tune. Model artifacts are
# written under the same bucket/prefix that holds the training data.
knn_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)

# Number of predictor columns: every column of the training frame except the
# target, which was moved to the first position before the split.
feature_dim = train_data.shape[1] - 1


In [None]:

knn_estimator.set_hyperparameters(
    k=10,  # Number of nearest neighbors
    sample_size=5000,  # Size of the sample used for training
    predictor_type='classifier',  # 'classifier' or 'regressor'
    feature_dim=feature_dim,  # Number of features
    index_metric='COSINE'  # Distance metric
)


#### Use auto-tuning to find best hyperparameters

***In the code below, change the hyperparameters to something that is relevant to KNN***

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# The previous ranges (mini_batch_size, learning_rate, wd, l1) belong to the
# linear learner algorithm. The built-in k-NN algorithm exposes `k` and
# `sample_size` for automatic model tuning, both as integer ranges.

# Sample at most as many points as there are training rows; 256 is the
# documented minimum for `sample_size`.
# NOTE(review): confirm how the algorithm behaves when the training set has
# fewer than 256 rows.
sample_size_upper_limit = max(256, train_data.shape[0])

# The upper bound for k is a budget choice; the service allows larger values.
hyperparameter_ranges = {'k': IntegerParameter(1, 64),
                         'sample_size': IntegerParameter(256, sample_size_upper_limit)}


***In the code below, you might have to change "validation:roc_auc_score" to something else that works for KNN***

In [None]:
# Tune the estimator used everywhere else in this notebook (`knn_estimator`);
# the bare name `knn` is never defined.
# k-NN does not emit `validation:roc_auc_score`. For a classifier the tuning
# objective is `test:accuracy`, which is computed on the data passed in the
# `test` channel when the tuning job is started.
tuner = HyperparameterTuner(estimator=knn_estimator,
                            objective_metric_name='test:accuracy',
                            objective_type='Maximize',
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=50,
                            max_parallel_jobs=5)

# May need to adjust number of jobs depending on budget!

In [None]:

# Pointers to the CSV files uploaded earlier (target in the first column,
# no header row).
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket, prefix), 
    content_type='text/csv'
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation'.format(bucket, prefix), 
    content_type='text/csv'
)

# Launch the tuning job. Without this call `tuner.latest_tuning_job` is unset
# and the status / best-job cells that follow cannot run.
# k-NN scores held-out data supplied through the `test` channel, so the
# validation split is passed under that channel name.
tuner.fit({'train': s3_input_train, 'test': s3_input_validation})


In [None]:
# cell 26
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

In [None]:
# cell 27
# Return the best training job name
best_training_job = tuner.best_training_job()
print("Best training job:", best_training_job)

In [None]:

knn_estimator.set_hyperparameters(
    k=10,  # Number of nearest neighbors
    sample_size=5000,  # Size of the sample used for training
    predictor_type='classifier',  # 'classifier' or 'regressor'
    feature_dim=feature_dim,  # Number of features
    index_metric='COSINE'  # Distance metric
)


## Deploy the model (the best model identified by HyperparameterTuner)

In [None]:

# Deploy the best model found by the tuning job. `knn_estimator` itself is
# never fitted in this notebook, so deploying it directly cannot work;
# HyperparameterTuner.deploy() hosts the best training job instead.
knn_predictor = tuner.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge'
)


In [None]:

from sagemaker.serializers import CSVSerializer

knn_predictor.serializer = CSVSerializer()


Now, we'll use a simple function to:
1. Loop over our test dataset
1. Split it into mini-batches of rows 
1. Convert those mini-batches to CSV string payloads (notice, we drop the target variable from our dataset first)
1. Retrieve mini-batch predictions by invoking the k-NN endpoint
1. Collect predictions and convert from the JSON output our model provides into a NumPy array

In [None]:
# Get the raw prediction output (target column removed before sending)
raw_predictions = knn_predictor.predict(test_data.drop(['MATH_Proficient'], axis=1).to_numpy())

# Decode and parse JSON
parsed_predictions = json.loads(raw_predictions.decode("utf-8"))

# Extract one value per row. The k-NN classifier reports its result under
# "predicted_label"; "score" is the key used by linear learner and is kept
# as the first choice so responses in that format still parse.
predictions = np.array([
    pred["score"] if "score" in pred else pred["predicted_label"]
    for pred in parsed_predictions["predictions"]
])


In [None]:
# Save the real values for the test set
real_values = test_data['MATH_Proficient']
real_values.to_csv('real_values.csv', index=False, header=False)

# Save the predicted values for the test set
predicted_values_full = predictions
predicted_values_full = pd.DataFrame(predicted_values_full, columns=['Predicted Values'])
predicted_values_full.to_csv('predicted_values_full.csv', index=False, header=False)

In [None]:
# Clean up
knn_predictor.delete_endpoint(delete_endpoint_config=True)

## Explain the trained model using Clarify

In [None]:
from datetime import datetime

session = sagemaker.Session()

model_name = "Clarify-{}-{}".format(country_name_edited, datetime.now().strftime("%d-%m-%Y-%H-%M-%S"))

best_model = sagemaker.estimator.Estimator.attach(best_training_job)  # Attach the best training job

model = best_model.create_model(name=model_name)  # Create a model from the best job

container_def = model.prepare_container_def()

session.create_model(model_name, role, container_def)

In [None]:
test_features = test_data.drop(["MATH_Proficient"], axis=1)
test_target = test_data["MATH_Proficient"]
test_features.to_csv("test_features.csv", index=False, header=False)

***In the code below, you might have to change "text/csv" to something else that works for KNN***

In [None]:
from sagemaker import clarify

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, instance_count=1, instance_type="ml.m5.2xlarge", sagemaker_session=session
)

model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.large",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

In [None]:
from sagemaker.s3 import S3Downloader

# Download data from S3 to local instance
local_path = S3Downloader.download('s3://{}/{}/train'.format(bucket, prefix), './tmp/train_data')

In [None]:
# Load and sample
full_data = pd.read_csv('./tmp/train_data/train.csv', header=None)
n = min(3000, len(full_data))  
# Seeded so the Clarify sample can be reproduced (same seed as the
# train/validation/test split). If full_data has fewer than n rows, all of
# them are used.
sampled_data = full_data.sample(n=n, random_state=1729)

# Save sampled data back to S3
sampled_path = 'sampled_train_data.csv'
# header=False: the frame was read with header=None, so its column labels are
# just 0..N-1. Writing them would add a numeric first line to a file whose
# column names are supplied separately to Clarify, and that line would
# presumably be read as a data row.
sampled_data.to_csv(sampled_path, index=False, header=False)

from sagemaker.s3 import S3Uploader
sampled_s3_uri = S3Uploader.upload(sampled_path, 's3://{}/{}/sampled_train'.format(bucket, prefix))

In [None]:
print(sampled_data.shape)
sampled_data.head()

***In the code below, you might have to change "text/csv" to something else that works for KNN***

In [None]:
shap_config = clarify.SHAPConfig(
    baseline=[test_features.iloc[0].values.tolist()],
    num_samples=3000,  
    agg_method="mean_abs",
    save_local_shap_values=True
)

explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)

explainability_data_config = clarify.DataConfig(
    #s3_data_input_path='s3://{}/{}/train'.format(bucket, prefix),
    s3_data_input_path=sampled_s3_uri,
    s3_output_path=explainability_output_path,
    label='MATH_Proficient',
    headers=train_data.columns.to_list(),
    dataset_type="text/csv",
)

In [None]:
# Set logging level for 'sagemaker.clarify' to WARNING (hides INFO messages)
import logging

logging.getLogger("sagemaker.clarify").setLevel(logging.WARNING)

clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config
)

## Train the model again with the top 20 predictors
#### Get the list of top 20 predictors

In [None]:
# `bucket` and `prefix` must be the same values that were used to build
# explainability_output_path, e.g.
# bucket = "your-bucket-name"
# prefix = "your-prefix"

# Location of the Clarify analysis report inside the bucket
key = f"{prefix}/clarify-explainability/analysis.json"

# Fetch the report from S3 and parse it
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket, Key=key)
content = response["Body"].read().decode("utf-8")
report = json.loads(content)

# Global SHAP values: one entry per feature name
global_shap = report["explanations"]["kernel_shap"]["label0"]["global_shap_values"]

# Rank every feature from largest to smallest SHAP value, keep the first 20
ranked_shap = sorted(global_shap.items(), key=lambda pair: pair[1], reverse=True)
top_20 = ranked_shap[:20]

# Names only, in ranked order
top_20_features = [name for name, _ in top_20]

# Print
print("Top 20 features with the highest mean absolute SHAP values:")
for feature in top_20_features:
    print(feature)


In [None]:
# Make a subset of the training dataset (with only 20 predictors)
variables_to_keep = ["MATH_Proficient"] + top_20_features
train_data_small = train_data[variables_to_keep]
print(train_data_small.shape)
train_data_small.head()

In [None]:
# Save train dataset 
train_data_small.to_csv('train_small.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train_small/train_small.csv')).upload_file('train_small.csv')

In [None]:
# Make a subset of the validation dataset (with only 20 predictors)
validation_data_small = validation_data[variables_to_keep]
print(validation_data_small.shape)
validation_data_small.head()

In [None]:
# Save validation dataset 
validation_data_small.to_csv('validation_small.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation_small/validation_small.csv')).upload_file('validation_small.csv')

#### Train the model using the hyperparameters from the best model

***In the code below, you should change "xgboost" to something else that works for KNN***

In [None]:

import sagemaker
from sagemaker import get_execution_role

# Set up SageMaker role and session
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Retrieve the built-in SageMaker KNN image
container = sagemaker.image_uris.retrieve(
    framework='knn', 
    region=boto3.Session().region_name
)


***In the code below, you might have to change "text/csv" to something else that works for KNN***

In [None]:

# Point the channels at the reduced (top-20 predictor) files uploaded above
# under `train_small/` and `validation_small/`. The full `train/` and
# `validation/` prefixes contain every predictor and do not match the
# 20-feature rows sent to the reduced model at prediction time.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train_small'.format(bucket, prefix), 
    content_type='text/csv'
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation_small'.format(bucket, prefix), 
    content_type='text/csv'
)


In [None]:

# Separate estimator for the reduced model. The deploy cell below uses the
# name `knn_small`, which was never defined; the original code reconfigured
# `knn_estimator` here instead.
knn_small = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path='s3://{}/{}/output_small'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)

knn_small.set_hyperparameters(
    k=10,  # Number of nearest neighbors
    sample_size=5000,  # Size of the sample used for training
    predictor_type='classifier',  # 'classifier' or 'regressor'
    feature_dim=len(top_20_features),  # Only the selected predictors are present
    index_metric='COSINE'  # Distance metric
)

# Train on the reduced files. The S3 locations are spelled out here so this
# cell always reads the `*_small` uploads; held-out data goes in the `test`
# channel, which is the channel k-NN evaluates.
knn_small.fit({
    'train': sagemaker.inputs.TrainingInput(
        s3_data='s3://{}/{}/train_small'.format(bucket, prefix),
        content_type='text/csv'
    ),
    'test': sagemaker.inputs.TrainingInput(
        s3_data='s3://{}/{}/validation_small'.format(bucket, prefix),
        content_type='text/csv'
    ),
})


## Deploy the model

In [None]:
test_data_small = test_data[variables_to_keep]

In [None]:
# cell 18
knn_small_predictor = knn_small.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

In [None]:
# cell 19
knn_small_predictor.serializer = sagemaker.serializers.CSVSerializer()

Now, we'll use a simple function to:
1. Loop over our test dataset
1. Split it into mini-batches of rows 
1. Convert those mini-batches to CSV string payloads (notice, we drop the target variable from our dataset first)
1. Retrieve mini-batch predictions by invoking the k-NN endpoint
1. Collect predictions and convert from the JSON output our model provides into a NumPy array

In [None]:
# Get the raw prediction output (target column removed before sending)
raw_predictions_small = knn_small_predictor.predict(test_data_small.drop(['MATH_Proficient'], axis=1).to_numpy())

# Decode and parse JSON
parsed_predictions_small = json.loads(raw_predictions_small.decode("utf-8"))

# Extract one value per row. The k-NN classifier reports its result under
# "predicted_label"; "score" is the key used by linear learner and is kept
# as the first choice so responses in that format still parse.
predictions_small = np.array([
    pred["score"] if "score" in pred else pred["predicted_label"]
    for pred in parsed_predictions_small["predictions"]
])

In [None]:
# Save the predicted values for the test set
predicted_values_small = predictions_small
predicted_values_small = pd.DataFrame(predicted_values_small, columns=['Predicted Values'])
predicted_values_small.to_csv('predicted_values_small.csv', index=False, header=False)

In [None]:
# Clean up
knn_small_predictor.delete_endpoint(delete_endpoint_config=True)

## Summary

#### Number of students not proficient in Math

In [None]:
#print("Students who are proficient: ", proficient_n)
print("Students who are NOT proficient in Math: ", not_proficient_n, "(", not_proficient_p, "%)")

#### Model performance (model with all the predictors)

In [None]:
suggested_threshold = (100 - not_proficient_p)/100
print("Suggested threshold:", round(suggested_threshold, 2))

***Adjust the threshold for the FINAL PREDICTIONS if necessary!!***

The model will predict a student as Math proficient if the predicted value is at or above this threshold. (If the threshold is above 0.5, it will reduce the number of students predicted as "Math proficient", both among students who are actually proficient and among those who are not proficient in Math.)

In [None]:
threshold = 0.68

print("Threshold:", threshold)

In [None]:
import pandas as pd
import numpy as np

# Read in the real values
real_values = pd.read_csv('real_values.csv', usecols=[0], header=None)
real_values = real_values.values.ravel()

# Read in the predicted values (using the full model)
predicted_values_full = pd.read_csv('predicted_values_full.csv', usecols=[0], header=None)
predicted_values_full = predicted_values_full.values.ravel()

In [None]:
# Turn the predicted values into hard 0/1 labels at the chosen threshold.
predicted_labels_full = (predicted_values_full >= threshold).astype(int)

cm = pd.crosstab(index=real_values, 
                 columns=predicted_labels_full, 
                 rownames=['actuals'], 
                 colnames=['predictions'])

# Force a full 2x2 table. crosstab only creates rows/columns for values that
# actually occur, so if every prediction lands in one class the lookups
# below would otherwise raise KeyError.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

TN = cm.loc[0, 0]
FP = cm.loc[0, 1]
FN = cm.loc[1, 0]
TP = cm.loc[1, 1]

# All metrics are percentages; each falls back to 0 when its denominator is 0.
total = TP + TN + FP + FN
accuracy = (TP + TN) / total * 100 if total > 0 else 0
precision = TP / (TP + FP) * 100 if (TP + FP) > 0 else 0
recall = TP / (TP + FN) * 100 if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
specificity = TN / (TN + FP) * 100 if (TN + FP) > 0 else 0

print("MODEL USING ALL FEATURES \n")
print(cm)

print("\nAccuracy: {:.1f}".format(accuracy))
print("F1 Score: {:.1f}".format(f1_score))
print("Precision: {:.1f}".format(precision))
print("Recall: {:.1f}".format(recall))
print("Specificity: {:.1f}".format(specificity))

### Model performance (model with 20 predictors)

In [None]:
# Read in the predicted values (using 20 predictors)
predicted_values_small = pd.read_csv('predicted_values_small.csv', usecols=[0], header=None)
predicted_values_small = predicted_values_small.values.ravel()

In [None]:
# Turn the predicted values into hard 0/1 labels at the chosen threshold.
predicted_labels_small = (predicted_values_small >= threshold).astype(int)

cm_small = pd.crosstab(index=real_values, 
                       columns=predicted_labels_small, 
                       rownames=['actuals'], 
                       colnames=['predictions'])

# Force a full 2x2 table. crosstab only creates rows/columns for values that
# actually occur, so if every prediction lands in one class the lookups
# below would otherwise raise KeyError.
cm_small = cm_small.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

TN_small = cm_small.loc[0, 0]
FP_small = cm_small.loc[0, 1]
FN_small = cm_small.loc[1, 0]
TP_small = cm_small.loc[1, 1]

# All metrics are percentages; each falls back to 0 when its denominator is 0.
total_small = TP_small + TN_small + FP_small + FN_small
accuracy_small = (TP_small + TN_small) / total_small * 100 if total_small > 0 else 0
precision_small = TP_small / (TP_small + FP_small) * 100 if (TP_small + FP_small) > 0 else 0
recall_small = TP_small / (TP_small + FN_small) * 100 if (TP_small + FN_small) > 0 else 0
f1_score_small = 2 * (precision_small * recall_small) / (precision_small + recall_small) if (precision_small + recall_small) > 0 else 0
specificity_small = TN_small / (TN_small + FP_small) * 100 if (TN_small + FP_small) > 0 else 0

print("MODEL USING 20 FEATURES \n")
print(cm_small)

print("\nAccuracy: {:.1f}".format(accuracy_small))
print("F1 Score: {:.1f}".format(f1_score_small))
print("Precision: {:.1f}".format(precision_small))
print("Recall: {:.1f}".format(recall_small))
print("Specificity: {:.1f}".format(specificity_small))

#### Top 20 features

In [None]:
pd.set_option('display.max_colwidth', None)
from IPython.display import display, Markdown

# Filter the DataFrame to only include rows where Variable_name is in top_20_features
top_20_dictionary = dictionary[dictionary["Variable_name"].isin(top_20_features)]
top_20_table = top_20_dictionary.set_index("Variable_name").loc[top_20_features].reset_index()
display(Markdown(top_20_table.to_markdown()))