In [1]:
from xgboost import XGBClassifier

# Seed intended for stochastic steps. In the visible cells it is referenced
# only by the commented-out XGBClassifier setup; the split and SMOTE cells
# pass the literal 42 instead.
random_state = 42

In [2]:
%%time

import pandas as pd

train = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv')
test = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv')
sample_submission = pd.read_csv('/kaggle/input/isic-2024-challenge/sample_submission.csv')

print("train shape -", train.shape)
print("test shape - ", test.shape)



train shape - (401059, 55)
test shape -  (3, 44)
CPU times: user 4.6 s, sys: 504 ms, total: 5.11 s
Wall time: 7.28 s


In [3]:
# Column index of the test metadata; the next cell turns it into a list and
# adds "target".
cols = test.columns
# Do not write `cols = list(cols).append("target")`: list.append returns None,
# so that assignment would leave cols set to None.

In [4]:
# Columns to keep from the training frame: every test column followed by the
# label column "target".
col_names = [*cols, "target"]
print(col_names)

['isic_id', 'patient_id', 'age_approx', 'sex', 'anatom_site_general', 'clin_size_long_diam_mm', 'image_type', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license', 'target']


In [5]:

# Restrict the training frame to the columns listed in col_names: the test
# columns plus the 'target' label. All other training columns are dropped.
train = train.loc[:, col_names]

In [6]:
%%time

# Check for missing values in each dataset
missing_values_train = train.isnull().sum()
missing_values_test = test.isnull().sum()
missing_values_sample_submission = sample_submission.isnull().sum()

missing_values_train, missing_values_test, missing_values_sample_submission

CPU times: user 385 ms, sys: 2.73 ms, total: 388 ms
Wall time: 383 ms


(isic_id                            0
 patient_id                         0
 age_approx                      2798
 sex                            11517
 anatom_site_general             5756
 clin_size_long_diam_mm             0
 image_type                         0
 tbp_tile_type                      0
 tbp_lv_A                           0
 tbp_lv_Aext                        0
 tbp_lv_B                           0
 tbp_lv_Bext                        0
 tbp_lv_C                           0
 tbp_lv_Cext                        0
 tbp_lv_H                           0
 tbp_lv_Hext                        0
 tbp_lv_L                           0
 tbp_lv_Lext                        0
 tbp_lv_areaMM2                     0
 tbp_lv_area_perim_ratio            0
 tbp_lv_color_std_mean              0
 tbp_lv_deltaA                      0
 tbp_lv_deltaB                      0
 tbp_lv_deltaL                      0
 tbp_lv_deltaLB                     0
 tbp_lv_deltaLBnorm                 0
 tbp_lv_ecce

In [7]:
%%time

# Replace NaN with 0 in all datasets
train_filled = train.fillna(0)
test_filled = test.fillna(0)
sample_submission_filled = sample_submission.fillna(0)


train_filled, test_filled, sample_submission_filled

CPU times: user 556 ms, sys: 44.2 ms, total: 600 ms
Wall time: 545 ms


(             isic_id  patient_id  age_approx     sex anatom_site_general  \
 0       ISIC_0015670  IP_1235828        60.0    male     lower extremity   
 1       ISIC_0015845  IP_8170065        60.0    male           head/neck   
 2       ISIC_0015864  IP_6724798        60.0    male     posterior torso   
 3       ISIC_0015902  IP_4111386        65.0    male      anterior torso   
 4       ISIC_0024200  IP_8313778        55.0    male      anterior torso   
 ...              ...         ...         ...     ...                 ...   
 401054  ISIC_9999937  IP_1140263        70.0    male      anterior torso   
 401055  ISIC_9999951  IP_5678181        60.0    male     posterior torso   
 401056  ISIC_9999960  IP_0076153        65.0  female      anterior torso   
 401057  ISIC_9999964  IP_5231513        30.0  female      anterior torso   
 401058  ISIC_9999967  IP_6426047        50.0    male     lower extremity   
 
         clin_size_long_diam_mm          image_type tbp_tile_type   tbp_lv

In [8]:
%%time

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Drop non-numeric features or convert them if needed
X = train_filled.drop(['target', 'patient_id', 'isic_id', 'attribution'], axis=1)
y = train_filled['target']

# Convert categorical variables to numeric
object_cols = X.select_dtypes(include=['object']).columns

label_encoders = {}
for col in object_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le
    
print(X.head())

   age_approx  sex  anatom_site_general  clin_size_long_diam_mm  image_type  \
0        60.0    2                    3                    3.04           0   
1        60.0    2                    2                    1.10           0   
2        60.0    2                    4                    3.40           0   
3        65.0    2                    1                    3.22           0   
4        55.0    2                    1                    2.73           0   

   tbp_tile_type   tbp_lv_A  tbp_lv_Aext   tbp_lv_B  tbp_lv_Bext  ...  \
0              1  20.244422    16.261975  26.922447    23.954773  ...   
1              1  31.712570    25.364740  26.331000    24.549290  ...   
2              0  22.575830    17.128170  37.970460    33.485410  ...   
3              0  14.242329    12.164757  21.448144    21.121356  ...   
4              1  24.725520    20.057470  26.464900    25.710460  ...   

   tbp_lv_perimeterMM  tbp_lv_radial_color_std_max  tbp_lv_stdL  \
0            9.3070

In [9]:
# Show the fitted encoders, keyed by the column each one was fitted on.
label_encoders

{'sex': LabelEncoder(),
 'anatom_site_general': LabelEncoder(),
 'image_type': LabelEncoder(),
 'tbp_tile_type': LabelEncoder(),
 'tbp_lv_location': LabelEncoder(),
 'tbp_lv_location_simple': LabelEncoder(),
 'copyright_license': LabelEncoder()}

In [10]:
%%time

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

CPU times: user 282 ms, sys: 20.9 ms, total: 303 ms
Wall time: 304 ms


In [11]:
# (rows, columns) of the training part of the 70/30 split.
X_train.shape

(280741, 41)

In [12]:
# (rows, columns) of the held-out 30% part of the split.
X_test.shape

(120318, 41)

In [13]:
%%time

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

CPU times: user 589 ms, sys: 209 ms, total: 798 ms
Wall time: 754 ms


In [14]:
# NOTE(review): fit_resample returned new objects (X_train_smote,
# y_train_smote), so X_train is still the un-resampled split at this point.
# If the intent is to check the effect of SMOTE, inspect X_train_smote.shape
# instead -- confirm with the author.
X_train.shape

(280741, 41)

In [None]:
%%time

# Train a logistic regression model with class weight adjustment
model = LogisticRegression(class_weight='balanced', max_iter=2000)
model.fit(X_train_smote, y_train_smote)

X_train = X_train_smote
y_train = y_train_smote

# #Parameters found tuning process by Optuna
# xgb_params = {"n_estimators" : 100000,
#               "learning_rate" : 0.02,
#               'alpha': 3.5353386519926673e-07,
#               'subsample': 0.297482431044015,            
#               'colsample_bytree': 0.6687206013341591,
#               'max_depth': 6,
#               'min_child_weight': 3,
#               "enable_categorical" : True,
#               'gamma': 0.0006433825429109002}

# # Model pipeline with found hyperparameters
# model = XGBClassifier(**xgb_params, random_state=random_state)

model.fit(X_train, y_train, verbose=True)


In [None]:
%%time

# Make predictions and evaluate the model
# y_pred = model.predict(X_test)
y_pred = model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))

In [None]:
%%time

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# The confusion matrix shows:

True Positives (TP): 117 (Malignant cases correctly identified)

True Negatives (TN): 120,196 (Benign cases correctly identified)

False Positives (FP): 4 (Benign cases incorrectly identified as Malignant)

False Negatives (FN): 1 (Malignant case incorrectly identified as Benign)

In [None]:
%%time

from sklearn.metrics import roc_curve, auc
import numpy as np

# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])

# Find the index where TPR is just above 80%
threshold_index = np.where(tpr >= 0.8)[0][0]

# Calculate pAUC above 80% TPR
fpr_restricted = fpr[threshold_index:]
tpr_restricted = tpr[threshold_index:]
pAUC = auc(fpr_restricted, tpr_restricted)

print(f'Partial AUC above 80% TPR: {pAUC}')

In [None]:
# Show the per-column encoders again before they are applied to the test data.
label_encoders

In [None]:
%%time

# predict test set
X_test = test_filled.drop(['patient_id', 'isic_id', 'attribution'], axis=1)

# Convert categorical variables to numeric
# object_cols = X_test.select_dtypes(include=['object']).columns
X_test.head()

In [None]:
# Apply the encoders fitted on the training data to the test features.
#
# Changes from a plain `le.transform(...)`:
# * A category that never occurred in training is encoded as -1 instead of
#   raising ValueError, so an unseen value in the test set cannot abort the
#   submission run. Known categories get the same code le.transform would
#   return (their position in le.classes_).
# * The raw values are read from test_filled rather than from X_test itself,
#   so re-running this cell gives the same result instead of re-encoding
#   already-encoded integers.
unseen_counts = {}
for col in object_cols:
    le = label_encoders[col]
    code_of = {label: code for code, label in enumerate(le.classes_)}
    codes = test_filled[col].astype(str).map(code_of)

    n_unseen = int(codes.isna().sum())
    if n_unseen:
        unseen_counts[col] = n_unseen

    X_test[col] = codes.fillna(-1).astype(int)

# Make the fallback visible rather than silent.
if unseen_counts:
    print("Categories unseen in training were encoded as -1:", unseen_counts)

In [None]:
# Preview the test features after the encoding loop has run.
X_test.head()

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
# for every row of the test feature matrix.
proba_matrix = model.predict_proba(X_test)
y_test_pred_proba = proba_matrix[:, 1]

# The submission's 'target' column carries probabilities, not hard labels.
# Values are assigned in the order given, so this relies on the template rows
# being in the same order as the test rows.
sample_submission['target'] = y_test_pred_proba

sample_submission.head()

In [None]:
%%time

# Save the new submission file
submission_path = 'submission.csv'
sample_submission.to_csv(submission_path, index=False)