In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report

# Load the training metadata. low_memory=False asks pandas to parse the file in a
# single pass rather than in chunks, which avoids mixed-dtype inference per column.
df = pd.read_csv("../../data/data/train-metadata.csv", low_memory=False)

# Share of missing values per column, expressed as a percentage of all rows.
missing_percentages = df.isnull().sum().div(len(df)).mul(100)

# Keep only the columns in which fewer than 50% of the values are missing.
is_mostly_present = missing_percentages < 50
columns_to_keep = missing_percentages.index[is_mostly_present]
skin_cancer_df = df.loc[:, columns_to_keep]

# Fill the remaining gaps with each column's most frequent value
# (the first row of DataFrame.mode() holds one mode per column).
column_modes = skin_cancer_df.mode().iloc[0]
skin_cancer_df = skin_cancer_df.fillna(value=column_modes)


  df = pd.read_csv("../../data/data/train-metadata.csv")


In [None]:

# Remove label leakage: with the diagnosis columns (iddx_*) among the features, the
# recorded run scores a perfect 1.0 and ranks iddx_1 / iddx_full as its two most
# important features, i.e. the model is reading the answer instead of learning it.
leakage_columns = [col for col in skin_cancer_df.columns if col.startswith('iddx_')]
skin_cancer_df = skin_cancer_df.drop(columns=leakage_columns)

# Encode categorical variables
categorical_columns = skin_cancer_df.select_dtypes(include=['object']).columns

# One-hot encoding creates one column per distinct value, so text columns with very
# many distinct values (presumably image / patient identifiers -- TODO confirm
# against the data dictionary) are dropped rather than encoded.
MAX_ONE_HOT_LEVELS = 50
high_cardinality_columns = [
    col for col in categorical_columns
    if skin_cancer_df[col].nunique() > MAX_ONE_HOT_LEVELS
]
skin_cancer_df = skin_cancer_df.drop(columns=high_cardinality_columns)
categorical_columns = categorical_columns.drop(high_cardinality_columns)

# Apply one-hot encoding (drop_first avoids a redundant dummy per category)
skin_cancer_df = pd.get_dummies(skin_cancer_df, columns=categorical_columns, drop_first=True)
X = skin_cancer_df.drop('target', axis=1)
y = skin_cancer_df['target']

# Positives are extremely rare (74 of 80,212 test rows in the recorded run), so
# stratify on y to keep the positive rate the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training split only; the test split is left
# untouched so evaluation reflects the real class balance.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

Accuracy: 1.0

Top 10 Most Important Features:
                         feature  importance
45                        iddx_1    0.341921
44                     iddx_full    0.301331
14                      tbp_lv_H    0.061557
46  tbp_lv_dnn_lesion_confidence    0.046883
18                tbp_lv_areaMM2    0.041050
43             copyright_license    0.026922
29            tbp_lv_minorAxisMM    0.024551
5         clin_size_long_diam_mm    0.018127
15                   tbp_lv_Hext    0.015758
22                 tbp_lv_deltaB    0.013479

Confusion Matrix:
[[80138     0]
 [    0    74]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     80138
           1       1.00      1.00      1.00        74

    accuracy                           1.00     80212
   macro avg       1.00      1.00      1.00     80212
weighted avg       1.00      1.00      1.00     80212



In [None]:

# Fit the forest on the SMOTE-resampled training data.
# NOTE(review): if SMOTE has already equalised the class counts, class_weight='balanced'
# presumably has no further effect here -- confirm before relying on it.
rf = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)
rf.fit(X_train_sm, y_train_sm)

# Predict the held-out set once and reuse the predictions for every metric below
# (rf.score would run a second full prediction pass over X_test).
y_pred = rf.predict(X_test)

# Accuracy is the fraction of correct predictions. With positives far below 1% of
# the test rows it says very little on its own: always predicting class 0 would
# already score above 0.99, so read it together with the per-class report below.
accuracy = float((y_pred == y_test.to_numpy()).mean())
print(f"Accuracy: {accuracy}")

# Impurity-based importances, one per feature column, highest first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))