# Import Required Libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


# Load Dataset

In [26]:
# Load dataset
df = pd.read_csv("dataset.csv")

print(df.head())
print(df.info())


   Age Gender Smoking Hx Smoking Hx Radiothreapy Thyroid Function  \
0   27      F      No         No              No        Euthyroid   
1   34      F      No        Yes              No        Euthyroid   
2   30      F      No         No              No        Euthyroid   
3   62      F      No         No              No        Euthyroid   
4   62      F      No         No              No        Euthyroid   

          Physical Examination Adenopathy       Pathology     Focality Risk  \
0   Single nodular goiter-left         No  Micropapillary    Uni-Focal  Low   
1          Multinodular goiter         No  Micropapillary    Uni-Focal  Low   
2  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
3  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
4          Multinodular goiter         No  Micropapillary  Multi-Focal  Low   

     T   N   M Stage       Response Recurred  
0  T1a  N0  M0     I  Indeterminate       No  
1  T1a  N0  M0  

# Handle Missing Values

In [27]:
# Fill missing numerical values with median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical values with mode
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


# Encode Categorical Features
Medical datasets are mostly categorical, so encoding is essential.

In [28]:
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


# Select Features & Target

In [29]:
X = df.drop('Recurred', axis=1)
y = df['Recurred']


# Trainâ€“Test Split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Feature Scaling

In [31]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Train Machine Learning Model
(Random Forest works well for medical + mixed data)

In [32]:
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Model Evaluation

In [33]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("ðŸ“Š Model Performance")
print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1 Score :", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


ðŸ“Š Model Performance
Accuracy : 0.961038961038961
Precision: 1.0
Recall   : 0.8636363636363636
F1 Score : 0.926829268292683

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        55
           1       1.00      0.86      0.93        22

    accuracy                           0.96        77
   macro avg       0.97      0.93      0.95        77
weighted avg       0.96      0.96      0.96        77



# Predict Recurrence for a New Patient

In [38]:
import numpy as np
import pandas as pd

# ---------- SAFE TRANSFORM FUNCTION ----------
def safe_label_transform(encoder, value):
    """
    If value not seen during training, assign most frequent class (index 0)
    """
    if value in encoder.classes_:
        return encoder.transform([value])[0]
    else:
        return 0  # fallback (most frequent class)

# ---------- NEW PATIENT DATA ----------
new_patient = {
    'Age': 45,
    'Gender': 'Female',
    'Smoking': 'No',
    'Hx Smoking': 'No',
    'Hx Radiotherapy': 'Yes',
    'Thyroid Function': 'Abnormal',
    'Physical Examination': 'Abnormal',
    'Adenopathy': 'Yes',
    'Pathology': 'Papillary',
    'Focality': 'Multifocal',
    'Risk': 'High',
    'T': 'T3',
    'N': 'N1',
    'M': 'M0',
    'Stage': 'Stage III',
    'Response': 'Incomplete'
}

# ---------- ENCODE USING TRAINED LABEL ENCODERS ----------
encoded_patient = {}

for col, value in new_patient.items():
    if col in label_encoders:
        encoded_patient[col] = safe_label_transform(label_encoders[col], value)
    else:
        encoded_patient[col] = value

# Convert to DataFrame
new_patient_df = pd.DataFrame([encoded_patient])

# ---------- SCALE ----------
new_patient_scaled = scaler.transform(new_patient_df)

# ---------- PREDICT ----------
prediction = model.predict(new_patient_scaled)[0]
probability = model.predict_proba(new_patient_scaled)[0][1]

# ---------- OUTPUT ----------
print("Prediction:", "Recurrence Likely" if prediction == 1 else "Low Risk")
print("Recurrence Probability:", round(probability * 100, 2), "%")


<class 'ValueError'>: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Hx Radiotherapy
Feature names seen at fit time, yet now missing:
- Hx Radiothreapy
