In [None]:
from io import StringIO

import joblib  # For saving and loading model
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [None]:
# --- Data Loading ---
# Read the CSV export from disk into a DataFrame and preview its first rows
data_path = "/content/Ovarian Cyst Track Data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Patient ID,Age,Menopause Status,Cyst Size cm,Cyst Growth Rate cm/month,CA 125 Level,Ultrasound Features,Reported Symptoms,Recommended Management,Date of Exam,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,OC-1000,52,Post-menopausal,3.2,-0.34,19,Septated cyst,"Pelvic pain, Nausea, Bloating",Observation,2025-05-23,...,,,,,,,,,,
1,OC-1001,62,Post-menopausal,7.9,0.84,111,Septated cyst,Bloating,Medication,2025-04-27,...,,,,,,,,,,
2,OC-1002,59,Post-menopausal,2.2,0.5,123,Hemorrhagic cyst,"Pelvic pain, Irregular periods, Bloating",Referral,2025-01-05,...,,,,,,,,,,
3,OC-1003,64,Post-menopausal,5.5,1.11,116,Solid mass,"Nausea, Irregular periods",Medication,2025-03-11,...,,,,,,,,,,
4,OC-1004,43,Post-menopausal,4.0,0.9,98,Hemorrhagic cyst,"Irregular periods, Bloating, Nausea",Surgery,2025-02-17,...,,,,,,,,,,


In [None]:
# --- Data Preprocessing ---
# Discard columns in which every value is missing (e.g. those created by trailing commas in the CSV)
df = df.dropna(axis='columns', how='all')

In [None]:
# Model inputs and prediction target, as specified by the problem description
feature_cols_spec = [
    'Age',
    'Menopause Status',
    'Cyst Size cm',
    'Cyst Growth Rate cm/month',
    'CA 125 Level',
    'Ultrasound Features',
    'Reported Symptoms',
]
target_col_spec = 'Recommended Management'

In [None]:
# Restrict the frame to the model inputs plus the target; work on a copy so `df` stays untouched
selected_columns = [*feature_cols_spec, target_col_spec]
df_processed = df[selected_columns].copy()
df_processed.head(2)

Unnamed: 0,Age,Menopause Status,Cyst Size cm,Cyst Growth Rate cm/month,CA 125 Level,Ultrasound Features,Reported Symptoms,Recommended Management
0,52,Post-menopausal,3.2,-0.34,19,Septated cyst,"Pelvic pain, Nausea, Bloating",Observation
1,62,Post-menopausal,7.9,0.84,111,Septated cyst,Bloating,Medication


In [None]:
# 1. Handle 'Menopause Status'
# Encode: Pre = 0, Post = 1
menopause_codes = {'Pre-menopausal': 0, 'Post-menopausal': 1}
# Map from the raw frame rather than from df_processed so that re-running this cell is
# idempotent: mapping the already-encoded 0/1 values a second time would turn the whole
# column into NaN. df_processed was copied from df, so both share the same row index.
menopause_raw = df['Menopause Status']
df_processed['Menopause Status'] = menopause_raw.map(menopause_codes)

# .map() silently converts labels missing from the mapping into NaN; report them instead.
unmapped_labels = menopause_raw[df_processed['Menopause Status'].isna() & menopause_raw.notna()].unique()
if len(unmapped_labels) > 0:
    print(f"Warning: unmapped 'Menopause Status' labels encoded as NaN: {list(unmapped_labels)}")

In [None]:
# 2. Handle 'Reported Symptoms' (NLP-extracted flags)
def _split_symptoms(raw_text):
    """Turn a comma-separated symptom string into a list of stripped, lowercase, non-empty tokens."""
    stripped_pieces = (piece.strip() for piece in raw_text.split(','))
    return [piece.lower() for piece in stripped_pieces if piece]


# Missing entries become '' so that the split above never receives NaN
df_processed['Reported Symptoms'] = df_processed['Reported Symptoms'].fillna('')
symptoms_list = df_processed['Reported Symptoms'].apply(_split_symptoms)

In [None]:
# Binarize the per-row symptom lists with MultiLabelBinarizer (one column per distinct symptom)
mlb = MultiLabelBinarizer()
symptoms_encoded = mlb.fit_transform(symptoms_list)

# Readable column names: "Symptom_" followed by the symptom with spaces replaced by underscores
symptom_column_names = ["Symptom_" + label.replace(' ', '_') for label in mlb.classes_]
symptoms_df = pd.DataFrame(
    data=symptoms_encoded,
    index=df_processed.index,
    columns=symptom_column_names,
)

In [None]:
# Append the symptom indicator columns, then remove the raw text column they replace
df_processed = pd.concat([df_processed, symptoms_df], axis=1).drop(columns='Reported Symptoms')

In [None]:
# 3. Handle 'Ultrasound Features' (Categorical) using One-Hot Encoding
# Each category becomes its own indicator column, named with the "US" prefix
df_processed = pd.get_dummies(
    df_processed,
    prefix='US',
    columns=['Ultrasound Features'],
)

In [None]:
# Sanity check: (rows, columns) of the encoded frame, target column included
df_processed.shape

(100, 16)

In [None]:
# Separate the target (y) from the feature matrix (X)
y = df_processed[target_col_spec]
X = df_processed.drop(columns=[target_col_spec])

# Force every column label to str (can prevent issues with some libraries/versions)
X.columns = X.columns.astype(str)

In [None]:
# --- Split Data ---
# Hold out 20% of the rows for testing. Stratifying on y keeps the class distribution
# similar in the train and test sets, which matters for small datasets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# --- Model Training (Random Forest with a Preprocessing Pipeline) ---

# Columns that the pipeline will standardize
numerical_features = [
    'Age',
    'Cyst Size cm',
    'Cyst Growth Rate cm/month',
    'CA 125 Level',
]

In [None]:
# Column-wise preprocessor:
#   - the numerical columns are standardized
#   - every other column (already encoded above) is passed through unchanged
numeric_scaling_step = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(
    transformers=[numeric_scaling_step],
    remainder='passthrough',
)

In [None]:
# Assemble the full pipeline: preprocessing first, then the classifier
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),  # Using RandomForest as a robust choice
]
model_pipeline = Pipeline(steps=pipeline_steps)

# Fit the pipeline on the training split
model_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# --- Model Evaluation ---
# Predict a label for every row of the held-out test split
y_pred = model_pipeline.predict(X_test)
# Preview the first five predictions
y_pred[:5]

array(['Observation', 'Observation', 'Medication', 'Observation',
       'Referral'], dtype=object)

In [None]:
# Overall accuracy plus the per-class precision / recall / F1 report on the test split
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(
    y_test,
    y_pred,
    zero_division=0,  # zero_division added for robustness
)

print(f"Model Accuracy: {accuracy:.4f}\n")
print("Classification Report:\n", class_report)

Model Accuracy: 0.1000

Classification Report:
               precision    recall  f1-score   support

  Medication       0.14      0.17      0.15         6
 Observation       0.00      0.00      0.00         5
    Referral       0.00      0.00      0.00         4
     Surgery       0.25      0.20      0.22         5

    accuracy                           0.10        20
   macro avg       0.10      0.09      0.09        20
weighted avg       0.11      0.10      0.10        20



In [None]:
# --- Feature Importances (Optional but informative) ---
# Pull the fitted classifier step out of the baseline pipeline and read its
# feature_importances_ array (one value per column the classifier was trained on)
rf_classifier = model_pipeline.named_steps['classifier']
importances = rf_classifier.feature_importances_

In [None]:
# Rebuild the feature names in the order the classifier receives them (after the preprocessor):
#   1. the scaled numerical features, in the order given by `numerical_features`
#   2. the passthrough (remainder) columns, in their X_train column order
passthrough_cols_in_X_train = [name for name in X_train.columns if name not in numerical_features]
final_ordered_feature_names = numerical_features + passthrough_cols_in_X_train

if len(final_ordered_feature_names) != len(importances):
    # The reconstructed names cannot be paired with the importances; report both counts
    print("\nWarning: Mismatch in the number of feature names and importances.")
    print(f"Number of features expected by classifier: {len(importances)}")
    print(f"Number of reconstructed feature names: {len(final_ordered_feature_names)}")
    print("Feature importances might not be correctly aligned with names.")
else:
    feature_importance_df = pd.DataFrame(
        {'Feature': final_ordered_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print("\nFeature Importances (Top 10):\n", feature_importance_df.head(10))


Feature Importances (Top 10):
                      Feature  Importance
2  Cyst Growth Rate cm/month    0.161932
1               Cyst Size cm    0.154926
0                        Age    0.147169
3               CA 125 Level    0.138656
6            Symptom_fatigue    0.051935
9        Symptom_pelvic_pain    0.049863
7  Symptom_irregular_periods    0.042908
8             Symptom_nausea    0.038896
4           Menopause Status    0.036860
5           Symptom_bloating    0.035840


## Model Evaluation

### Key Reasons for Poor Performance & Solutions:
1. Extremely Small Dataset:
    * Problem: You have only 100 samples. Machine learning models, especially complex ones like Random Forest, generally require significantly more data to learn robust patterns. With so few samples, the model is likely overfitting to the training data (memorizing it) and not generalizing to the test data. The train_test_split of 80/20 means your test set is only 20 samples, making evaluation very noisy.
    * Solution:
        * Get More Data: This is the most impactful solution if feasible.
        * Use Cross-Validation: Instead of a single train/test split, use k-fold cross-validation (e.g., StratifiedKFold) for a more robust estimate of performance, especially during hyperparameter tuning. For final evaluation, you might still hold out a test set, but for model selection, CV is better with small data.
        * Data Augmentation (Advanced/Careful): For tabular data, this is harder than for images. Techniques like SMOTE (Synthetic Minority Over-sampling Technique) can help, especially if class imbalance is also an issue.

2. Class Imbalance:
    * Problem: Looking at the support column of the classification report (counts in the test set: Medication 6, Observation 5, Referral 4, Surgery 5), the classes are only mildly imbalanced, with "Referral" being the least frequent. Even a mild imbalance can bias the model towards predicting the more frequent classes or keep it from learning the less frequent ones, although with counts this close it is unlikely to be the main cause of the poor accuracy.
    * Solution:
        * class_weight='balanced': Many scikit-learn classifiers (including RandomForestClassifier) have this parameter.

### Class Imbalance

In [None]:
# Same pipeline as the baseline, but with class_weight='balanced' on the forest.
# The preprocessor is cloned because a Pipeline does not copy its steps: handing the same
# ColumnTransformer object to two pipelines would let each fit() overwrite the fitted
# scaler state that the other pipeline relies on. clone() returns an unfitted copy with
# identical parameters.
model_pipeline_balanced = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

In [None]:
# Fit the class-weighted pipeline on the training split
model_pipeline_balanced.fit(X_train, y_train)

# --- Model Evaluation ---
# Score the class-weighted model on the same held-out test split as the baseline
y_pred_balanced = model_pipeline_balanced.predict(X_test)
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
class_report_balanced = classification_report(
    y_test,
    y_pred_balanced,
    zero_division=0,  # zero_division added for robustness
)

print(f"Model Accuracy: {accuracy_balanced:.4f}\n")
print("Classification Report:\n", class_report_balanced)

Model Accuracy: 0.1000

Classification Report:
               precision    recall  f1-score   support

  Medication       0.20      0.33      0.25         6
 Observation       0.00      0.00      0.00         5
    Referral       0.00      0.00      0.00         4
     Surgery       0.00      0.00      0.00         5

    accuracy                           0.10        20
   macro avg       0.05      0.08      0.06        20
weighted avg       0.06      0.10      0.07        20



### Model

In [None]:
# GridSearchCV and StratifiedKFold are imported in the notebook's first (imports) cell.

# Define parameter grid (keys use the "<pipeline step>__<parameter>" convention)
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__class_weight': [None, 'balanced'] # Add class_weight here
}

# Use StratifiedKFold for cross-validation.
# NOTE: with 80 training rows, 5 folds leave about 16 rows per validation fold, so the
# fold scores are noisy; 3 or 4 folds are worth considering.
cv_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV fits clones of model_pipeline; the pipeline object passed in is left as it was.
grid_search = GridSearchCV(model_pipeline, param_grid, cv=cv_stratified, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
# Show the score the search actually selected on (mean cross-validated accuracy)
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=0)
print(f"Model Accuracy (after GridSearchCV): {accuracy:.4f}\n")
print("Classification Report (after GridSearchCV):\n", class_report)

Best parameters found:  {'classifier__class_weight': None, 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Model Accuracy (after GridSearchCV): 0.1000

Classification Report (after GridSearchCV):
               precision    recall  f1-score   support

  Medication       0.12      0.17      0.14         6
 Observation       0.12      0.20      0.15         5
    Referral       0.00      0.00      0.00         4
     Surgery       0.00      0.00      0.00         5

    accuracy                           0.10        20
   macro avg       0.06      0.09      0.07        20
weighted avg       0.07      0.10      0.08        20



In [None]:
# --- Feature Importances (Optional but informative) ---
# Extract the tuned classifier from the grid-search winner. GridSearchCV fits clones of the
# estimator it is given, so `model_pipeline` still holds the untuned baseline fit; the tuned
# model is `best_model` (grid_search.best_estimator_).
rf_classifier = best_model.named_steps['classifier']
importances = rf_classifier.feature_importances_

In [None]:
# Rebuild the feature names in the order the classifier receives them (after the preprocessor):
#   1. the scaled numerical features, in the order given by `numerical_features`
#   2. the passthrough (remainder) columns, in their X_train column order
passthrough_cols_in_X_train = [name for name in X_train.columns if name not in numerical_features]
final_ordered_feature_names = numerical_features + passthrough_cols_in_X_train

if len(final_ordered_feature_names) != len(importances):
    # The reconstructed names cannot be paired with the importances; report both counts
    print("\nWarning: Mismatch in the number of feature names and importances.")
    print(f"Number of features expected by classifier: {len(importances)}")
    print(f"Number of reconstructed feature names: {len(final_ordered_feature_names)}")
    print("Feature importances might not be correctly aligned with names.")
else:
    feature_importance_df = pd.DataFrame(
        {'Feature': final_ordered_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print("\nFeature Importances (Top 10):\n", feature_importance_df.head(10))

## Model Inference

In [None]:
# End-to-end training cell: reload the data, rebuild the features, tune the model with
# GridSearchCV, evaluate it, and save everything needed for inference.
# joblib, GridSearchCV and StratifiedKFold are imported in the notebook's first (imports) cell.
df = pd.read_csv("/content/Ovarian Cyst Track Data.csv")
# Drop columns that contain no values at all
df = df.dropna(axis=1, how='all')

# --- Data Preprocessing ---
feature_cols_spec = ['Age', 'Menopause Status', 'Cyst Size cm', 'Cyst Growth Rate cm/month',
                     'CA 125 Level', 'Ultrasound Features', 'Reported Symptoms']
target_col_spec = 'Recommended Management'
df_processed = df[feature_cols_spec + [target_col_spec]].copy()

# Encode menopause status: Pre = 0, Post = 1 (labels outside this mapping become NaN)
df_processed['Menopause Status'] = df_processed['Menopause Status'].map({'Pre-menopausal': 0, 'Post-menopausal': 1})

# Parse the comma-separated symptom text into lists of stripped, lowercase, non-empty tokens
df_processed['Reported Symptoms'] = df_processed['Reported Symptoms'].fillna('')
symptoms_list = df_processed['Reported Symptoms'].apply(
    lambda x: [s.strip().lower() for s in x.split(',') if s.strip()]
)

# One 0/1 indicator column per distinct symptom.
# NOTE: the binarizer is fitted on all rows (before the train/test split), so its symptom
# vocabulary also covers symptoms that occur only in test rows.
mlb = MultiLabelBinarizer()
symptoms_encoded = mlb.fit_transform(symptoms_list)
symptom_column_names = [f"Symptom_{s.replace(' ', '_')}" for s in mlb.classes_]
symptoms_df = pd.DataFrame(symptoms_encoded, columns=symptom_column_names, index=df_processed.index)
df_processed = pd.concat([df_processed, symptoms_df], axis=1)
df_processed = df_processed.drop(columns='Reported Symptoms')

# Capture ultrasound feature categories before dummification for inference
unique_ultrasound_features = df_processed['Ultrasound Features'].unique()
df_processed = pd.get_dummies(df_processed, columns=['Ultrasound Features'], prefix='US')
ultrasound_one_hot_columns = [col for col in df_processed.columns if col.startswith('US_')]


X = df_processed.drop(columns=[target_col_spec])
y = df_processed[target_col_spec]
X.columns = X.columns.astype(str) # Ensure string column names

# Capture the full list of columns X will have before passing to the pipeline
# This is crucial for inference to recreate the same structure
# These are the columns AFTER manual one-hot encoding for symptoms and ultrasound
# but BEFORE the ColumnTransformer in the pipeline acts.
pipeline_input_columns = X.columns.tolist()


# --- Split Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Model Training (Random Forest with Preprocessing Pipeline and GridSearchCV) ---
numerical_features = ['Age', 'Cyst Size cm', 'Cyst Growth Rate cm/month', 'CA 125 Level']

# Scale the numerical columns; pass every other (already encoded) column through unchanged
preprocessor_ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'
)

# Full pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_ct),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter Tuning with GridSearchCV
# Note: For very small datasets, extensive grid search can still overfit.
# Consider a smaller grid or fewer CV folds if results are unstable.
param_grid = {
    'classifier__n_estimators': [50, 100], # Reduced for small dataset
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 3],
    'classifier__class_weight': ['balanced', None] # Crucial for imbalance
}

# Use StratifiedKFold with 4 folds so that each validation fold keeps a reasonable
# number of rows (the training split is 80% of the data).
cv_stratified = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid_search = GridSearchCV(model_pipeline, param_grid, cv=cv_stratified, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)
# Show the score the search actually selected on (mean cross-validated accuracy)
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")

# --- Model Evaluation (on test set with the best model) ---
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
class_report_best = classification_report(y_test, y_pred_best, zero_division=0)

print(f"\nBest Model Accuracy on Test Set: {accuracy_best:.4f}\n")
print("Best Model Classification Report on Test Set:\n", class_report_best)

# --- Save the Model and Necessary Preprocessing Objects ---
# NOTE: nothing in this cell writes to the next two filenames; the model and the encoder
# are stored together in the single artifacts file below. The names are kept in case
# other cells refer to them.
model_filename = 'ovarian_cyst_management_model.joblib'
mlb_filename = 'symptom_mlb_encoder.joblib'
artifacts_filename = 'ovarian_cyst_inference_artifacts.joblib'
# Other necessary info for inference:
# - pipeline_input_columns (list of columns expected by the pipeline's input)
# - symptom_column_names (generated by mlb)
# - ultrasound_one_hot_columns (generated by pd.get_dummies)
# - unique_ultrasound_features (raw categories seen before one-hot encoding)
# - numerical_features (used by StandardScaler within the pipeline)

inference_artifacts = {
    'model': best_model,
    'mlb_encoder': mlb,
    'symptom_column_names': symptom_column_names, # From fitted mlb
    'ultrasound_one_hot_columns': ultrasound_one_hot_columns, # Columns created by get_dummies
    'unique_ultrasound_features': list(unique_ultrasound_features), # Raw categories captured above
    'pipeline_input_columns': pipeline_input_columns, # Full ordered list of columns for pipeline input
    'numerical_features': numerical_features # Just for reference, pipeline handles this
}

joblib.dump(inference_artifacts, artifacts_filename)
print(f"\nModel and preprocessing artifacts saved to {artifacts_filename}")

# --- Example of printing feature importances from the best model ---
# Ensure the classifier step in the best_model pipeline is named 'classifier'
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    importances = best_model.named_steps['classifier'].feature_importances_

    # Reconstruct the feature names as seen by the classifier:
    # the 'num' transformer outputs the scaled numerical features first, then
    # remainder='passthrough' appends the remaining columns in their original order.
    passthrough_cols = [col for col in pipeline_input_columns if col not in numerical_features]
    transformed_feature_names = list(numerical_features) + passthrough_cols

    if len(transformed_feature_names) == len(importances):
        feature_importance_df = pd.DataFrame({'Feature': transformed_feature_names, 'Importance': importances})
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
        print("\nFeature Importances (Best Model - Top 10):\n", feature_importance_df.head(10))
    else:
        print("\nWarning: Mismatch in the number of feature names and importances for the best model.")
        print(f"Num transformed_feature_names: {len(transformed_feature_names)}, Num importances: {len(importances)}")
else:
    print("\nThe selected best model's classifier does not have feature_importances_ attribute.")

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best parameters found:  {'classifier__class_weight': 'balanced', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}

Best Model Accuracy on Test Set: 0.1500

Best Model Classification Report on Test Set:
               precision    recall  f1-score   support

  Medication       0.14      0.17      0.15         6
 Observation       0.11      0.20      0.14         5
    Referral       0.00      0.00      0.00         4
     Surgery       0.50      0.20      0.29         5

    accuracy                           0.15        20
   macro avg       0.19      0.14      0.15        20
weighted avg       0.20      0.15      0.15        20


Model and preprocessing artifacts saved to ovarian_cyst_inference_artifacts.joblib

Feature Importances (Best Model - Top 10):
                       Feature  Importance
2   Cyst Growth Rate cm/month    0.172838
0    