In [40]:
import numpy as np
import pandas as pd
import joblib
from IPython.display import display, clear_output
import ipywidgets as widgets
from sklearn.preprocessing import LabelEncoder

In [41]:
# Load the trained model
rf = joblib.load('best_random_forest.pkl')

# Load the dataset to extract means for non-editable features
df = pd.read_csv("cosmic_clinvar_processed.tsv", sep="\t")

df1 = pd.read_csv("cosmic_clinvar.tsv", sep="\t")

  df1 = pd.read_csv("cosmic_clinvar.tsv", sep="\t")


In [42]:
mutation_type_options = [
    '5 prime UTR variant', 'frameshift variant', 'inframe deletion', 'inframe insertion',
    'missense variant', 'protein altering variant', 'splice acceptor variant', 'splice donor variant',
    'splice region variant', 'start lost', 'stop gained', 'stop lost', 'stop retained variant',
    'synonymous variant'
]

somatic_status_options = [
    'Not specified',
    'Reported in another cancer sample as somatic',
    'Variant of unknown origin'
]

gene_list = list(df1["GENE_SYMBOL"].unique())

In [43]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(df1["GENE_SYMBOL"])

array([258, 258, 258, ...,  19,  19,  14])

In [44]:
# Get min/max for numeric features
def get_minmax(col):
    return df[col].min(), df[col].max()

# Create numeric sliders / boxes
def create_numeric_input(name):
    min_val, max_val = get_minmax(name)
    return widgets.BoundedFloatText(
        value=(min_val + max_val) / 2,
        min=min_val,
        max=max_val,
        step=1.0,
        description=name,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%')
    )

AA_Length_Change = create_numeric_input("AA_Length_Change")
GENOME_START = create_numeric_input("GENOME_START")
GENOME_STOP = create_numeric_input("GENOME_STOP")
Gene_Encoded = create_numeric_input("Gene_Encoded")
Hydrophobicity_Change = create_numeric_input("Hydrophobicity_Change")
BLOSUM62_Score = create_numeric_input("BLOSUM62_Score")
Charge_Change = create_numeric_input("Charge_Change")
CHROMOSOME = create_numeric_input("CHROMOSOME")

mutation_type = widgets.Dropdown(
    options=mutation_type_options,
    description='Mutation Type:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)

somatic_status = widgets.Dropdown(
    options=somatic_status_options,
    description='Somatic Status:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)

gene_input = widgets.Text(
    value='LZTR1',
    placeholder='Enter gene symbol (e.g., LZTR1)',
    description='Gene:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)


In [45]:
predict_button = widgets.Button(
    description='Predict Mutation Pathogenicity',
    button_style='success',
    layout=widgets.Layout(width='50%')
)

output_box = widgets.Output()

def on_predict_clicked(b):
    with output_box:
        clear_output()

        # Initialize with medians
        row = df.median().copy()

        # Fill in user inputs
        row['AA_Length_Change'] = AA_Length_Change.value
        row['GENOME_START'] = GENOME_START.value
        row['GENOME_STOP'] = GENOME_STOP.value
        gene_name = gene_input.value.strip().upper()
        if gene_name not in gene_list:
            print(f"⚠️ '{gene_name}' is not a recognized gene. Please try again.")
            return
        row['Gene_Encoded'] = label_encoder.transform([gene_name])[0]
        row['Hydrophobicity_Change'] = Hydrophobicity_Change.value
        row['BLOSUM62_Score'] = BLOSUM62_Score.value
        row['Charge_Change'] = Charge_Change.value
        row['CHROMOSOME'] = CHROMOSOME.value

        # One-hot encode mutation type
        for cat in mutation_type_options:
            row[cat] = 1.0 if cat == mutation_type.value else 0.0

        # One-hot encode somatic status
        for status in somatic_status_options:
            col = f"MUTATION_SOMATIC_STATUS_{status}"
            if col in row:
                row[col] = 1.0 if status == somatic_status.value else 0.0

        # Select only the model input columns
        model_input = row[df.drop(columns='Label').columns].values.reshape(1, -1)

        # Predict
        prediction = rf.predict(model_input)[0]
        prob = rf.predict_proba(model_input)[0]

        label = "Pathogenic" if prediction == 1 else "Benign"
        confidence = np.max(prob) * 100

        print(f"🧬 Prediction: {label}")
        print(f"📊 Confidence: {confidence:.2f}%")

predict_button.on_click(on_predict_clicked)

In [47]:
form_items = widgets.VBox([
    mutation_type,
    somatic_status,
    CHROMOSOME,
    gene_input,
    AA_Length_Change,
    GENOME_START,
    GENOME_STOP,
    Hydrophobicity_Change,
    BLOSUM62_Score,
    Charge_Change,
    predict_button,
    output_box
])

display(form_items)

VBox(children=(Dropdown(description='Mutation Type:', index=4, layout=Layout(width='50%'), options=('5 prime U…

🧬 Prediction: Benign
📊 Confidence: 51.80%
