In [1]:
#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler

#classification
from sklearn.ensemble import RandomForestClassifier

# performance metrics
from sklearn.metrics import f1_score,precision_score, recall_score,make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score

#Pipeline
from sklearn.pipeline import Pipeline

#Model Selection
from sklearn.model_selection import RandomizedSearchCV

# Decomposition
from sklearn.decomposition import PCA

In [2]:
# Read the feature table and the per-sample label table from disk.
data = pd.read_csv('data.csv')
labels = pd.read_csv('labels.csv')

# Whatever the first column is called in each file, expose it as
# "sample_id" so the two tables share a join key.
data = data.rename(columns={data.columns[0]: "sample_id"})
labels = labels.rename(columns={labels.columns[0]: "sample_id"})

# Join features and labels on sample_id (pandas' default join type is inner).
cancer_data = data.merge(labels, on='sample_id')

# Preview the first rows of the merged frame.
print(cancer_data.head())

  sample_id  gene_0    gene_1    gene_2    gene_3     gene_4  gene_5  \
0  sample_0     0.0  2.017209  3.265527  5.478487  10.431999     0.0   
1  sample_1     0.0  0.592732  1.588421  7.586157   9.623011     0.0   
2  sample_2     0.0  3.511759  4.327199  6.881787   9.870730     0.0   
3  sample_3     0.0  3.663618  4.507649  6.659068  10.196184     0.0   
4  sample_4     0.0  2.655741  2.821547  6.539454   9.738265     0.0   

     gene_6    gene_7  gene_8  ...  gene_20522  gene_20523  gene_20524  \
0  7.175175  0.591871     0.0  ...    8.210257    9.723516    7.220030   
1  6.816049  0.000000     0.0  ...    7.323865    9.740931    6.256586   
2  6.972130  0.452595     0.0  ...    8.127123   10.908640    5.401607   
3  7.843375  0.434882     0.0  ...    8.792959   10.141520    8.942805   
4  6.566967  0.360982     0.0  ...    8.891425   10.373790    7.181162   

   gene_20525  gene_20526  gene_20527  gene_20528  gene_20529  gene_20530  \
0    9.119813   12.003135    9.650743    8.92

In [3]:
# Step 3: Feature and Target Separation
# The target is the 'Class' column; every column other than the sample
# identifier and the target is kept as a feature.
y = cancer_data['Class']
X = cancer_data.drop(columns=['sample_id', 'Class'])

# Step 4: Label Encoding
# Map each distinct class label to an integer code; `le.classes_` keeps the
# original labels for decoding later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 5: Feature Scaling
# Standardise each feature column. Note: the scaler here is fit on every row
# of X (no train/test distinction has been made at this point).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
# Step 6: Train/Test Split, then scale
# Split the *unscaled* features first and fit the scaler on the training rows
# only, so that the mean/std of the held-out rows never influence the
# preprocessing applied to the training data (avoids train/test leakage).
# test_size and random_state are unchanged from the previous split call.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn scaling from training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics

In [6]:
# Step 7: Model Training
# Random forest with 100 trees; the fixed random_state keeps runs repeatable.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [10]:
def accuracy_score(y_true, y_pred):
    """
    Computes the accuracy of predictions.

    Accuracy is the fraction of positions at which ``y_pred`` equals
    ``y_true``.

    Args:
        y_true (array-like): True class labels.
        y_pred (array-like): Predicted class labels; must have the same
            shape as ``y_true``.

    Returns:
        float: Accuracy (0.0 to 1.0)

    Raises:
        ValueError: If the two inputs differ in shape, or if they are empty
            (accuracy is undefined when there are no samples).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Check shapes explicitly: NumPy would otherwise broadcast e.g. (n,)
    # against (1,) or (n, 1) and silently produce a wrong match count.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # Guard against 0 / 0, which would yield NaN plus a runtime warning.
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    # Mean of the boolean match mask == matches / total.
    return float(np.mean(y_true == y_pred))

In [11]:
# Step 8: Evaluation
# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"✅ Accuracy: {acc:.4f}")
print("\n📊 Classification Report:")
# target_names maps the integer codes back to the original class labels.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=le.classes_,
    )
)

✅ Accuracy: 1.0000

📊 Classification Report:
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        61
        COAD       1.00      1.00      1.00        17
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        29
        PRAD       1.00      1.00      1.00        29

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [12]:
import pickle

# Save model to file
with open("rf_model.pkl", "wb") as file:
    pickle.dump(model, file)

# Also persist the fitted preprocessing objects. The classifier was trained
# on scaled features and integer-encoded labels, so reusing it later requires
# the same scaler (to transform new inputs) and the same label encoder (to
# turn predicted codes back into class names). rf_model.pkl is unchanged.
with open("rf_preprocessing.pkl", "wb") as file:
    pickle.dump({"scaler": scaler, "label_encoder": le}, file)


In [None]:
# Restore the classifier from rf_model.pkl using pickle.
# (Unpickling executes code from the file, so only load files you trust.)
with open("rf_model.pkl", mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

# Alternative loader -- joblib is not imported in this notebook, so add
# `import joblib` before enabling this line.
# loaded_model = joblib.load("rf_model.pkl")
