# Klasifikasi pasien dengan penyakit Glioma (Tumor Otak): LGG (Glioma Tingkat Rendah) atau GBM (Glioblastoma Multiforme)


In [None]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# model knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# model random forest
from sklearn.ensemble import RandomForestClassifier




In [None]:
# Load the dataset from a CSV file given by a relative path, then print the
# frame summary produced by DataFrame.info().
cgga_df=pd.read_csv("TCGA_InfoWithGrade.csv")
cgga_df.info()

In [None]:
# Preview 10 randomly drawn rows (left as the cell's final expression so the
# notebook renders the result).
cgga_df.sample(n=10)

## Preprocessing

In [None]:
# Count the missing values in each column.
cgga_df.isna().sum()

In [None]:
def min_max_normalize(column):
    """Rescale *column* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: Numeric pandas Series or NumPy array (anything exposing
            ``min()``/``max()`` and element-wise arithmetic).

    Returns:
        An object shaped like *column* holding
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero, and zeros are returned instead of the NaN values that
        a 0/0 division would produce.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Constant column: multiplying by 0.0 yields float zeros while
        # keeping the original index and any NaN entries.
        return (column - lowest) * 0.0
    return (column - lowest) / value_range

# Build a new frame from the raw data in which only 'Age_at_diagnosis' is
# replaced by its min-max normalised values; all other columns are carried
# over unchanged and the raw frame itself is not modified.
normalisasi = cgga_df.assign(
    Age_at_diagnosis=min_max_normalize(cgga_df['Age_at_diagnosis'])
)

print("Data setelah normalisasi :")
normalisasi


In [None]:
# Outlier removal with Local Outlier Factor (LOF).
# The detector is fitted on the numeric feature columns only: the target
# column 'Grade' is left out so the label cannot influence which rows are
# discarded. The LOF helper columns are excluded as well (errors='ignore'
# covers the first run, when they do not exist yet), which keeps the cell
# safe to re-run.
x = normalisasi.drop(
    ['Grade', 'LOF_Prediksi', 'LOF_Skor_Anomali'], axis=1, errors='ignore'
)
numerical_features = x.select_dtypes(include=['int64', 'float64'])
categorical_features = x.select_dtypes(include=['object'])
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.15)

# fit_predict labels inliers as 1 and outliers as -1.
y_outlier = lof.fit_predict(numerical_features)

# The more negative the score, the more anomalous the row.
anomaly_scores = lof.negative_outlier_factor_

normalisasi['LOF_Prediksi'] = y_outlier
normalisasi['LOF_Skor_Anomali'] = anomaly_scores

# Keep the rows that were not flagged, then drop the helper columns again.
data_bersih = normalisasi[normalisasi['LOF_Prediksi'] != -1]
data_bersih = data_bersih.drop(['LOF_Prediksi', 'LOF_Skor_Anomali'], axis=1)

data_bersih


## Split Data

In [None]:
# Feature matrix: the 23 listed columns of the cleaned data; target: 'Grade'.
X = data_bersih.loc[:, [
    'Age_at_diagnosis', 'Gender', 'Race',
    'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR',
    'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1',
    'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3',
    'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA',
]]
y = data_bersih['Grade']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

display("Data Train X", X_train)
display("Data Train y", y_train.to_frame())
display("Data Test X", X_test)
display("Data test y", y_test.to_frame())


### Modelling Logistic Regression (50 epochs)

In [None]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of *z*, element-wise.

    The value is computed from ``exp(-|z|)``, which never exceeds 1, so large
    negative inputs cannot overflow ``exp`` (the naive formula evaluates
    ``exp(-z)`` directly and triggers an overflow RuntimeWarning there).

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, otherwise an array shaped like *z*,
        with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # Both expressions are algebraically 1 / (1 + exp(-z)); each one is
    # selected only on the side where its exponent is non-positive.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays
    # as they are.
    return result[()]

def predict(X, weights):
    """Return the sigmoid of the dot product of *X* and *weights*.

    The dot product gives one linear score per row of *X* (or a single score
    for a 1-D sample); ``sigmoid`` maps the scores to probabilities.
    """
    linear_scores = np.dot(X, weights)
    return sigmoid(linear_scores)

def predict_labels(X, weights, threshold=0.5):
    """Turn the probabilities from ``predict`` into hard 0/1 labels.

    An entry becomes 1 when its probability is greater than or equal to
    *threshold* and 0 otherwise.
    """
    is_positive = predict(X, weights) >= threshold
    return is_positive.astype(int)

def update_weights(X, y, weights, learning_rate):
    """Run one pass of per-sample gradient updates over the data.

    For every position ``i`` in ``range(len(y))`` the prediction for the
    flattened sample ``X[i]`` is compared with ``y[i]``, and *weights* is
    moved against that error scaled by *learning_rate*. The update uses
    ``-=`` on the object passed in (in place for a NumPy array), and that
    same object is returned.
    """
    n_samples = len(y)
    for position in range(n_samples):
        features = X[position].reshape(-1)  # flatten the sample to 1-D
        residual = predict(features, weights) - y[position]
        step = learning_rate * residual * features
        weights -= step
    return weights

def logistic_regression_sgd(X, y, learning_rate, epochs, verbose=False):
    """Fit logistic-regression weights with per-sample gradient updates.

    Args:
        X: 2-D NumPy array of features, one row per sample; ``X.shape[1]``
            determines the number of weights.
        y: Labels, indexable by row position.
        learning_rate: Step size handed to ``update_weights``.
        epochs: Number of full passes over the data.
        verbose: When true, print the training accuracy and the current
            weights after every epoch. Off by default, in which case no
            accuracy is computed during training.

    Returns:
        The weight vector after the final epoch.
    """
    # NOTE: this reseeds NumPy's global random generator, so anything run
    # afterwards that draws from the global generator is affected as well.
    np.random.seed(42)
    weights = np.random.rand(X.shape[1]) * 0.01  # small values in [0, 0.01)

    for epoch in range(epochs):
        weights = update_weights(X, y, weights, learning_rate)
        if verbose:
            # Only pay for the extra prediction pass when it is reported.
            accuracy = calculate_accuracy(X, y, weights)
            print(f"Epoch {epoch + 1}/{epochs}, Accuracy: {accuracy * 100:.2f}%, Weights: {weights}")

    return weights

def calculate_accuracy(X, y, weights):
    """Return the fraction of entries where the predicted label equals *y*.

    Labels come from ``predict_labels`` with its default threshold.
    """
    matches = predict_labels(X, weights) == y
    return np.mean(matches)

# Prepend a column of ones to both feature matrices so that the first weight
# plays the role of the intercept. NOTE: running this cell twice adds the
# column twice.
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Flatten the targets into 1-D NumPy arrays so they can be indexed by position.
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()

# Train for 50 epochs with a step size of 0.01.
learning_rate50 = 0.01
epochs50 = 50
weights = logistic_regression_sgd(
    X_train,
    y_train,
    learning_rate50,
    epochs50,
)

In [None]:
# Predict on the held-out test split and build the confusion matrix.
y_pred_test = predict_labels(X_test, weights)
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Accuracy on both splits. The test figure is the one comparable with the
# test accuracy reported by the Random Forest and KNN cells.
train_accuracy50 = calculate_accuracy(X_train, y_train, weights)
test_accuracy50 = calculate_accuracy(X_test, y_test, weights)

# Replace label 0 with "LGG" and label 1 with "GBM".
label_mapping = {0: "LGG", 1: "GBM"}
y_test_mapped = pd.Series(y_test).map(label_mapping)
y_pred_test_mapped = pd.Series(y_pred_test).map(label_mapping)

# Plot the confusion matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['LGG', 'GBM'], yticklabels=['LGG', 'GBM'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Classification report, shown as a table.
report = classification_report(y_test, y_pred_test, target_names=['LGG', 'GBM'], output_dict=True)
classification_df = pd.DataFrame(report).transpose()
print("\nClassification Report:")
display(classification_df)

# Model accuracy on the training data.
print("\nAkurasi model pada data training:")
print(f"{train_accuracy50 * 100:.2f}%")

# Model accuracy on the test data.
print("\nAkurasi model pada data testing:")
print(f"{test_accuracy50 * 100:.2f}%")

# Show 10 classified samples in a table.
print("\n10 Sampel Data Klasifikasi:")
classification_results = pd.DataFrame({
    'True Label': y_test_mapped,
    'Predicted Label': y_pred_test_mapped
})

# Draw 10 rows at random and display them.
sample_results = classification_results.sample(10)
display(sample_results)


## Modelling Random Forest

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialise and train the Random Forest model.
# random_state pins the forest's randomness so the reported scores and the
# saved model do not depend on global RNG state or on cell execution order.
# NOTE(review): when the cells run in order, X_train here already carries the
# leading column of ones added for the logistic regression - confirm that
# this is intended for the forest as well.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    max_features=10,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict and evaluate on the test split.
y_pred_randomforest = model.predict(X_test)
score_randomforest = model.score(X_test, y_test)
conf_matrix_randomforest = confusion_matrix(y_test, y_pred_randomforest)

# Replace label 0 with "LGG" and label 1 with "GBM".
label_mapping = {0: "LGG", 1: "GBM"}
y_test_mapped = pd.Series(y_test).map(label_mapping)
y_pred_randomforest_mapped = pd.Series(y_pred_randomforest).map(label_mapping)

# Plot the confusion matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_randomforest, annot=True, fmt='d', cmap='Blues', xticklabels=['LGG', 'GBM'], yticklabels=['LGG', 'GBM'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix (Random Forest)')
plt.show()

# Classification report, shown as a table.
report_randomforest = classification_report(y_test, y_pred_randomforest, target_names=['LGG', 'GBM'], output_dict=True)
classification_randomforest_df = pd.DataFrame(report_randomforest).transpose()
print("\nClassification Report:")
display(classification_randomforest_df)

# Model accuracy on the test data.
print("\nAkurasi model pada data testing:")
print(f"{score_randomforest * 100:.2f}%")

# Show 10 classified samples in a table.
print("\n10 Sampel Data Klasifikasi:")
classification_randomforest_results = pd.DataFrame({
    'True Label': y_test_mapped,
    'Predicted Label': y_pred_randomforest_mapped
})

# Draw 10 rows at random and display them.
sample_randomforest_results = classification_randomforest_results.sample(10)
display(sample_randomforest_results)


## Modelling KNN

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 10-nearest-neighbours classifier on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model_knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Predict on the test split, then derive the confusion matrix and accuracy.
y_pred_knn = model_knn.predict(X_test)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
score_knn = accuracy_score(y_test, y_pred_knn)

# Human-readable class names: 0 -> "LGG", 1 -> "GBM".
label_mapping = {0: "LGG", 1: "GBM"}
y_pred_knn_mapped = pd.Series(y_pred_knn).map(label_mapping)
y_test_mapped = pd.Series(y_test).map(label_mapping)

# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_knn,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['LGG', 'GBM'],
    yticklabels=['LGG', 'GBM'],
)
plt.gca().set(
    xlabel='Predicted Labels',
    ylabel='True Labels',
    title='Confusion Matrix (KNN)',
)
plt.show()

# Per-class precision/recall/F1, rendered as a table.
report_knn = classification_report(
    y_test,
    y_pred_knn,
    target_names=['LGG', 'GBM'],
    output_dict=True,
)
classification_knn_df = pd.DataFrame(report_knn).T
print("\nClassification Report:")
display(classification_knn_df)

# Accuracy on the test split.
print("\nAkurasi model pada data testing:")
print(f"{score_knn * 100:.2f}%")

# Pair true and predicted class names and show 10 randomly drawn rows.
print("\n10 Sampel Data Klasifikasi:")
classification_knn_results = pd.DataFrame({
    'True Label': y_test_mapped,
    'Predicted Label': y_pred_knn_mapped,
})
sample_knn_results = classification_knn_results.sample(10)
display(sample_knn_results)


## Compare 3 Models

In [None]:
# Rank the three models by accuracy on the same held-out test split.
# The logistic-regression entry is computed from its test predictions
# (y_pred_test) rather than its training accuracy, so all three scores are
# measured on the same data.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest Classifier', 'KNN Classifier'],
    'Score': [accuracy_score(y_test, y_pred_test), score_randomforest, score_knn],
})


models.sort_values(by='Score', ascending=False)

In [None]:
import pickle

In [None]:
# Persist the fitted Random Forest (`model`) to disk. The context manager
# closes the file handle even if pickling fails.
# NOTE: pickle files must only be loaded from trusted sources.
filename = 'model_klasifikasi_glioma.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)