<h1 style="color: slateblue; font-weight: bold;">Loading Datasets</h1>

In [1]:
import pandas as pd

# Load the datasets
dataset = pd.read_csv('GSE19728-new.csv')
de_genes = pd.read_csv('GSE19728.top.table.csv')

In [2]:
dataset.head()

Unnamed: 0,Genes,Gene Symbol,"normal tissue,biological rep1","astrocytomas at T1, biological rep1","astrocytomas at T1, biological rep2","astrocytomas at T2, biological rep1","astrocytomas at T2, biological rep2","astrocytomas at T2, biological rep3","astrocytomas at T2, biological rep4","astrocytomas at T2, biological rep5",...,"astrocytomas at T3, biological rep4","astrocytomas at T3, biological rep5","astrocytomas at T4, biological rep1","astrocytomas at T4, biological rep2","astrocytomas at T4, biological rep3","astrocytomas at T4, biological rep4","astrocytomas at T4, biological rep5","normal tissue,biological rep2","normal tissue,biological rep3","normal tissue,biological rep4"
0,1007_s_at,F2R,3303.43,2332.07,3711.45,9253.7,9359.25,7400.0,11298.5,8025.2,...,4301.57,9674.64,8324.0,8352.96,12445.7,5208.74,8228.82,1991.01,2510.57,3946.86
1,1053_at,PCDHGA4,338.685,64.0695,325.813,365.121,418.165,465.377,441.618,440.763,...,1040.19,1444.76,712.826,598.399,831.893,756.772,1158.8,451.728,450.136,474.502
2,117_at,MAML2,263.754,120.685,571.833,460.261,194.951,209.426,401.461,226.531,...,522.447,234.893,572.661,246.794,170.84,939.708,982.194,26.254,65.0751,86.8124
3,121_at,TPM3,1957.87,1799.67,1247.81,2053.51,1449.89,1287.38,1546.56,960.5,...,916.962,862.08,1597.84,990.33,1125.93,990.765,789.577,590.298,716.527,530.792
4,1255_g_at,FKBP10,1957.87,584.204,273.609,140.218,18.0237,45.1121,96.5229,108.148,...,229.527,95.6581,138.582,38.1301,88.031,88.9213,35.0716,470.547,414.285,199.338


In [3]:
de_genes.head()

Unnamed: 0,ID,adj.P.Val,P.Value,F,Gene.symbol
0,203989_x_at,2e-06,3.77e-11,68.64989,F2R
1,1552735_at,7.7e-05,3.4e-09,41.1047,PCDHGA4
2,235106_at,7.7e-05,4.21e-09,40.09304,MAML2
3,224164_at,0.000149,1.54e-08,34.39294,TPM3
4,219249_s_at,0.000149,1.59e-08,34.25629,FKBP10


<h1 style="color: slateblue; font-weight: bold;">Filtering DEG Based on p.values</h1>

In [4]:
# Filter the DE genes dataset for adjusted P-values less than 0.05
deg_filtered = de_genes[de_genes['adj.P.Val'] < 0.01]

# Display the filtered dataset
deg_filtered.head()


Unnamed: 0,ID,adj.P.Val,P.Value,F,Gene.symbol
0,203989_x_at,2e-06,3.77e-11,68.64989,F2R
1,1552735_at,7.7e-05,3.4e-09,41.1047,PCDHGA4
2,235106_at,7.7e-05,4.21e-09,40.09304,MAML2
3,224164_at,0.000149,1.54e-08,34.39294,TPM3
4,219249_s_at,0.000149,1.59e-08,34.25629,FKBP10


In [5]:
# Correct the column name issue and filter the original dataset
deg_filtered.columns = deg_filtered.columns.str.strip()  # Remove leading/trailing spaces
top_gene = deg_filtered['Gene.symbol'].tolist()

# Filter the original dataset
filtered_dataset = dataset[dataset['Gene Symbol'].isin(top_gene)]

# Display the first few rows of the filtered original dataset
filtered_dataset


Unnamed: 0,Genes,Gene Symbol,"normal tissue,biological rep1","astrocytomas at T1, biological rep1","astrocytomas at T1, biological rep2","astrocytomas at T2, biological rep1","astrocytomas at T2, biological rep2","astrocytomas at T2, biological rep3","astrocytomas at T2, biological rep4","astrocytomas at T2, biological rep5",...,"astrocytomas at T3, biological rep4","astrocytomas at T3, biological rep5","astrocytomas at T4, biological rep1","astrocytomas at T4, biological rep2","astrocytomas at T4, biological rep3","astrocytomas at T4, biological rep4","astrocytomas at T4, biological rep5","normal tissue,biological rep2","normal tissue,biological rep3","normal tissue,biological rep4"
0,1007_s_at,F2R,3303.430,2332.0700,3711.4500,9253.7000,9359.2500,7400.0000,11298.5000,8025.2000,...,4301.5700,9674.6400,8324.0000,8352.9600,12445.7000,5208.74000,8228.8200,1991.0100,2510.5700,3946.86000
1,1053_at,PCDHGA4,338.685,64.0695,325.8130,365.1210,418.1650,465.3770,441.6180,440.7630,...,1040.1900,1444.7600,712.8260,598.3990,831.8930,756.77200,1158.8000,451.7280,450.1360,474.50200
2,117_at,MAML2,263.754,120.6850,571.8330,460.2610,194.9510,209.4260,401.4610,226.5310,...,522.4470,234.8930,572.6610,246.7940,170.8400,939.70800,982.1940,26.2540,65.0751,86.81240
3,121_at,TPM3,1957.870,1799.6700,1247.8100,2053.5100,1449.8900,1287.3800,1546.5600,960.5000,...,916.9620,862.0800,1597.8400,990.3300,1125.9300,990.76500,789.5770,590.2980,716.5270,530.79200
4,1255_g_at,FKBP10,1957.870,584.2040,273.6090,140.2180,18.0237,45.1121,96.5229,108.1480,...,229.5270,95.6581,138.5820,38.1301,88.0310,88.92130,35.0716,470.5470,414.2850,199.33800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54660,AFFX-r2-Ec-bioB-3_at,,754.300,1216.5900,1098.6600,1009.5400,1108.3100,853.9880,839.5230,979.8460,...,1019.1600,828.4490,916.7640,1106.0300,1006.1400,929.28900,640.6590,1769.7400,2371.0500,1826.41000
54666,AFFX-r2-Ec-bioD-5_at,,39415.100,48568.1000,53138.6000,38294.3000,51993.1000,51151.1000,44067.3000,54243.8000,...,50531.4000,42274.7000,42710.5000,55286.0000,47368.4000,48047.30000,39951.9000,67649.1000,81090.3000,67902.60000
54672,AFFX-TrpnX-3_at,,124.010,41.3394,11.2514,26.3329,21.5149,51.2646,20.3047,87.8626,...,28.0181,21.2736,54.4629,81.4983,59.6905,9.67306,29.0763,17.6909,34.6806,5.43828
54674,AFFX-TrpnX-M_at,,,,,,,,,,...,,,,,,,,,,


In [6]:
filtered_dataset = filtered_dataset.dropna()

filtered_dataset

Unnamed: 0,Genes,Gene Symbol,"normal tissue,biological rep1","astrocytomas at T1, biological rep1","astrocytomas at T1, biological rep2","astrocytomas at T2, biological rep1","astrocytomas at T2, biological rep2","astrocytomas at T2, biological rep3","astrocytomas at T2, biological rep4","astrocytomas at T2, biological rep5",...,"astrocytomas at T3, biological rep4","astrocytomas at T3, biological rep5","astrocytomas at T4, biological rep1","astrocytomas at T4, biological rep2","astrocytomas at T4, biological rep3","astrocytomas at T4, biological rep4","astrocytomas at T4, biological rep5","normal tissue,biological rep2","normal tissue,biological rep3","normal tissue,biological rep4"
0,1007_s_at,F2R,3303.430,2332.0700,3711.450,9253.700,9359.2500,7400.0000,11298.5000,8025.200,...,4301.570,9674.6400,8324.000,8352.9600,12445.700,5208.7400,8228.8200,1991.0100,2510.5700,3946.8600
1,1053_at,PCDHGA4,338.685,64.0695,325.813,365.121,418.1650,465.3770,441.6180,440.763,...,1040.190,1444.7600,712.826,598.3990,831.893,756.7720,1158.8000,451.7280,450.1360,474.5020
2,117_at,MAML2,263.754,120.6850,571.833,460.261,194.9510,209.4260,401.4610,226.531,...,522.447,234.8930,572.661,246.7940,170.840,939.7080,982.1940,26.2540,65.0751,86.8124
3,121_at,TPM3,1957.870,1799.6700,1247.810,2053.510,1449.8900,1287.3800,1546.5600,960.500,...,916.962,862.0800,1597.840,990.3300,1125.930,990.7650,789.5770,590.2980,716.5270,530.7920
4,1255_g_at,FKBP10,1957.870,584.2040,273.609,140.218,18.0237,45.1121,96.5229,108.148,...,229.527,95.6581,138.582,38.1301,88.031,88.9213,35.0716,470.5470,414.2850,199.3380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54427,47083_at,FOXA2,229.278,528.7170,211.137,274.448,430.3630,402.9870,312.6290,301.142,...,316.636,559.0800,343.637,307.0480,311.420,461.2230,406.6200,276.8410,240.3010,170.3750
54439,48117_at,TWF1,282.352,63.8753,340.751,1339.190,712.7770,808.7100,950.6820,634.539,...,869.921,1086.2400,616.220,772.3780,461.948,1158.8900,658.3050,419.8710,416.7310,472.0650
54478,52285_f_at,WSB1,402.834,269.9970,429.944,474.214,215.5850,349.7430,412.1240,131.463,...,282.485,216.6240,449.679,174.4150,196.926,335.8490,357.8540,167.8790,124.1870,152.8850
54524,58367_s_at,GATA3,1043.520,1111.2100,612.747,788.786,922.1620,900.0270,1745.0000,899.432,...,875.620,1816.5200,718.516,959.5170,809.563,1115.3500,1179.0700,927.4260,703.1670,833.9700


In [7]:
# Mapping the grades to the column names
grade_mapping = {
    'astrocytomas at T1': 'Grade I',
    'astrocytomas at T2': 'Grade II',
    'astrocytomas at T3': 'Grade III',
    'astrocytomas at T4': 'Grade IV',
    'normal tissue': 'Normal'
}

# Applying the mapping to the column names
for original, grade in grade_mapping.items():
    filtered_dataset.columns = filtered_dataset.columns.str.replace(original, grade)

# Display the first few rows after renaming
filtered_dataset.head()


Unnamed: 0,Genes,Gene Symbol,"Normal,biological rep1","Grade I, biological rep1","Grade I, biological rep2","Grade II, biological rep1","Grade II, biological rep2","Grade II, biological rep3","Grade II, biological rep4","Grade II, biological rep5",...,"Grade III, biological rep4","Grade III, biological rep5","Grade IV, biological rep1","Grade IV, biological rep2","Grade IV, biological rep3","Grade IV, biological rep4","Grade IV, biological rep5","Normal,biological rep2","Normal,biological rep3","Normal,biological rep4"
0,1007_s_at,F2R,3303.43,2332.07,3711.45,9253.7,9359.25,7400.0,11298.5,8025.2,...,4301.57,9674.64,8324.0,8352.96,12445.7,5208.74,8228.82,1991.01,2510.57,3946.86
1,1053_at,PCDHGA4,338.685,64.0695,325.813,365.121,418.165,465.377,441.618,440.763,...,1040.19,1444.76,712.826,598.399,831.893,756.772,1158.8,451.728,450.136,474.502
2,117_at,MAML2,263.754,120.685,571.833,460.261,194.951,209.426,401.461,226.531,...,522.447,234.893,572.661,246.794,170.84,939.708,982.194,26.254,65.0751,86.8124
3,121_at,TPM3,1957.87,1799.67,1247.81,2053.51,1449.89,1287.38,1546.56,960.5,...,916.962,862.08,1597.84,990.33,1125.93,990.765,789.577,590.298,716.527,530.792
4,1255_g_at,FKBP10,1957.87,584.204,273.609,140.218,18.0237,45.1121,96.5229,108.148,...,229.527,95.6581,138.582,38.1301,88.031,88.9213,35.0716,470.547,414.285,199.338


In [8]:
# Define column renaming mapping based on the provided grouping and grading
column_mapping = {
    **{col: f"Grade I_{col.split(', ')[1]}" for col in filtered_dataset.columns if "T1" in col},
    **{col: f"Grade II_{col.split(', ')[1]}" for col in filtered_dataset.columns if "T2" in col},
    **{col: f"Grade III_{col.split(', ')[1]}" for col in filtered_dataset.columns if "T3" in col},
    **{col: f"Grade IV_{col.split(', ')[1]}" for col in filtered_dataset.columns if "T4" in col},
    **{col: f"Normal_{col.split(', ')[1]}" for col in filtered_dataset.columns if "normal" in col}
}

# Apply the column renaming
graded_dataset = filtered_dataset.rename(columns=column_mapping)

# Display the first few rows of the newly graded dataset
graded_dataset.head()


Unnamed: 0,Genes,Gene Symbol,"Normal,biological rep1","Grade I, biological rep1","Grade I, biological rep2","Grade II, biological rep1","Grade II, biological rep2","Grade II, biological rep3","Grade II, biological rep4","Grade II, biological rep5",...,"Grade III, biological rep4","Grade III, biological rep5","Grade IV, biological rep1","Grade IV, biological rep2","Grade IV, biological rep3","Grade IV, biological rep4","Grade IV, biological rep5","Normal,biological rep2","Normal,biological rep3","Normal,biological rep4"
0,1007_s_at,F2R,3303.43,2332.07,3711.45,9253.7,9359.25,7400.0,11298.5,8025.2,...,4301.57,9674.64,8324.0,8352.96,12445.7,5208.74,8228.82,1991.01,2510.57,3946.86
1,1053_at,PCDHGA4,338.685,64.0695,325.813,365.121,418.165,465.377,441.618,440.763,...,1040.19,1444.76,712.826,598.399,831.893,756.772,1158.8,451.728,450.136,474.502
2,117_at,MAML2,263.754,120.685,571.833,460.261,194.951,209.426,401.461,226.531,...,522.447,234.893,572.661,246.794,170.84,939.708,982.194,26.254,65.0751,86.8124
3,121_at,TPM3,1957.87,1799.67,1247.81,2053.51,1449.89,1287.38,1546.56,960.5,...,916.962,862.08,1597.84,990.33,1125.93,990.765,789.577,590.298,716.527,530.792
4,1255_g_at,FKBP10,1957.87,584.204,273.609,140.218,18.0237,45.1121,96.5229,108.148,...,229.527,95.6581,138.582,38.1301,88.031,88.9213,35.0716,470.547,414.285,199.338


In [9]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Prepare the data by averaging the replicates for each grade
grades = ['Grade I', 'Grade II', 'Grade III', 'Grade IV', 'Normal']
for grade in grades:
    replicate_columns = [col for col in graded_dataset.columns if grade in col]
    graded_dataset[grade] = graded_dataset[replicate_columns].mean(axis=1)

# Create features and labels for binary classification
X = graded_dataset[grades].values  # Features
y = (graded_dataset['Grade II'] > 0).astype(int)  # Binary labels where Grade II is 1, others are 0

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
# Calculate the maximum grade value per sample and the corresponding index (grade)
max_grades = graded_dataset[['Grade I', 'Grade II', 'Grade III', 'Grade IV']].idxmax(axis=1)
grade_mapping = {'Grade I': 1, 'Grade II': 2, 'Grade III': 3, 'Grade IV': 4}

# Map the grade names to numeric labels
y_multiclass_corrected = max_grades.map(grade_mapping).values

# Verify the corrected label distribution again
corrected_label_distribution = pd.Series(y_multiclass_corrected).value_counts()
corrected_label_distribution


4    1456
2    1181
3    1150
1    1045
Name: count, dtype: int64

<h1 style="color: slateblue; font-weight: bold;">Random Forest</h1>

In [11]:
from sklearn.metrics import classification_report

X_train_mc_corrected, X_test_mc_corrected, y_train_mc_corrected, y_test_mc_corrected = train_test_split(
    X, y_multiclass_corrected, test_size=0.2, random_state=42, stratify=y_multiclass_corrected)


rf_classifier_mc_corrected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_mc_corrected.fit(X_train_mc_corrected, y_train_mc_corrected)
y_pred_mc_corrected = rf_classifier_mc_corrected.predict(X_test_mc_corrected)
accuracy_mc_corrected = accuracy_score(y_test_mc_corrected, y_pred_mc_corrected)

cv_scores_mc_corrected = cross_val_score(rf_classifier_mc_corrected, X, y_multiclass_corrected, cv=5)

# Correct the target names for classification report (excluding 'Normal' as it's not a label in this classification)
report_corrected = classification_report(
    y_test_mc_corrected,
    y_pred_mc_corrected,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for the corrected labels
accuracy_mc_corrected, cv_scores_mc_corrected, np.mean(cv_scores_mc_corrected), report_corrected


(0.8459152016546019,
 array([0.85108583, 0.83557394, 0.83022774, 0.78985507, 0.85300207]),
 0.8319489317241218,
 '              precision    recall  f1-score   support\n\n     Grade I       0.82      0.75      0.79       209\n    Grade II       0.80      0.82      0.81       236\n   Grade III       0.88      0.85      0.87       230\n    Grade IV       0.87      0.93      0.90       292\n\n    accuracy                           0.85       967\n   macro avg       0.84      0.84      0.84       967\nweighted avg       0.85      0.85      0.85       967\n')

<h1 style="color: slateblue; font-weight: bold;">CNB</h1>

In [13]:
from sklearn.naive_bayes import ComplementNB

# Initialize and train a Complement Naive Bayes classifier for multiclass classification
cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train_mc_corrected, y_train_mc_corrected)

# Predict labels and calculate accuracy
y_pred_cnb = cnb_classifier.predict(X_test_mc_corrected)
accuracy_cnb = accuracy_score(y_test_mc_corrected, y_pred_cnb)

# Perform cross-validation to evaluate the model
cv_scores_cnb = cross_val_score(cnb_classifier, X, y_multiclass_corrected, cv=5)

# Generate classification report for CNB classifier
report_cnb = classification_report(
    y_test_mc_corrected,
    y_pred_cnb,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for CNB classifier
accuracy_cnb, cv_scores_cnb, np.mean(cv_scores_cnb), report_cnb



(0.5470527404343329,
 array([0.5139607 , 0.59565667, 0.53726708, 0.5952381 , 0.5610766 ]),
 0.5606398307715695,
 '              precision    recall  f1-score   support\n\n     Grade I       0.42      0.46      0.44       209\n    Grade II       0.73      0.15      0.25       236\n   Grade III       0.63      0.64      0.64       230\n    Grade IV       0.55      0.86      0.67       292\n\n    accuracy                           0.55       967\n   macro avg       0.58      0.53      0.50       967\nweighted avg       0.58      0.55      0.51       967\n')

<h1 style="color: slateblue; font-weight: bold;">ANN</h1>

In [14]:
from sklearn.neural_network import MLPClassifier

# Initialize and train an Artificial Neural Network (ANN) classifier for multiclass classification
ann_classifier = MLPClassifier(random_state=42)
ann_classifier.fit(X_train_mc_corrected, y_train_mc_corrected)

# Predict labels and calculate accuracy
y_pred_ann = ann_classifier.predict(X_test_mc_corrected)
accuracy_ann = accuracy_score(y_test_mc_corrected, y_pred_ann)

# Perform cross-validation to evaluate the model
cv_scores_ann = cross_val_score(ann_classifier, X, y_multiclass_corrected, cv=5)

# Generate classification report for ANN classifier
report_ann = classification_report(
    y_test_mc_corrected,
    y_pred_ann,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for ANN classifier
accuracy_ann, cv_scores_ann, np.mean(cv_scores_ann), report_ann


(0.7838676318510859,
 array([0.91830403, 0.73836608, 0.82298137, 0.73291925, 0.89337474]),
 0.8211890952145439,
 '              precision    recall  f1-score   support\n\n     Grade I       0.98      0.23      0.37       209\n    Grade II       0.77      0.86      0.81       236\n   Grade III       0.82      0.94      0.87       230\n    Grade IV       0.75      1.00      0.86       292\n\n    accuracy                           0.78       967\n   macro avg       0.83      0.76      0.73       967\nweighted avg       0.82      0.78      0.74       967\n')

In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for grid search
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Initialize the ANN classifier
ann_classifier_tuned = MLPClassifier(random_state=42)

# Perform grid search to find the best parameters
grid_search = GridSearchCV(estimator=ann_classifier_tuned, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_mc_corrected, y_train_mc_corrected)

# Print the best parameters found by grid search
best_params = grid_search.best_params_
best_params



Fitting 5 folds for each of 108 candidates, totalling 540 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


{'activation': 'relu',
 'alpha': 0.01,
 'hidden_layer_sizes': (300,),
 'learning_rate': 'constant',
 'solver': 'lbfgs'}

In [16]:
# Use the best parameters obtained from grid search to train the ANN model
best_ann_classifier = MLPClassifier(random_state=42, **best_params)
best_ann_classifier.fit(X_train_mc_corrected, y_train_mc_corrected)

# Predict labels and calculate accuracy
y_pred_best_ann = best_ann_classifier.predict(X_test_mc_corrected)
accuracy_best_ann = accuracy_score(y_test_mc_corrected, y_pred_best_ann)

# Perform cross-validation to evaluate the model
cv_scores_best_ann = cross_val_score(best_ann_classifier, X, y_multiclass_corrected, cv=5)

# Generate classification report for ANN classifier with best parameters
report_best_ann = classification_report(
    y_test_mc_corrected,
    y_pred_best_ann,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for ANN classifier with best parameters
accuracy_best_ann, cv_scores_best_ann, np.mean(cv_scores_best_ann), report_best_ann


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(0.9689762150982419,
 array([0.97001034, 0.97207859, 0.97929607, 0.98136646, 0.98240166]),
 0.9770306234089337,
 '              precision    recall  f1-score   support\n\n     Grade I       0.96      0.97      0.96       209\n    Grade II       0.98      0.97      0.97       236\n   Grade III       0.99      0.94      0.96       230\n    Grade IV       0.96      0.99      0.97       292\n\n    accuracy                           0.97       967\n   macro avg       0.97      0.97      0.97       967\nweighted avg       0.97      0.97      0.97       967\n')

In [17]:
import joblib

# Save the trained model
joblib.dump(best_ann_classifier, 'best_ann_classifier_model.pkl')


['best_ann_classifier_model.pkl']

In [34]:
import joblib
import numpy as np

# Load the trained model
best_ann_classifier = joblib.load('best_ann_classifier_model.pkl')

# Define a function to predict cancer grade
def predict_cancer_grade(gene_expression_values):
    # Preprocess the input data if necessary (scaling, normalization)
    # Ensure gene_expression_values is in the correct format (numpy array or pandas DataFrame)
    input_data = np.array([gene_expression_values])  # Example: Convert input to numpy array
    
    # Predict cancer grade
    predicted_grade = best_ann_classifier.predict(input_data)
    
    return predicted_grade[0]  # Assuming single prediction

# Example usage:
gene_values = [290.955, 576.87, 252.19, 139.017, 163.188]  # Example gene expression values
predicted_grade = predict_cancer_grade(gene_values)
print(f"Predicted Cancer Grade: {predicted_grade}")


Predicted Cancer Grade: 2


<h1 style="color: slateblue; font-weight: bold;">SVM</h1>

In [35]:
from sklearn.svm import SVC

# Initialize and train a Support Vector Machine classifier for multiclass classification
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_mc_corrected, y_train_mc_corrected)

# Predict labels and calculate accuracy
y_pred_svm = svm_classifier.predict(X_test_mc_corrected)
accuracy_svm = accuracy_score(y_test_mc_corrected, y_pred_svm)

# Perform cross-validation to evaluate the model
cv_scores_svm = cross_val_score(svm_classifier, X, y_multiclass_corrected, cv=5)

# Generate classification report for SVM classifier
report_svm = classification_report(
    y_test_mc_corrected,
    y_pred_svm,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for SVM classifier
accuracy_svm, cv_scores_svm, np.mean(cv_scores_svm), report_svm


(0.9917269906928645,
 array([0.99689762, 0.98655636, 0.99482402, 0.99482402, 0.98861284]),
 0.9923429701901891,
 '              precision    recall  f1-score   support\n\n     Grade I       1.00      0.99      0.99       209\n    Grade II       1.00      0.99      0.99       236\n   Grade III       1.00      0.99      0.99       230\n    Grade IV       0.98      1.00      0.99       292\n\n    accuracy                           0.99       967\n   macro avg       0.99      0.99      0.99       967\nweighted avg       0.99      0.99      0.99       967\n')

In [36]:
import joblib

# Save the trained model
joblib.dump(svm_classifier, 'svm_classifier_model.pkl')


['svm_classifier_model.pkl']

In [37]:
import joblib
import numpy as np

# Load the trained model
svm_classifier = joblib.load('svm_classifier_model.pkl')

# Define a function to predict cancer grade
def predict_cancer_grade(gene_expression_values):
    # Preprocess the input data if necessary (scaling, normalization)
    # Ensure gene_expression_values is in the correct format (numpy array or pandas DataFrame)
    input_data = np.array([gene_expression_values])  # Example: Convert input to numpy array
    
    # Predict cancer grade
    predicted_grade = svm_classifier.predict(input_data)
    
    return predicted_grade[0]  # Assuming single prediction

# Example usage:
gene_values = [290.955, 576.87, 252.19, 139.017, 163.188]  # Example gene expression values
predicted_grade = predict_cancer_grade(gene_values)
print(f"Predicted Cancer Grade: {predicted_grade}")


Predicted Cancer Grade: 2


<h1 style="color: slateblue; font-weight: bold;">KNN</h1>

In [39]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train a k-Nearest Neighbors classifier for multiclass classification
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_mc_corrected, y_train_mc_corrected)

# Predict labels and calculate accuracy
y_pred_knn = knn_classifier.predict(X_test_mc_corrected)
accuracy_knn = accuracy_score(y_test_mc_corrected, y_pred_knn)

# Perform cross-validation to evaluate the model
cv_scores_knn = cross_val_score(knn_classifier, X, y_multiclass_corrected, cv=5)

# Generate classification report for KNN classifier
report_knn = classification_report(
    y_test_mc_corrected,
    y_pred_knn,
    target_names=['Grade I', 'Grade II', 'Grade III', 'Grade IV']
)

# Show accuracy, cross-validation results, and classification report for KNN classifier
accuracy_knn, cv_scores_knn, np.mean(cv_scores_knn), report_knn


(0.7280248190279214,
 array([0.70630817, 0.70527404, 0.7194617 , 0.7194617 , 0.71014493]),
 0.7121301072022712,
 '              precision    recall  f1-score   support\n\n     Grade I       0.59      0.78      0.67       209\n    Grade II       0.63      0.63      0.63       236\n   Grade III       0.82      0.71      0.76       230\n    Grade IV       0.89      0.79      0.83       292\n\n    accuracy                           0.73       967\n   macro avg       0.73      0.73      0.72       967\nweighted avg       0.75      0.73      0.73       967\n')

<h1 style="color: slateblue; font-weight: bold;">Model Comparison</h1>

In [40]:
# List of classifiers and their names
classifiers = [
    ('SVM', svm_classifier),
    ('CNB', cnb_classifier),
    ('ANN', best_ann_classifier),
    ('KNN', knn_classifier),
    ('Random Forest', rf_classifier_mc_corrected)
]

# Dictionary to store accuracies
accuracies = {}

# Creating a comparative table for accuracies
accuracy_table = pd.DataFrame(accuracies).transpose()


accuracy_table


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Unnamed: 0,Accuracy Mean,Accuracy Std
SVM,0.992343,0.004011
CNB,0.56064,0.032089
ANN,0.977031,0.005032
KNN,0.71213,0.006202
Random Forest,0.831949,0.022792
