In [74]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [75]:
# Import the data

df_full = pd.read_csv("/Users/parinazfathi/Documents/GitHub/ErdosFall2024ProteinProfiles/Combined_df2.csv")

proteins=df_full.columns[4:]

In [76]:
# Combine all the blood cancers into one category
df_bloodcombined = df_full.copy(deep = True)
df_bloodcombined=df_bloodcombined.replace(to_replace = ['AML', 'CLL', 'LYMPH', 'MYEL', 'HODG'], value = 'BLOOD' )
set(df_bloodcombined['Cancer'])

{'BLOOD',
 'BRC',
 'CRC',
 'CVX',
 'Ctrl',
 'ENDC',
 'ESO',
 'GLIOM',
 'LUNGC',
 'OVC',
 'PRC'}

In [77]:
# Encode the cancer types as numbers

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_bloodcombined['Cancer'] = le.fit_transform(df_bloodcombined['Cancer'])
df_bloodcombined.head(15)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sample_ID,Cancer,Q15389,P29965,P49763,Q02763,P01127,P09341,...,P09382,Q16790,P26842,P14210,P43489,O75144,O43927,P32970,Q8WXI7,P10144
0,0,0.0,PM910,4,10.12895,5.62622,7.72902,8.4753,10.08926,10.58751,...,8.54519,4.20874,10.61404,8.37166,4.99296,7.78219,7.03584,4.81481,3.58965,4.13563
1,1,1.0,PM396,4,8.67289,5.18821,8.25523,9.06271,8.89866,9.2236,...,8.25401,3.46839,10.77271,8.3682,4.92422,7.47997,8.057,3.989,4.78155,3.1484
2,2,2.0,PM190,4,9.99567,6.38876,8.44263,8.42102,10.08508,10.43894,...,8.75887,4.65936,11.03062,9.18464,5.60743,7.92803,8.77261,4.80189,5.1635,4.29062
3,3,3.0,PM270,4,8.26407,5.06228,8.13429,8.66165,8.75925,9.2431,...,8.75741,4.44633,10.47952,8.65548,5.54289,9.29458,8.64028,4.04045,5.59217,3.75295
4,4,4.0,PM656,4,9.08833,5.95005,8.3783,8.29127,9.43936,9.83732,...,8.48018,3.81634,10.59295,8.63758,5.16271,7.41098,8.29143,4.59594,4.91665,4.10381
5,5,5.0,PM736,4,8.64457,5.79507,8.27732,8.37578,8.60373,9.03092,...,8.76805,5.53215,10.41412,8.70804,5.83762,7.71496,7.50007,5.59281,5.25662,3.88111
6,6,6.0,PM842,4,10.06218,5.0638,7.74244,8.41381,9.86604,10.05079,...,8.56072,4.13384,10.50253,8.85963,4.95377,7.49059,7.39059,4.0135,5.06355,3.70582
7,7,7.0,PM767,4,9.86577,7.3084,8.29889,8.60954,11.17774,10.68756,...,8.95749,4.62731,11.00948,9.47875,5.84993,7.84098,8.23261,4.39093,5.66446,3.84306
8,8,8.0,PM1497,4,8.37279,5.91951,8.51155,8.31815,8.95261,9.1053,...,8.58994,4.68348,10.49201,8.60244,5.16184,7.31519,7.61259,4.02262,4.4467,3.3108
9,9,9.0,PM746,4,10.34566,5.71417,8.05251,8.38463,10.61735,10.4489,...,8.64534,4.59777,10.50475,8.67217,5.3083,7.66071,7.24119,4.71803,4.29578,3.78925


In [78]:
# Split the data into X and y
X = df_bloodcombined[proteins]
y = df_bloodcombined['Cancer']

In [79]:
set(df_bloodcombined['Cancer'])

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [80]:
# Split data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=276)

# Random states tested: 313, 100, 892, 276

In [81]:
# Create the xgboost model

model = xgb.XGBClassifier()

In [82]:
# Train the xgboost model

model.fit(X_train, y_train)

In [83]:
# Use the model to make predictions

y_predict = model.predict(X_test)

In [84]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [85]:
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.6381118881118881


In [86]:
print("Classification Report:\n", classification_report(y_test, y_predict, target_names=le.classes_))

Classification Report:
               precision    recall  f1-score   support

       BLOOD       0.89      0.84      0.87       111
         BRC       0.32      0.35      0.33        40
         CRC       0.42      0.40      0.41        60
         CVX       0.50      0.34      0.41        35
        Ctrl       1.00      1.00      1.00        63
        ENDC       0.38      0.50      0.43        22
         ESO       1.00      0.95      0.97        40
       GLIOM       0.70      0.58      0.64        36
       LUNGC       0.45      0.50      0.47        76
         OVC       0.58      0.66      0.62        38
         PRC       0.47      0.51      0.49        51

    accuracy                           0.64       572
   macro avg       0.61      0.60      0.60       572
weighted avg       0.65      0.64      0.64       572



In [87]:
# Print Confusion matrix
print("Confusion Matrix:\n",confusion_matrix(y_test, y_predict))

Confusion Matrix:
 [[93  1  1  0  0  3  0  0  4  3  6]
 [ 1 14  8  4  0  3  0  2  4  0  4]
 [ 0 10 24  2  0  1  0  2 14  1  6]
 [ 0  5  2 12  0  5  0  3  6  2  0]
 [ 0  0  0  0 63  0  0  0  0  0  0]
 [ 2  0  0  1  0 11  0  0  6  0  2]
 [ 2  0  0  0  0  0 38  0  0  0  0]
 [ 0  4  1  3  0  0  0 21  0  1  6]
 [ 1  2 17  0  0  1  0  2 38 11  4]
 [ 1  2  1  1  0  1  0  0  6 25  1]
 [ 4  6  3  1  0  4  0  0  7  0 26]]
