In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.pipeline import Pipeline

from feature_engine.encoding import RareLabelEncoder

In [48]:
data = pd.read_csv("lung_cancer_data.csv")
data.head()

Unnamed: 0,Patient_ID,Age,Gender,Smoking_History,Tumor_Size_mm,Tumor_Location,Stage,Treatment,Survival_Months,Ethnicity,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
0,Patient0000,68,Male,Current Smoker,81.678677,Lower Lobe,Stage III,Surgery,44,Hispanic,...,27.985571,46.801214,1.245849,239.240255,10.366307,3.547734,113.919243,4.968163,139.822861,17.006956
1,Patient0001,58,Male,Never Smoked,78.448272,Lower Lobe,Stage I,Radiation Therapy,101,Caucasian,...,30.120956,39.711531,1.463231,233.515237,10.081731,2.94502,101.321578,3.896795,135.449361,93.270893
2,Patient0002,44,Male,Former Smoker,67.714305,Lower Lobe,Stage I,Chemotherapy,69,African American,...,5.882418,32.640602,0.630109,169.03746,8.660892,4.637399,78.214177,4.36905,143.377155,70.348376
3,Patient0003,72,Male,Current Smoker,70.806008,Lower Lobe,Stage III,Chemotherapy,95,African American,...,38.908154,44.319393,0.594342,213.96759,8.832669,3.617098,127.895361,4.348474,138.586005,19.828128
4,Patient0004,37,Female,Never Smoked,87.272433,Lower Lobe,Stage IV,Radiation Therapy,105,Asian,...,26.344877,15.746906,1.478239,118.187543,9.247609,4.773255,148.801185,3.671976,141.230724,81.047456


In [3]:
target = "Stage"
NUMERICAL_VARIABLES = [var for var in data.columns if data[var].dtype != "O"]
CATEGORICAL_VARIABLES = [var for var in data.columns if data[var].dtype == "O" and var != target and var != "Patient_ID"]
print("Numerical variables:",NUMERICAL_VARIABLES)
print()
print("Categorical variables:",CATEGORICAL_VARIABLES)

Numerical variables: ['Age', 'Tumor_Size_mm', 'Survival_Months', 'Performance_Status', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Blood_Pressure_Pulse', 'Hemoglobin_Level', 'White_Blood_Cell_Count', 'Platelet_Count', 'Albumin_Level', 'Alkaline_Phosphatase_Level', 'Alanine_Aminotransferase_Level', 'Aspartate_Aminotransferase_Level', 'Creatinine_Level', 'LDH_Level', 'Calcium_Level', 'Phosphorus_Level', 'Glucose_Level', 'Potassium_Level', 'Sodium_Level', 'Smoking_Pack_Years']

Categorical variables: ['Gender', 'Smoking_History', 'Tumor_Location', 'Treatment', 'Ethnicity', 'Insurance_Type', 'Family_History', 'Comorbidity_Diabetes', 'Comorbidity_Hypertension', 'Comorbidity_Heart_Disease', 'Comorbidity_Chronic_Lung_Disease', 'Comorbidity_Kidney_Disease', 'Comorbidity_Autoimmune_Disease', 'Comorbidity_Other']


In [4]:
for var in NUMERICAL_VARIABLES:
    print(data[var].isnull().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [5]:
for var in CATEGORICAL_VARIABLES:
    print(data[var].isnull().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [6]:
x_train,x_test,y_train,y_test = train_test_split(
    data.drop([target,"Patient_ID"],axis=1),
    data[target],
    test_size=0.2,
    random_state=0
)

In [7]:
x_test.to_csv("x_test.csv",index=False)
y_test.to_csv("y_test.csv",index=False)

In [8]:
rare_encoder = RareLabelEncoder(tol=0.05,n_categories=1,variables=CATEGORICAL_VARIABLES,replace_with="Rare")
x_train = rare_encoder.fit_transform(x_train,y_train)



In [9]:
x_train.head(10)

Unnamed: 0,Age,Gender,Smoking_History,Tumor_Size_mm,Tumor_Location,Treatment,Survival_Months,Ethnicity,Insurance_Type,Family_History,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
13227,77,Male,Never Smoked,45.089524,Middle Lobe,Chemotherapy,116,Caucasian,Other,No,...,29.87546,13.288436,0.82022,170.98966,9.510741,3.522938,142.307075,3.531541,139.931057,62.657193
17298,49,Male,Current Smoker,37.946508,Middle Lobe,Surgery,51,Other,Medicare,No,...,7.492468,13.37308,1.499243,192.421219,9.551318,4.285336,79.775848,4.486401,143.877707,28.280746
8351,57,Male,Never Smoked,43.190785,Lower Lobe,Surgery,27,Asian,Other,No,...,30.419042,25.540987,0.943008,174.928601,8.816722,2.986674,128.531987,3.974762,140.842301,76.297955
7050,44,Female,Former Smoker,31.656231,Middle Lobe,Targeted Therapy,44,Other,Other,Yes,...,12.453917,48.333837,0.961104,243.533484,9.158883,3.605071,95.042954,3.530903,143.288263,20.063274
12234,70,Male,Current Smoker,74.293064,Upper Lobe,Radiation Therapy,63,Hispanic,Other,Yes,...,10.520105,41.694537,1.396543,235.895487,8.036727,3.886132,103.062468,3.884968,135.877452,17.478732
6184,48,Female,Never Smoked,52.03193,Middle Lobe,Surgery,54,African American,Private,Yes,...,27.278083,39.540511,1.1973,246.339535,9.886672,3.359898,83.94986,4.099515,143.785854,22.533684
6224,33,Male,Never Smoked,86.955708,Middle Lobe,Radiation Therapy,62,Other,Private,Yes,...,15.360368,14.856116,1.412864,120.009058,9.228586,3.705003,78.963789,4.454113,143.580954,68.681207
16463,67,Male,Never Smoked,43.899173,Middle Lobe,Chemotherapy,30,Other,Medicare,Yes,...,6.714411,29.649957,0.932636,190.941693,8.836017,4.620777,95.016786,4.320075,143.823241,5.552756
11758,67,Female,Never Smoked,61.312171,Upper Lobe,Chemotherapy,5,Caucasian,Medicare,Yes,...,39.867274,48.191139,0.704283,124.567387,9.504511,3.90958,136.183434,4.405398,138.596075,4.61858
3079,50,Male,Current Smoker,91.778487,Middle Lobe,Chemotherapy,106,Hispanic,Medicare,No,...,36.460935,36.238819,0.785621,114.126449,10.186893,4.545755,101.152765,4.407418,138.586663,57.125201


In [10]:
class CustomLabelEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,variables):
        self.variables = variables
        self.le = LabelEncoder()
    def fit(self,x,y):
        return self
    def transform(self,x):
        x = x.copy()

        for variable in self.variables:
            x[variable] = self.le.fit_transform(x[variable])
        return x

In [11]:
cle = CustomLabelEncoder(CATEGORICAL_VARIABLES)
x_train = cle.transform(x_train)


In [12]:
x_train

Unnamed: 0,Age,Gender,Smoking_History,Tumor_Size_mm,Tumor_Location,Treatment,Survival_Months,Ethnicity,Insurance_Type,Family_History,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
13227,77,1,2,45.089524,1,0,116,2,2,0,...,29.875460,13.288436,0.820220,170.989660,9.510741,3.522938,142.307075,3.531541,139.931057,62.657193
17298,49,1,0,37.946508,1,2,51,4,1,0,...,7.492468,13.373080,1.499243,192.421219,9.551318,4.285336,79.775848,4.486401,143.877707,28.280746
8351,57,1,2,43.190785,0,2,27,1,2,0,...,30.419042,25.540987,0.943008,174.928601,8.816722,2.986674,128.531987,3.974762,140.842301,76.297955
7050,44,0,1,31.656231,1,3,44,4,2,1,...,12.453917,48.333837,0.961104,243.533484,9.158883,3.605071,95.042954,3.530903,143.288263,20.063274
12234,70,1,0,74.293064,2,1,63,3,2,1,...,10.520105,41.694537,1.396543,235.895487,8.036727,3.886132,103.062468,3.884968,135.877452,17.478732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,75,0,0,70.340621,2,0,30,4,1,1,...,7.531046,22.253273,0.725508,187.630881,9.417124,3.479566,70.214203,4.641143,141.894246,39.206700
19648,37,0,0,51.152906,1,2,1,1,2,1,...,31.935863,20.565866,0.636278,180.646746,9.701244,4.885477,132.337325,3.804945,140.500781,89.940652
9845,54,1,1,53.052543,1,1,74,3,2,1,...,37.801994,27.062205,1.040124,164.151515,8.536603,2.799824,140.008369,3.904263,136.042267,86.396181
10799,42,0,2,27.199144,0,1,70,2,0,0,...,9.111703,33.068827,1.153768,125.591268,10.141961,3.202503,114.720235,3.701385,142.800805,49.569632


In [13]:
sc = StandardScaler()
sc.fit(x_train,y_train)
x_train = pd.DataFrame(sc.transform(x_train),columns=x_train.columns)

In [14]:
x_train.head()

Unnamed: 0,Age,Gender,Smoking_History,Tumor_Size_mm,Tumor_Location,Treatment,Survival_Months,Ethnicity,Insurance_Type,Family_History,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
0,1.569619,0.996413,1.225251,-0.396399,-0.004276,-1.338228,1.63542,0.003669,0.448735,-0.980016,...,0.729865,-1.452321,-0.622754,-0.084627,0.347122,-0.30368,1.396746,-1.651313,-0.039241,0.449671
1,-0.379465,0.996413,-1.233566,-0.671441,-0.004276,0.458544,-0.259094,1.420627,-0.443361,-0.980016,...,-1.494889,-1.445017,1.733091,0.410329,0.403561,0.755217,-1.309588,0.557816,1.322794,-0.742705
2,0.177416,0.996413,1.225251,-0.46951,-1.230556,0.458544,-0.958607,-0.704811,0.448735,-0.980016,...,0.783895,-0.395028,-0.196744,0.006342,-0.618193,-1.048499,0.800564,-0.625893,0.27524,0.922812
3,-0.727515,-1.003599,-0.004157,-0.913648,-0.004276,1.356931,-0.463119,1.420627,0.448735,1.020392,...,-1.001747,1.571805,-0.133959,1.590754,-0.14228,-0.189605,-0.648832,-1.652789,1.11937,-1.027735
4,1.082348,0.996413,-1.233566,0.728084,1.222003,-0.439842,0.090662,0.712148,0.448735,1.020392,...,-1.193958,0.998888,1.376779,1.414356,-1.703092,0.200762,-0.30175,-0.833636,-1.438188,-1.117382


In [15]:
classifier = LogisticRegression(C=0.0005)
feature_selector = SelectFromModel(classifier)
feature_selector.fit(x_train,y_train)

In [16]:
feature_selector.get_support()

array([False, False,  True, False,  True, False, False,  True,  True,
       False, False, False,  True, False, False,  True,  True, False,
        True, False,  True, False,  True, False,  True, False, False,
        True,  True, False,  True, False,  True,  True,  True,  True])

In [17]:
selected_features = x_train.columns[(feature_selector.get_support())]

In [18]:
print('total features: {}'.format((x_train.shape[1])))
print('selected features: {}'.format(len(selected_features)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(feature_selector.estimator_.coef_ == 0)))

total features: 36
selected features: 18
features with coefficients shrank to zero: 0


In [19]:
selected_features = selected_features.to_list()

In [20]:
selected_features

['Smoking_History',
 'Tumor_Location',
 'Ethnicity',
 'Insurance_Type',
 'Comorbidity_Heart_Disease',
 'Comorbidity_Autoimmune_Disease',
 'Comorbidity_Other',
 'Blood_Pressure_Systolic',
 'Blood_Pressure_Pulse',
 'White_Blood_Cell_Count',
 'Albumin_Level',
 'Aspartate_Aminotransferase_Level',
 'Creatinine_Level',
 'Calcium_Level',
 'Glucose_Level',
 'Potassium_Level',
 'Sodium_Level',
 'Smoking_Pack_Years']

In [21]:
pd.Series(selected_features).to_csv('selected_features.csv', index=False)

In [22]:
le = CustomLabelEncoder(CATEGORICAL_VARIABLES)


In [23]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [24]:
y_train

array([0, 0, 3, ..., 3, 1, 2])

In [25]:
x_train = x_train[selected_features]

In [26]:
classifier.fit(x_train,y_train)

In [27]:
x_test = pd.read_csv("x_test.csv")
y_test = pd.read_csv("y_test.csv")


In [28]:
y_test

Unnamed: 0,Stage
0,Stage II
1,Stage IV
2,Stage I
3,Stage I
4,Stage III
...,...
4727,Stage IV
4728,Stage III
4729,Stage I
4730,Stage III


In [29]:
x_test = cle.transform(x_test)

In [30]:
x_test = x_test[selected_features]
y_test = le.transform(y_test)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [31]:
y_pred = classifier.predict(x_test)
y_pred_proba = classifier.predict_proba(x_test)

In [32]:
acc = accuracy_score(y_test,y_pred)
print("accuracy_score:",acc)
print()
roc = roc_auc_score(y_test,y_pred_proba,multi_class="ovr")
print("roc_auc_score",roc)

accuracy_score: 0.24746407438715132

roc_auc_score 0.5085694696797143


In [56]:
class CustomStandardScaler(BaseEstimator,TransformerMixin):
    def __init__(self):
        self.sc = StandardScaler()
    def fit(self,x,y=None):
        return self
    def transform(self,x,y=None):
        return pd.DataFrame(self.sc.fit_transform(x),columns=x.columns)

In [51]:
class FeatureSelector(BaseEstimator,TransformerMixin):
    def __init__(self,selected_features):
        self.selected_features = selected_features
    def fit(self,x,y):
        return self
    def transform(self,x,y=None):
        if not isinstance(x, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame")
        
        missing_cols = [col for col in self.selected_features if col not in x.columns]
        if missing_cols:
            raise ValueError(f"These columns are not in the DataFrame: {missing_cols}")
        
        return x[self.selected_features]


In [57]:
pipe = Pipeline(
    [
        ("CustomLabelEncoder",CustomLabelEncoder(variables=CATEGORICAL_VARIABLES)),
        ("StandardScaler",CustomStandardScaler()),
        ("FeatureSelector",FeatureSelector(selected_features=selected_features)),
        ("Estimator",LogisticRegression(C=0.0005))
    ]
)

In [58]:
pipe

In [59]:
x_train,x_test,y_train,y_test = train_test_split(
    data.drop([target,"Patient_ID"],axis=1),
    data[target],
    test_size=0.2,
    random_state=0
)

In [60]:
pipe.fit(x_train,y_train)

In [61]:
y_preds = pipe.predict(x_test)
acc = accuracy_score(y_test,y_preds)
print(acc)

0.25169061707523244


In [62]:
import pandas as pd
selected_features = pd.read_csv(r"data\selected_features.csv")


In [69]:
selected_features['0'].to_list()

['Age',
 'Smoking_History',
 'Tumor_Size_mm',
 'Tumor_Location',
 'Insurance_Type',
 'Comorbidity_Autoimmune_Disease',
 'Comorbidity_Other',
 'Blood_Pressure_Systolic',
 'Blood_Pressure_Diastolic',
 'White_Blood_Cell_Count',
 'Platelet_Count',
 'Albumin_Level',
 'Calcium_Level',
 'Glucose_Level',
 'Potassium_Level',
 'Sodium_Level',
 'Smoking_Pack_Years']

In [1]:
import joblib

In [3]:
import pandas as pd

In [2]:
pipe = joblib.load(r"C:\Users\omera\Desktop\lung_cancer\models\trained_pipeline.pkl")

In [5]:
data = pd.read_csv(r"data\x_test.csv")

In [6]:
prediction = pipe.predict(data)

In [7]:
print(prediction)

['Stage I' 'Stage I' 'Stage III' ... 'Stage III' 'Stage II' 'Stage I']
