# Binary Classifier for Lung Cancer Prediction

Title is self-explanatory. Will work on building various models and comparing results.

In [2]:
#Importing standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
#Loading in the data
df = pd.read_csv("lung_cancer_prediction.csv")
df.head()

Unnamed: 0,Country,Age,Gender,Smoking_Status,Second_Hand_Smoke,Air_Pollution_Exposure,Occupation_Exposure,Rural_or_Urban,Socioeconomic_Status,Healthcare_Access,...,Treatment_Access,Clinical_Trial_Access,Language_Barrier,Mortality_Risk,5_Year_Survival_Probability,Delay_in_Diagnosis,Family_History,Indoor_Smoke_Exposure,Tobacco_Marketing_Exposure,Final_Prediction
0,Russia,82,Male,Former Smoker,Yes,Medium,No,Urban,High,Limited,...,Partial,Yes,No,0.263278,0.797576,Yes,No,No,No,No
1,Thailand,66,Female,Former Smoker,No,High,No,Rural,Middle,Good,...,Partial,Yes,No,0.154697,0.336674,Yes,No,No,Yes,No
2,Colombia,87,Male,Former Smoker,No,Medium,No,Urban,Low,Poor,...,Partial,Yes,No,0.607435,0.063621,No,No,No,Yes,No
3,Egypt,51,Female,Former Smoker,No,Low,Yes,Rural,High,Good,...,Full,No,No,0.081184,0.336814,No,No,No,Yes,No
4,DR Congo,43,Male,Former Smoker,No,High,No,Urban,Middle,Good,...,Full,No,No,0.643276,0.820103,No,No,No,No,No


In [4]:
#Getting a sense of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460292 entries, 0 to 460291
Data columns (total 25 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Country                      460292 non-null  object 
 1   Age                          460292 non-null  int64  
 2   Gender                       460292 non-null  object 
 3   Smoking_Status               460292 non-null  object 
 4   Second_Hand_Smoke            460292 non-null  object 
 5   Air_Pollution_Exposure       460292 non-null  object 
 6   Occupation_Exposure          460292 non-null  object 
 7   Rural_or_Urban               460292 non-null  object 
 8   Socioeconomic_Status         460292 non-null  object 
 9   Healthcare_Access            460292 non-null  object 
 10  Insurance_Coverage           460292 non-null  object 
 11  Screening_Availability       460292 non-null  object 
 12  Stage_at_Diagnosis           460292 non-null  object 
 13 

In [5]:
# Basic summary statistics for numerical attribtues
df.describe()

Unnamed: 0,Age,Mortality_Risk,5_Year_Survival_Probability
count,460292.0,460292.0,460292.0
mean,60.044183,0.500113,0.499866
std,17.590029,0.259828,0.260258
min,30.0,0.050002,0.050001
25%,45.0,0.274948,0.274252
50%,60.0,0.500361,0.499002
75%,75.0,0.725412,0.726071
max,90.0,0.949997,0.949998


## Data Cleaning

In [6]:
#Checking 
df.isnull().sum()

Country                             0
Age                                 0
Gender                              0
Smoking_Status                      0
Second_Hand_Smoke                   0
Air_Pollution_Exposure              0
Occupation_Exposure                 0
Rural_or_Urban                      0
Socioeconomic_Status                0
Healthcare_Access                   0
Insurance_Coverage                  0
Screening_Availability              0
Stage_at_Diagnosis                  0
Cancer_Type                         0
Mutation_Type                  138173
Treatment_Access                46053
Clinical_Trial_Access               0
Language_Barrier                    0
Mortality_Risk                      0
5_Year_Survival_Probability         0
Delay_in_Diagnosis                  0
Family_History                      0
Indoor_Smoke_Exposure               0
Tobacco_Marketing_Exposure          0
Final_Prediction                    0
dtype: int64

Data looks very clean. This is to be expected since its a kaggle dataset. However we do have some missing values in Mutation_Type and Treatment_Access. So let's look into that.

### Mutation_Type Cleaning

In [7]:
df['Mutation_Type'].value_counts(dropna=False)

Mutation_Type
NaN     138173
EGFR    138043
KRAS     92108
ALK      91968
Name: count, dtype: int64

It looks like these mutations are what causes the cancerous growth. There are way too many NaN values for my liking. Imputing these values with mean/median values doesn't seem to be right. We could lose and make up information, which would harm our prediction. So let's create a new category and call it unknown.

In [8]:
df['Mutation_Type'] = df['Mutation_Type'].fillna('Unknown')
df['Mutation_Type'].value_counts(dropna=False)

Mutation_Type
Unknown    138173
EGFR       138043
KRAS        92108
ALK         91968
Name: count, dtype: int64

### Treatment_Access Cleaning

In [9]:
df['Treatment_Access'].value_counts(dropna=False)

Treatment_Access
Full       276465
Partial    137774
NaN         46053
Name: count, dtype: int64

There are still alot of missing values. But relatively less. we don't have enough information to impute the value and this would skew our data negatively. So we can proceed similarly as above.

In [10]:
df['Treatment_Access'] = df['Treatment_Access'].fillna('Unknown')
df['Treatment_Access'].value_counts(dropna=False)

Treatment_Access
Full       276465
Partial    137774
Unknown     46053
Name: count, dtype: int64

### Data Encoding

We'll start with transforming categorical variables to numerical variables.

In [11]:
categorical_cols = df.select_dtypes(include=['object']).columns

For categorical columns with Binary values, we can simply replace with 1 and 0.

In [12]:
binary_cols = [col for col in categorical_cols if df[col].nunique() == 2]
for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

For multi-class columns, we can use one-hot encoding.

In [13]:
multi_class_cols = [col for col in categorical_cols if df[col].nunique() > 2]
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_array = encoder.fit_transform(df[multi_class_cols])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(multi_class_cols))
df_final = df.drop(columns=multi_class_cols).reset_index(drop=True).join(encoded_df)
df_final.head()

Unnamed: 0,Age,Gender,Second_Hand_Smoke,Occupation_Exposure,Rural_or_Urban,Insurance_Coverage,Screening_Availability,Cancer_Type,Clinical_Trial_Access,Language_Barrier,...,Healthcare_Access_Limited,Healthcare_Access_Poor,Stage_at_Diagnosis_II,Stage_at_Diagnosis_III,Stage_at_Diagnosis_IV,Mutation_Type_EGFR,Mutation_Type_KRAS,Mutation_Type_Unknown,Treatment_Access_Partial,Treatment_Access_Unknown
0,82,1,1,0,1,1,1,0,1,0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,66,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,87,1,0,0,1,0,1,0,1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,51,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,43,1,0,0,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


Now we need to apply min-max scaling to Age.

In [14]:
scaler = MinMaxScaler()
df_final['Age_scaled'] = scaler.fit_transform(df_final[['Age']])
df_final.head()

Unnamed: 0,Age,Gender,Second_Hand_Smoke,Occupation_Exposure,Rural_or_Urban,Insurance_Coverage,Screening_Availability,Cancer_Type,Clinical_Trial_Access,Language_Barrier,...,Healthcare_Access_Poor,Stage_at_Diagnosis_II,Stage_at_Diagnosis_III,Stage_at_Diagnosis_IV,Mutation_Type_EGFR,Mutation_Type_KRAS,Mutation_Type_Unknown,Treatment_Access_Partial,Treatment_Access_Unknown,Age_scaled
0,82,1,1,0,1,1,1,0,1,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.866667
1,66,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.6
2,87,1,0,0,1,0,1,0,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.95
3,51,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.35
4,43,1,0,0,1,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.216667


## Train/Test Split

In [15]:
X = df_final.drop(columns=['Final_Prediction'])  # Features
y = df_final['Final_Prediction']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Importing LR model

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

Training model

In [19]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Makign predictions and evaluations

In [20]:
y_pred = logreg_model.predict(X_test)
y_pred_prob = logreg_model.predict_proba(X_test)[:, 1]

# 1. Confusion Matrix and Accuracy
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# 2. Precision, Recall, F1-Score (provides a comprehensive table)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 3. ROC AUC (requires the probability score)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nROC AUC Score: {roc_auc:.4f}")

Confusion Matrix:
 [[73698     0]
 [18361     0]]
Accuracy: 0.80055182002846

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89     73698
           1       0.00      0.00      0.00     18361

    accuracy                           0.80     92059
   macro avg       0.40      0.50      0.44     92059
weighted avg       0.64      0.80      0.71     92059


ROC AUC Score: 0.5004


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
