In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [38]:
# Load the raw data set from a CSV file in the current working directory.
lung_cancer_dataset = pd.read_csv('lung_cancer.csv')

In [39]:
# Print the column names, non-null counts and dtypes of the loaded frame.
lung_cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 3000 non-null   object
 1   AGE                    3000 non-null   int64 
 2   SMOKING                3000 non-null   object
 3   YELLOW_FINGERS         3000 non-null   object
 4   ANXIETY                3000 non-null   object
 5   PEER_PRESSURE          3000 non-null   object
 6   CHRONIC_DISEASE        3000 non-null   object
 7   FATIGUE                3000 non-null   object
 8   ALLERGY                3000 non-null   object
 9   WHEEZING               3000 non-null   object
 10  ALCOHOL_CONSUMING      3000 non-null   object
 11  COUGHING               3000 non-null   object
 12  SHORTNESS_OF_BREATH    3000 non-null   object
 13  SWALLOWING_DIFFICULTY  3000 non-null   object
 14  CHEST_PAIN             3000 non-null   object
 15  LUNG_CANCER          

In [40]:
# Preview the first five rows (five is the default row count of head()).
lung_cancer_dataset.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,M,65,Yes,Yes,Yes,No,No,Yes,No,No,No,No,No,No,Yes,NO
1,F,55,Yes,No,No,Yes,Yes,No,No,No,Yes,Yes,Yes,No,No,NO
2,F,78,No,No,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,Yes,Yes,YES
3,M,60,No,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,Yes,No,No,YES
4,F,80,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,No,NO


In [41]:
# Summary statistics. AGE is the only numeric column at this point, so it is
# the only column reported.
lung_cancer_dataset.describe()

Unnamed: 0,AGE
count,3000.0
mean,55.169
std,14.723746
min,30.0
25%,42.0
50%,55.0
75%,68.0
max,80.0


In [42]:
# Count missing values per column.
lung_cancer_dataset.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC_DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL_CONSUMING        0
COUGHING                 0
SHORTNESS_OF_BREATH      0
SWALLOWING_DIFFICULTY    0
CHEST_PAIN               0
LUNG_CANCER              0
dtype: int64

In [43]:
# (rows, columns) of the raw frame.
lung_cancer_dataset.shape

(3000, 16)

In [44]:
# Column the model predicts; every other column is used as an input feature.
label = "LUNG_CANCER"

# Build both column lists in one pass, preserving the frame's column order.
# AGE is the only input that is not treated as categorical.
features = []
categorical_features = []
for column in lung_cancer_dataset.columns:
    if column == label:
        continue
    features.append(column)
    if column != "AGE":
        categorical_features.append(column)

In [45]:
# Separate the model inputs from the label column.
# .copy() gives `data` its own storage: later cells assign into its columns, and
# doing that on a bare column selection of `lung_cancer_dataset` triggers pandas'
# SettingWithCopyWarning.
data = lung_cancer_dataset[features].copy()
target = lung_cancer_dataset[label]

In [46]:
# Replace the label strings with integer class codes. The fitted encoder stays
# in `my_label_enc` so it remains available to later cells.
my_label_enc = LabelEncoder().fit(target)
target = my_label_enc.transform(target)

In [47]:
# Encode every categorical column as a numeric code; in the preview below the
# two-valued columns come out as 0.0 / 1.0.
my_ord_enc = OrdinalEncoder().fit(data[categorical_features])
data[categorical_features] = my_ord_enc.transform(data[categorical_features])

# Rescale GENDER and AGE to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split, so the test rows influence the scaling parameters.
scaled_columns = ["GENDER", "AGE"]
my_minmax_scaler = MinMaxScaler().fit(data[scaled_columns])
data[scaled_columns] = my_minmax_scaler.transform(data[scaled_columns])

data.head(10)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN
0,1.0,0.7,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.5,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,0.0,0.96,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
3,1.0,0.6,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
5,0.0,0.56,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
6,0.0,0.8,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.88,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
8,1.0,0.94,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
9,0.0,0.74,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [48]:
# Feature engineering: CHEST_PAIN is replaced by (CHEST_PAIN + 1) ** (SMOKING + 1).
# NOTE: this updates `data` in place, so running this cell a second time applies the
# transform again; re-run from the cell that builds `data` instead.
data["CHEST_PAIN"] = (data["CHEST_PAIN"] + 1) ** (data["SMOKING"] + 1)

# NOTE(review): this cell previously continued with five statements of the form
# `data[col] == <expression>` for COUGHING (twice), SHORTNESS_OF_BREATH, FATIGUE and
# WHEEZING. `==` is a comparison, so each one only built a boolean Series that was
# discarded -- none of those columns was ever modified. The statements are removed
# rather than turned into assignments because, applied in sequence as written, the
# second COUGHING formula uses the already-exponentiated FATIGUE as part of its
# exponent and overflows float64 to inf, putting non-finite values into the feature
# matrix.
# TODO: decide whether these interaction features are wanted; if so, derive each one
# from the original 0/1 columns so the values stay bounded.

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.25, random_state=42
)
data_train.shape, data_test.shape, target_train.shape, target_test.shape

((2250, 15), (750, 15), (2250,), (750,))