## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Read and investigate the data

In [2]:
# Load the raw survey table; the path is relative to the notebook's folder.
data = pd.read_csv(filepath_or_buffer="../Data/lung_cancer_dataset.csv")
# Preview the first five rows to check the columns and value coding.
data.head(n=5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,M,65,1,1,1,2,2,1,2,2,2,2,2,2,1,NO
1,F,55,1,2,2,1,1,2,2,2,1,1,1,2,2,NO
2,F,78,2,2,1,1,1,2,1,2,1,1,2,1,1,YES
3,M,60,2,1,1,1,2,1,2,1,1,2,1,2,2,YES
4,F,80,1,1,2,1,1,2,1,2,1,1,1,1,2,NO


## Prepare the data

In [3]:
# Features: every column except the last one (the label), with AGE removed.
feature_columns = data.iloc[:, :-1]
X = feature_columns.drop(columns=["AGE"])
X.head(n=5)

Unnamed: 0,GENDER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN
0,M,1,1,1,2,2,1,2,2,2,2,2,2,1
1,F,1,2,2,1,1,2,2,2,1,1,1,2,2
2,F,2,2,1,1,1,2,1,2,1,1,2,1,1
3,M,2,1,1,1,2,1,2,1,1,2,1,2,2
4,F,1,1,2,1,1,2,1,2,1,1,1,1,2


In [4]:
# Target: the last column of the table, extracted as a plain array.
label_series = data.iloc[:, -1]
Y = label_series.values
Y

array(['NO', 'NO', 'YES', ..., 'YES', 'YES', 'YES'], dtype=object)

## Encode the data

In [5]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance. Later cells refit it first on X["GENDER"] and
# then on Y, so after those cells it only remembers the most recent fit.
label_encoder = LabelEncoder()

In [6]:
# Replace the GENDER strings with integer codes (the preview below shows M -> 1, F -> 0).
gender_codes = label_encoder.fit_transform(X["GENDER"])
X["GENDER"] = gender_codes
X["GENDER"].head(n=5)

0    1
1    0
2    0
3    1
4    0
Name: GENDER, dtype: int64

In [7]:
# Refit the encoder on the target and replace the string labels with integer codes.
encoded_target = label_encoder.fit_transform(Y)
Y = encoded_target
Y

array([0, 0, 1, ..., 1, 1, 1])

In [8]:
# Feature columns that are recoded to 0/1 by encode_binary.
encode_binary_cols = [
    "SMOKING",
    "YELLOW_FINGERS",
    "ANXIETY",
    "PEER_PRESSURE",
    "CHRONIC_DISEASE",
    "FATIGUE",
    "ALLERGY",
    "WHEEZING",
    "ALCOHOL_CONSUMING",
    "COUGHING",
    "SHORTNESS_OF_BREATH",
    "SWALLOWING_DIFFICULTY",
    "CHEST_PAIN",
]

def encode_binary(col):
    """Recode one column of the global ``X`` in place: 2 becomes 1, 1 becomes 0.

    Parameters
    ----------
    col : str
        Name of the column of ``X`` to recode.

    Raises
    ------
    ValueError
        If the column holds anything other than 1 and 2. This is what happens
        when the cell is run a second time on already-encoded data; without
        the check every value would silently become 0.
    """
    unexpected = set(X[col].unique()) - {1, 2}
    if unexpected:
        raise ValueError(
            f"Column {col!r} must contain only 1 and 2 before encoding; "
            f"found {sorted(unexpected, key=repr)}. Was it already encoded?"
        )
    # Vectorised comparison instead of a Python-level loop over the rows.
    X[col] = (X[col] == 2).astype(int)

# Recode each listed column of X in turn.
for binary_col in encode_binary_cols:
    encode_binary(binary_col)

In [9]:
# Preview the feature table after encoding.
X.head(n=5)

Unnamed: 0,GENDER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN
0,1,0,0,0,1,1,0,1,1,1,1,1,1,0
1,0,0,1,1,0,0,1,1,1,0,0,0,1,1
2,0,1,1,0,0,0,1,0,1,0,0,1,0,0
3,1,1,0,0,0,1,0,1,0,0,1,0,1,1
4,0,0,0,1,0,0,1,0,1,0,0,0,0,1


## Split the data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((2400, 14), (600, 14))

## Import and train the models

### Logistic Regression Model

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X=X_train, y=y_train)

In [14]:
# Class predictions for the held-out test rows.
y_preds_log = logistic_regression_model.predict(X=X_test)

### SVC (Support Vector Machine Classifier)

In [15]:
from sklearn.svm import SVC

In [16]:
# Support vector classifier with default settings, fitted on the training split.
svc = SVC()
svc.fit(X=X_train, y=y_train)

In [17]:
# Class predictions for the held-out test rows.
y_preds_svc = svc.predict(X=X_test)

### Naive Bayes

#### Gaussian Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
# Gaussian naive Bayes with default settings, fitted on the training split.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)

In [20]:
# Class predictions for the held-out test rows.
y_preds_gnb = gnb.predict(X=X_test)

#### Bernoulli Naive Bayes

In [21]:
from sklearn.naive_bayes import BernoulliNB

In [22]:
# Bernoulli naive Bayes with default settings, fitted on the training split.
bnb = BernoulliNB()
bnb.fit(X=X_train, y=y_train)

In [23]:
# Class predictions for the held-out test rows.
y_preds_bnb = bnb.predict(X=X_test)

### Decision Tree Model

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Seed the tree so a fresh "Restart & Run All" rebuilds the same model.
# scikit-learn shuffles the candidate features at every split, so an unseeded
# tree can differ between runs. 42 matches the seed used for the train/test split.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train, y_train)

In [26]:
# Class predictions for the held-out test rows.
y_preds_dtc = decision_tree_classifier.predict(X=X_test)

### Random Forest Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Trees are capped at depth 10. The forest is seeded because bootstrap sampling
# and per-split feature sampling are random; without a seed the reported
# accuracy changes from run to run. 42 matches the seed used for the train/test split.
random_forest_classifier = RandomForestClassifier(max_depth=10, random_state=42)
random_forest_classifier.fit(X_train, y_train)

In [29]:
# Class predictions for the held-out test rows.
y_preds_rf = random_forest_classifier.predict(X=X_test)

## Check the models' accuracy

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
# Score every model against the same held-out labels. Listing the
# (label, predictions) pairs once replaces six copy-pasted print statements,
# so the message format cannot drift between models. The printed text is unchanged.
model_predictions = [
    ("Logistic regression", y_preds_log),
    ("SVC", y_preds_svc),
    ("GNB", y_preds_gnb),
    ("BNB", y_preds_bnb),
    ("Decision Tree", y_preds_dtc),
    ("Random Forest", y_preds_rf),
]
for model_label, model_preds in model_predictions:
    print(f"{model_label} model's accuracy : {accuracy_score(y_test, model_preds):.2f}")

Logistic regression model's accuracy : 0.51
SVC model's accuracy : 0.49
GNB model's accuracy : 0.51
BNB model's accuracy : 0.51
Decision Tree model's accuracy : 0.49
Random Forest model's accuracy : 0.48
