## Import the modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
# Silence every Python warning raised from here on (the 'ignore' action with no
# category or module filter applies to all warnings, library notices included).
warnings.filterwarnings('ignore')

In [2]:
# Read the lung-cancer CSV (path is relative to the working directory)
# and preview the first rows.

csv_path = 'lung_cancer.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [3]:
# Encode the two text columns as integers:
#   GENDER:      M -> 1, F -> 0
#   LUNG_CANCER: YES -> 1, NO -> 0

text_to_code = {
    'GENDER': {'M': 1, 'F': 0},
    'LUNG_CANCER': {'YES': 1, 'NO': 0},
}
for column, codes in text_to_code.items():
    data[column] = data[column].map(codes)

In [4]:
# Recode the binary symptom columns from the dataset's 2/1 coding to 1/0,
# so that 1 means "affected" and 0 means "not affected".
#
# Each recoded Series is assigned back to the frame. Calling
# `data[col].replace(..., inplace=True)` mutates a temporary column selection:
# pandas 2.x warns about that pattern and, with Copy-on-Write enabled, `data`
# would be left unchanged.
#
# Run this cell once per fresh load of `data`: the recode is not idempotent and
# the renamed columns no longer exist under their old names afterwards.

binary_columns = [
    "SMOKING",
    "YELLOW_FINGERS",
    "ANXIETY",
    "PEER_PRESSURE",
    "CHRONIC DISEASE",
    "FATIGUE ",   # these two column names carry a trailing space
    "ALLERGY ",
    "WHEEZING",
    "ALCOHOL CONSUMING",
    "COUGHING",
    "SHORTNESS OF BREATH",
    "SWALLOWING DIFFICULTY",
    "CHEST PAIN",
]
for column in binary_columns:
    data[column] = data[column].replace({2: 1, 1: 0})

# Rename the columns that have spaces between words to snake_case.
# ("FATIGUE " and "ALLERGY " keep their trailing space, exactly as before.)

data = data.rename(columns={"CHRONIC DISEASE": "CHRONIC_DISEASE",
                            "ALCOHOL CONSUMING": 'ALCOHOL_CONSUMPTION',
                            "SHORTNESS OF BREATH": "SHORTNESS_OF_BREATH",
                            "CHEST PAIN": "CHEST_PAIN",
                            "SWALLOWING DIFFICULTY": "SWALLOWING_DIFFICULTY"})

data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMPTION,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0


In [5]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GENDER,309.0,0.524272,0.500221,0.0,0.0,1.0,1.0,1.0
AGE,309.0,62.673139,8.210301,21.0,57.0,62.0,69.0,87.0
SMOKING,309.0,0.563107,0.496806,0.0,0.0,1.0,1.0,1.0
YELLOW_FINGERS,309.0,0.569579,0.495938,0.0,0.0,1.0,1.0,1.0
ANXIETY,309.0,0.498382,0.500808,0.0,0.0,0.0,1.0,1.0
PEER_PRESSURE,309.0,0.501618,0.500808,0.0,0.0,1.0,1.0,1.0
CHRONIC_DISEASE,309.0,0.504854,0.500787,0.0,0.0,1.0,1.0,1.0
FATIGUE,309.0,0.673139,0.469827,0.0,0.0,1.0,1.0,1.0
ALLERGY,309.0,0.556634,0.497588,0.0,0.0,1.0,1.0,1.0
WHEEZING,309.0,0.556634,0.497588,0.0,0.0,1.0,1.0,1.0


In [6]:
# Count how many rows fall into each LUNG_CANCER class.
data.loc[:, 'LUNG_CANCER'].value_counts()

1    270
0     39
Name: LUNG_CANCER, dtype: int64

In [7]:
# (rows, columns) of the frame.
data.shape

(309, 16)

In [8]:
# Separate the feature columns (x) from the LUNG_CANCER label (y).
target_column = 'LUNG_CANCER'
y = data[target_column]
x = data.drop(columns=[target_column])

print(x, '\n\n')
print(y)

     GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0         1   69        0               1        1              0   
1         1   74        1               0        0              0   
2         0   59        0               0        0              1   
3         1   63        1               1        1              0   
4         0   63        0               1        0              0   
..      ...  ...      ...             ...      ...            ...   
304       0   56        0               0        0              1   
305       1   70        1               0        0              0   
306       1   58        1               0        0              0   
307       1   67        1               0        1              0   
308       1   62        0               0        0              1   

     CHRONIC_DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL_CONSUMPTION  \
0                  0         1         0         1                    1   
1                  1 

In [9]:
# Hold out 30% of the rows for testing; random_state=0 keeps the split reproducible.

split_parts = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts
print(x.shape, x_train.shape, x_test.shape)

(309, 15) (216, 15) (93, 15)


## Training the ML model

In [10]:
# Logistic-regression classifier with the solver's iteration limit set to 1000.
lr = LogisticRegression(max_iter = 1000)

In [11]:
# Fit the classifier on the training split; the cell displays the fitted estimator.
lr.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=1000)

### Accuracy of train data

In [12]:
# Score the model on the rows it was trained on.
# accuracy_score's signature is (y_true, y_pred): true labels first, predictions
# second. Accuracy is symmetric in its arguments, so the reported value is the
# same as before; the order is corrected to match the documented signature.
x_train_prediction = lr.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
print('Accuracy of training data : ', training_data_accuracy)

Accuracy of training data :  0.9398148148148148


### Accuracy of test data

In [13]:
# Score the model on the held-out test split.
# accuracy_score's signature is (y_true, y_pred): true labels first, predictions second.
x_test_prediction = lr.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
# The label previously said "training data" although this is the test-split score.
print('Accuracy of test data : ', test_data_accuracy)

Accuracy of training data :  0.8924731182795699


In [14]:
# Predict for one hand-written sample.
# The values are positional, so they must follow the feature column order the
# model was fitted with.

input_data = (0,59,0,0,0,0,0,0,1,1,0,0,0,0,1)

# tuple -> 1-D array -> single-row 2-D array (one instance, all features)
input_data_as_np_array = np.asarray(input_data)
input_data_reshaped = input_data_as_np_array.reshape(1, -1)

prediction = lr.predict(input_data_reshaped)
print(prediction)

# Class 0 is reported as low risk, anything else as high risk.
low_risk = prediction[0] == 0
print('Chance of having lung cancer is low. Kindly be aware' if low_risk
      else 'Very high chance of having lung cancer. Need immediate checkup !!!!!')

[0]
Chance of having lung cancer is low. Kindly be aware


## Save the trained model

In [15]:
import pickle

In [16]:
# Serialize the fitted model to disk.
# The `with` block closes the file handle as soon as the dump finishes, so the
# pickle is fully flushed before the next cell reads it back.
filename = 'trained_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

In [17]:
# Load the saved model back from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you created yourself or otherwise trust.

with open('trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [18]:
# Repeat the single-sample prediction using the model restored from the pickle.
input_data = (0,59,0,0,0,0,0,0,1,1,0,0,0,0,1)

# tuple -> 1-D array -> single-row 2-D array (one instance, all features)
input_data_as_np_array = np.asarray(input_data)
input_data_reshaped = input_data_as_np_array.reshape(1, -1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

# Class 0 is reported as low risk, anything else as high risk.
if prediction[0] != 0:
    print('Very high chance of having lung cancer. Need immediate checkup !!!!!')
else:
    print('Chance of having lung cancer is low. Kindly be aware')

[0]
Chance of having lung cancer is low. Kindly be aware
