In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 29 17:01:17 2024

@author: bhuvaneshd2
"""

#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import sympy as sp#For ignoring warning
from sympy import symbols
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings("ignore")
 

In [2]:
df=pd.read_csv('survey lung cancer.csv')
print(df)
print(df.shape)
#Checking for Duplicates
print(df.duplicated().sum())
#Removing Duplicates
df=df.drop_duplicates()

df.info()
df.describe()

    GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0        M   69        1               2        2              1   
1        M   74        2               1        1              1   
2        F   59        1               1        1              2   
3        M   63        2               2        2              1   
4        F   63        1               2        1              1   
..     ...  ...      ...             ...      ...            ...   
304      F   56        1               1        1              2   
305      M   70        2               1        1              1   
306      M   58        2               1        1              1   
307      M   67        2               1        2              1   
308      M   62        1               1        1              2   

     CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  \
0                  1         2         1         2                  2   
1                  2         2       

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0
mean,62.90942,1.543478,1.576087,1.496377,1.507246,1.521739,1.663043,1.547101,1.547101,1.550725,1.576087,1.630435,1.467391,1.557971
std,8.379355,0.499011,0.495075,0.500895,0.500856,0.500435,0.473529,0.498681,0.498681,0.498324,0.495075,0.483564,0.499842,0.49753
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.75,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.5,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [3]:
le=preprocessing.LabelEncoder()
df['GENDER']=le.fit_transform(df['GENDER'])
df['LUNG_CANCER']=le.fit_transform(df['LUNG_CANCER'])
df['SMOKING']=le.fit_transform(df['SMOKING'])
df['YELLOW_FINGERS']=le.fit_transform(df['YELLOW_FINGERS'])
df['ANXIETY']=le.fit_transform(df['ANXIETY'])
df['PEER_PRESSURE']=le.fit_transform(df['PEER_PRESSURE'])
df['CHRONIC DISEASE']=le.fit_transform(df['CHRONIC DISEASE'])
df['FATIGUE ']=le.fit_transform(df['FATIGUE '])
df['ALLERGY ']=le.fit_transform(df['ALLERGY '])
df['WHEEZING']=le.fit_transform(df['WHEEZING'])
df['ALCOHOL CONSUMING']=le.fit_transform(df['ALCOHOL CONSUMING'])
df['COUGHING']=le.fit_transform(df['COUGHING'])
df['SHORTNESS OF BREATH']=le.fit_transform(df['SHORTNESS OF BREATH'])
df['SWALLOWING DIFFICULTY']=le.fit_transform(df['SWALLOWING DIFFICULTY'])
df['CHEST PAIN']=le.fit_transform(df['CHEST PAIN'])
df['LUNG_CANCER']=le.fit_transform(df['LUNG_CANCER'])

In [4]:
#logistic regression considering Gender and age manual method
iterations = 10000
beta0=0
beta1=0
beta2=0
alpha=0.0001

# Performing gradient descent
for i in range(iterations):
    # Calculate probabilities.Here p represents the sigmoid function
    p = 1 / (1 + np.exp(-(beta0 + beta1 * df['GENDER'] + beta2 * df['AGE'])))

    # Updating co-effs.Here only Gender and age are considered to determine the lung cancer existence 
    beta0 -= alpha * np.sum(p - df['LUNG_CANCER'])
    beta1 -= alpha * np.sum((p - df['LUNG_CANCER']) * df['GENDER'])
    beta2 -= alpha * np.sum((p - df['LUNG_CANCER']) * df['AGE'])

# Print the coefficients
print("Estimated Coefficients:")
print(f"Intercept:(beta0) {beta0}")
print(f"Gender Coefficient:(beta1) {beta1}")
print(f"Age Coefficient:(beta2) {beta2}")

Estimated Coefficients:
Intercept:(beta0) -1.281909823237413
Gender Coefficient:(beta1) 0.9555802330902184
Age Coefficient:(beta2) -0.08943026162184564


In [5]:
x, y = sp.symbols('x y')


# logistic regression equation is deduced as
logistic_eq = beta0 + beta1 * x + beta2 * y

# Converting the equation to a function that can be evaluated numerically
logistic_func = sp.lambdify((x, y), 1 / (1 + sp.exp(-logistic_eq)), 'numpy')


g = int(input("Enter Gender(0-M/1-F): "))
a = int(input("Enter Age: "))


prob = logistic_func(g, a)
print(f"Probability of having lung cancer at the age {a} being a {'Male' if g == 0 else 'Female'} is {prob}")


Probability of having lung cancer at the age 55 being a Female is 0.005246185831326581


In [6]:



# Split the dataset into training and testing sets
X = df[['GENDER', 'AGE']]
y = df['LUNG_CANCER']
#random state=0 splits the testing data and training the same each time the code is executed ! randomness can be set by keeping it as 1
#this stmt is used to split testing data and training data,here 20% of data is used for testing remaining 80% for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Perform logistic regression
log_reg = LogisticRegression()#this stament creates a logistic regression class from sckit-learn,this var
#can be used to train the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)

input_features = [[1, 45]]  # Female (1), Age: 45

# Make prediction
prediction = log_reg.predict(input_features)

# Get the probability of the prediction
probability = log_reg.predict_proba(input_features)

# Print the prediction and probability
if prediction[0] == 1:
    print("Prediction: YES")
else:
    print("Prediction: NO")

print(f"Probability: {probability[0][1]:.4f}")




Accuracy: 0.8392857142857143
Prediction: YES
Probability: 0.8081


In [7]:
#DOES THE CORRELATION between each factor

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Print the correlation matrix
print(correlation_matrix)

                         GENDER       AGE   SMOKING  YELLOW_FINGERS   ANXIETY  \
GENDER                 1.000000 -0.013120  0.041131       -0.202506 -0.152032   
AGE                   -0.013120  1.000000 -0.073410        0.025773  0.050605   
SMOKING                0.041131 -0.073410  1.000000       -0.020799  0.153389   
YELLOW_FINGERS        -0.202506  0.025773 -0.020799        1.000000  0.558344   
ANXIETY               -0.152032  0.050605  0.153389        0.558344  1.000000   
PEER_PRESSURE         -0.261427  0.037848 -0.030364        0.313067  0.210278   
CHRONIC DISEASE       -0.189925 -0.003431 -0.149415        0.015316 -0.006938   
FATIGUE               -0.079020  0.021606 -0.037803       -0.099644 -0.181474   
ALLERGY                0.150174  0.037139 -0.030179       -0.147130 -0.159451   
WHEEZING               0.121047  0.052803 -0.147081       -0.058756 -0.174009   
ALCOHOL CONSUMING      0.434264  0.052049 -0.052771       -0.273643 -0.152228   
COUGHING               0.120