In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
df=pd.read_csv("survey lung cancer.csv")

In [7]:
df.shape

(309, 16)

In [9]:
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [10]:
df.isnull().sum()

Unnamed: 0,0
GENDER,0
AGE,0
SMOKING,0
YELLOW_FINGERS,0
ANXIETY,0
PEER_PRESSURE,0
CHRONIC DISEASE,0
FATIGUE,0
ALLERGY,0
WHEEZING,0


In [11]:
df.nunique()

Unnamed: 0,0
GENDER,2
AGE,39
SMOKING,2
YELLOW_FINGERS,2
ANXIETY,2
PEER_PRESSURE,2
CHRONIC DISEASE,2
FATIGUE,2
ALLERGY,2
WHEEZING,2


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [13]:
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [15]:
df['LUNG_CANCER'].value_counts()

Unnamed: 0_level_0,count
LUNG_CANCER,Unnamed: 1_level_1
YES,270
NO,39


In [16]:
#doing label encoding
object_class=df.select_dtypes(include='object').columns

encoders={}
le=LabelEncoder()
for i in object_class:
  df[i]=le.fit_transform(df[i])
  encoders[i]=le

In [17]:
# outliers
def outlier_remove(df, col):
    Q1 = df[col].quantile(0.25)  # Calculate Q1 for the specific column
    Q3 = df[col].quantile(0.75)  # Calculate Q3 for the specific column
    IQR = Q3 - Q1

    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)


    median=df[col].median()


    df[col]=df[col].apply(lambda x:median if x<lower_bound or x>upper_bound else x)

    return df

In [18]:
numerical_col=df.select_dtypes(exclude='object').columns

In [19]:
numerical_col

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [20]:
df['LUNG_CANCER'].value_counts()

Unnamed: 0_level_0,count
LUNG_CANCER,Unnamed: 1_level_1
1,270
0,39


In [21]:
x=df.drop('LUNG_CANCER',axis=1)
y=df['LUNG_CANCER']

In [23]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [24]:
smote=SMOTE(random_state=42)
x_new,y_new=smote.fit_resample(x_train,y_train)

In [25]:
models={
    "decison tree":DecisionTreeClassifier(),
    "random forest":RandomForestClassifier(),
    "logistic regression":LogisticRegression(),
    "svm":SVC()
}

In [26]:
cv_scores={}

#perform 5-fold cross validation for each model
for model_name,model in models.items():
  scores=cross_val_score(model,x_new,y_new,cv=8)
  cv_scores[model_name]=scores
  print(f"{model_name} cross validation accuracy:{np.mean(scores):.2f}")
  print("-"*50 )

decison tree cross validation accuracy:0.92
--------------------------------------------------
random forest cross validation accuracy:0.95
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic regression cross validation accuracy:0.95
--------------------------------------------------
svm cross validation accuracy:0.58
--------------------------------------------------


In [27]:
decison_tree=DecisionTreeClassifier(random_state=42)
random_forest=RandomForestClassifier(random_state=42)
logistic_regression=LogisticRegression(random_state=42)
svm=SVC(random_state=42)

In [28]:
#hyperparameter grids for RandomizedSearchCV

param_grid_dt={
    "criterion":["gini","entropy"],
    "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}
param_grid_rf={
    "n_estimators":[100,200,300],
    "criterion":["gini","entropy"],
     "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}
param_grid_lr={
    "C":[0.001,0.01,0.1,1,10,100],
    "penalty":["l1","l2"],
    "solver":["liblinear"],
}

param_grid_svc={
    "C":[0.001,0.01,0.1,1,10,100],
    "kernel":["linear","rbf","poly"],
}

In [29]:
#hyperparameter tuning for both models
#the below steps can be autyomated by using a for loop or by using a pipeine
#perform randomizedsearchcv for each model

random_search_dt=RandomizedSearchCV(estimator=decison_tree,param_distributions=param_grid_dt,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_rf=RandomizedSearchCV(estimator=random_forest,param_distributions=param_grid_rf,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_lr=RandomizedSearchCV(estimator=logistic_regression,param_distributions=param_grid_lr,n_iter=10,cv=5,scoring="accuracy",random_state=42)

random_search_svc=RandomizedSearchCV(estimator=svm,param_distributions=param_grid_svc,n_iter=10,cv=5,scoring="accuracy",random_state=42)


In [30]:
random_search_dt.fit(x_new,y_new)
random_search_rf.fit(x_new,y_new)
random_search_lr.fit(x_new,y_new)
random_search_svc.fit(x_new,y_new)

In [31]:
print(random_search_dt.best_params_)
print(random_search_rf.best_params_)

{'min_samples_split': 2, 'max_depth': 5, 'criterion': 'entropy'}
{'n_estimators': 200, 'min_samples_split': 2, 'max_depth': 5, 'criterion': 'gini'}


In [32]:
print(random_search_dt.best_score_)
print(random_search_rf.best_score_)

0.9238095238095239
0.9476190476190476


In [33]:
#get the model with best score

best_model=None
best_score=0

if random_search_dt.best_score_>best_score:
  best_score=random_search_dt.best_score_
  best_model=random_search_dt.best_estimator_

if random_search_rf.best_score_>best_score:
  best_score=random_search_rf.best_score_
  best_model=random_search_rf.best_estimator_

if random_search_svc.best_score_>best_score:
  best_score=random_search_svc.best_score_
  best_model=random_search_svc.best_estimator_

In [34]:
print(best_model)
print(best_score)

SVC(C=10, random_state=42)
0.9547619047619047


In [35]:
#save the best model
with open("Lung Cancer_model.sav","wb") as f:
  pickle.dump(best_model,f)

In [36]:
random_forest.fit(x_new,y_new)

In [37]:
#evaluate on test data
y_test_pred=best_model.predict(x_test)
print("accuracy score\n",accuracy_score(y_test,y_test_pred))
print("confusion matrix\n",confusion_matrix(y_test,y_test_pred))
print("classification report\n",classification_report(y_test,y_test_pred))

accuracy score
 0.9516129032258065
confusion matrix
 [[ 1  1]
 [ 2 58]]
classification report
               precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.98      0.97      0.97        60

    accuracy                           0.95        62
   macro avg       0.66      0.73      0.69        62
weighted avg       0.96      0.95      0.96        62



In [39]:
#building predictive model
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2,4,90)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = random_forest.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person do not have lung cancer')
else:
  print('The Person have lung cancer')

[1]
The Person have lung cancer


