In [60]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score,confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [61]:
# Location of the survey CSV. NOTE(review): this is an absolute path on one
# specific machine; the notebook will not run elsewhere without editing it.
DATASET_CSV = '/Users/omsitapara/Desktop/DiagnoSafe/Dataset/survey lung cancer.csv'
df = pd.read_csv(DATASET_CSV)

In [62]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0


In [63]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   GENDER                 309 non-null    int64
 1   AGE                    309 non-null    int64
 2   SMOKING                309 non-null    int64
 3   YELLOW_FINGERS         309 non-null    int64
 4   ANXIETY                309 non-null    int64
 5   PEER_PRESSURE          309 non-null    int64
 6   CHRONIC DISEASE        309 non-null    int64
 7   FATIGUE                309 non-null    int64
 8   ALLERGY                309 non-null    int64
 9   WHEEZING               309 non-null    int64
 10  ALCOHOL CONSUMING      309 non-null    int64
 11  COUGHING               309 non-null    int64
 12  SHORTNESS OF BREATH    309 non-null    int64
 13  SWALLOWING DIFFICULTY  309 non-null    int64
 14  CHEST PAIN             309 non-null    int64
 15  LUNG_CANCER            309 non-null    int64

In [64]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

33

In [65]:
# Keep only the first occurrence of each distinct row. The surviving rows
# keep their original index labels (the index is not reset).
df = df.drop_duplicates()

In [66]:
# (rows, columns) of the frame at this point, i.e. after duplicate removal.
df.shape

(276, 16)

In [67]:
# Per-column count of missing values.
df.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [68]:
# Separate the label column from the predictor columns.
Y = df['LUNG_CANCER']
X = df.drop(columns=['LUNG_CANCER'])

In [69]:
# Balance the two classes by oversampling the minority label.
#
# Both samplers are seeded so that "Restart Kernel -> Run All" reproduces the
# same resampled data, and therefore the same downstream metrics. Without a
# seed every run produced a different synthetic dataset.
#
# NOTE(review): resampling is applied to the full dataset BEFORE the
# train/test split, so synthetic rows derived from rows that later end up in
# the test set can leak into training; the reported test accuracies are
# likely optimistic. Consider splitting first and resampling only the
# training portion.
# NOTE(review): most feature columns look like 0/1 flags; SMOTE interpolates
# them as if they were continuous. SMOTENC/SMOTEN may be a better fit -
# TODO confirm.
RESAMPLE_SEED = 42  # same seed value the train/test split uses

oversample = RandomOverSampler(sampling_strategy='minority', random_state=RESAMPLE_SEED)
smote = SMOTE(sampling_strategy='minority', random_state=RESAMPLE_SEED)

# Only the SMOTE output is consumed by the visible cells below; the random
# oversampler result is kept so any other cell relying on it still works.
X_resampled_over, Y_resampled_over = oversample.fit_resample(X, Y)
X_resampled_smote, Y_resampled_smote = smote.fit_resample(X, Y)

# Class counts after SMOTE (both classes should now be the same size).
print("SMOTE:", Counter(Y_resampled_smote))

SMOTE: Counter({1: 238, 0: 238})


In [70]:
# Hold out 20% of the SMOTE-balanced data for testing; the fixed seed keeps
# the partition stable between runs.
(
    X_train_resampled,
    X_test_resampled,
    Y_train_resampled,
    Y_test_resampled,
) = train_test_split(
    X_resampled_smote,
    Y_resampled_smote,
    test_size=0.2,
    random_state=42,
)

In [71]:
# (rows, feature columns) of the training split.
X_train_resampled.shape

(380, 15)

In [72]:
# Linear-kernel support vector classifier. probability=True additionally
# enables predict_proba on the fitted model.
sv = svm.SVC(kernel='linear', probability=True).fit(X_train_resampled, Y_train_resampled)

# Accuracy on the rows the model was fitted on.
XS_train_pred = sv.predict(X_train_resampled)
XS_train_acc = accuracy_score(y_true=Y_train_resampled, y_pred=XS_train_pred)
print("SVC Training Accuracy : ", XS_train_acc*100)

# Accuracy on the held-out rows.
XS_test_pred = sv.predict(X_test_resampled)
XS_test_acc = accuracy_score(y_true=Y_test_resampled, y_pred=XS_test_pred)
print("SVC Testing Accuracy : ", XS_test_acc*100)

SVC Training Accuracy :  95.78947368421052
SVC Testing Accuracy :  93.75


In [73]:
# Random forest with 10 trees.
# The forest draws bootstrap samples and feature subsets at random, so it is
# seeded here; without random_state the accuracies below changed on every
# run and could not be reproduced.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train_resampled, Y_train_resampled)

# Accuracy on the rows the model was fitted on. accuracy_score expects
# (y_true, y_pred); the arguments are passed by keyword in that order.
XR_train_pred = rf.predict(X_train_resampled)
XR_train_acc = accuracy_score(y_true=Y_train_resampled, y_pred=XR_train_pred)
print("RF Training Accuracy : ", XR_train_acc*100)

# Accuracy on the held-out rows.
XR_test_pred = rf.predict(X_test_resampled)
XR_test_acc = accuracy_score(y_true=Y_test_resampled, y_pred=XR_test_pred)
print("RF Testing Accuracy : ", XR_test_acc*100)

RF Training Accuracy :  99.21052631578947
RF Testing Accuracy :  95.83333333333334


In [74]:
# Logistic regression; max_iter is raised to 1000 to give the solver more
# iterations to converge.
lr = LogisticRegression(max_iter=1000).fit(X_train_resampled, Y_train_resampled)

# Accuracy on the rows the model was fitted on.
XL_train_pred = lr.predict(X_train_resampled)
XL_train_acc = accuracy_score(y_true=Y_train_resampled, y_pred=XL_train_pred)
print("LR Training Accuracy : ", XL_train_acc*100)

# Accuracy on the held-out rows.
XL_test_pred = lr.predict(X_test_resampled)
XL_test_acc = accuracy_score(y_true=Y_test_resampled, y_pred=XL_test_pred)
print("LR Testing Accuracy : ", XL_test_acc*100)

LR Training Accuracy :  96.57894736842105
LR Testing Accuracy :  96.875


In [75]:
import pickle

In [76]:
# Persist the fitted logistic-regression model to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the previous bare open() left it to the garbage
# collector, which can leave a partially written file while the kernel lives.
filename = 'LungCancerTrainedModel.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

In [77]:
# Load the trained model back from disk, closing the file handle when done.
# NOTE(review): this absolute path is not the same string as the relative
# 'LungCancerTrainedModel.pkl' written by the previous cell. Unless the
# notebook's working directory is that TrainedModel folder, this loads an
# older copy rather than the model that was just trained - confirm.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load model files that come from a trusted source.
with open('/Users/omsitapara/Desktop/DiagnoSafe/TrainedModel/LungCancerTrainedModel.pkl', 'rb') as model_file:
    lung_model = pickle.load(model_file)

In [78]:
# One patient record to classify.
# NOTE(review): the values are assumed to follow the training column order
# (GENDER, AGE, SMOKING, ... CHEST PAIN) - confirm against the feature frame.
input_data = (0,68,1,0,1,0,0,1,0,0,0,0,0,0,0)
input_data_array = np.asarray(input_data)
input_data_reshaped = input_data_array.reshape(1, -1)  # one row, all values as columns

# An estimator fitted on a DataFrame records its column names in
# `feature_names_in_`; handing it a bare ndarray then triggers scikit-learn's
# "X does not have valid feature names" warning. Re-attach the names when the
# loaded model exposes them, and fall back to the raw array otherwise.
feature_names = getattr(lung_model, 'feature_names_in_', None)
if feature_names is not None:
    model_input = pd.DataFrame(input_data_reshaped, columns=feature_names)
else:
    model_input = input_data_reshaped

prediction = lung_model.predict(model_input)
print(prediction)

# Label 0 is reported as "not likely", any other label as "likely".
if prediction[0] == 0:
  print('The person is not likely to be Lung Cancer Patient')
else:
  print('The person is likely to be Lung Cancer Patient')

[0]
The person is not likely to be Lung Cancer Patient


