# **Importing Libraries**

In [None]:
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report



# **EXPLORATORY DATA ANALYSIS**

In [None]:
# Load the dataset from 'cc.csv'.
# The file appears to mark missing entries with a literal '?'. Left as-is,
# those strings make every affected column object dtype instead of numeric,
# so '?' is mapped to NaN at read time.
df = pd.read_csv('cc.csv', na_values='?')
print(df)
print('============ Total number of columns====================')
print(df.columns)

     Age Number of sexual partners First sexual intercourse  \
0     18                       4.0                     15.0   
1     15                       1.0                     14.0   
2     34                       1.0                        ?   
3     52                       5.0                     16.0   
4     46                       3.0                     21.0   
..   ...                       ...                      ...   
853   34                       3.0                     18.0   
854   32                       2.0                     19.0   
855   25                       2.0                     17.0   
856   33                       2.0                     24.0   
857   29                       2.0                     20.0   

    Num of pregnancies Smokes Smokes (years) Smokes (packs/year)  \
0                  1.0    0.0            0.0                 0.0   
1                  1.0    0.0            0.0                 0.0   
2                  1.0    0.0          

In [None]:

# Show the column labels of the loaded frame beneath a banner line.
banner = '============ Total number of columns===================='
print(banner, df.columns, sep='\n')

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')


**DESCRIPTIVE STATISTICS**

In [None]:

# Print the per-column summary statistics produced by DataFrame.describe().
print('================================== DATA STATISTICS ==========================')
summary_stats = df.describe()
print(summary_stats)



              Age  STDs: Number of diagnosis   Dx:Cancer      Dx:CIN  \
count  858.000000                 858.000000  858.000000  858.000000   
mean    26.820513                   0.087413    0.020979    0.010490   
std      8.497948                   0.302545    0.143398    0.101939   
min     13.000000                   0.000000    0.000000    0.000000   
25%     20.000000                   0.000000    0.000000    0.000000   
50%     25.000000                   0.000000    0.000000    0.000000   
75%     32.000000                   0.000000    0.000000    0.000000   
max     84.000000                   3.000000    1.000000    1.000000   

           Dx:HPV          Dx  Hinselmann    Schiller    Citology      Biopsy  
count  858.000000  858.000000  858.000000  858.000000  858.000000  858.000000  
mean     0.020979    0.027972    0.040793    0.086247    0.051282    0.064103  
std      0.143398    0.164989    0.197925    0.280892    0.220701    0.245078  
min      0.000000    0.000000  

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Target variable distribution.
# The loaded frame has no 'LUNG_CANCER' column (this cell previously raised
# AttributeError), so the 'Biopsy' column is used as the target instead.
# NOTE(review): confirm 'Biopsy' is the intended label; other columns
# (e.g. 'Dx:Cancer') might be what the analysis is meant to predict.
print(df['Biopsy'].value_counts())
sns.countplot(x=df['Biopsy']);
plt.title("Target Distribution")
plt.show()

AttributeError: ignored

# **Checking Null Values**

In [None]:
# Count missing values per column.
# Missing entries can appear as the literal string '?' in this data, which
# isnull() does not treat as missing, so '?' is mapped to NaN before counting.
df.replace('?', np.nan).isnull().sum()

# **Checking Categorical Features**

In [None]:
# List the columns that are stored with object dtype.
object_columns = df.select_dtypes(include=['object'])
object_columns.dtypes

# **Converting Categorical Feature Diagnosis**

In [None]:
# Make every column numeric so it can be used for correlation and modelling.
# The frame has no 'LUNG_CANCER' or 'GENDER' column, so label-encoding those
# names raises KeyError. The object-dtype columns here hold numbers written as
# text, with '?' standing in for missing values, so they are parsed as numbers
# rather than label-encoded.
for column in df.select_dtypes(include=['object']).columns:
    # errors='coerce' turns unparseable entries (the '?' placeholders) into NaN.
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Fill the gaps with each column's median, because the resampling and SVM
# steps further down cannot handle NaN.
# NOTE(review): median imputation is a simple default; confirm it suits the
# analysis (columns with very few observed values may deserve dropping instead).
df = df.fillna(df.median())


In [None]:
# Split the rows by target class and print the size of each group.
# 'Biopsy' is used because the frame has no 'LUNG_CANCER' column.
# NOTE(review): confirm 'Biopsy' is the intended target column.
bg = df[df['Biopsy'] == 1]  # rows where the target equals 1
mg = df[df['Biopsy'] == 0]  # rows where the target equals 0
print(bg.shape, mg.shape)

# **Visualization of Features Distribution**

In [None]:
# Plot the distribution of every column on a 3-wide grid.
# The number of grid rows is derived from the number of columns so that none
# are silently skipped (a fixed 10x3 grid only has room for the first 30).
n_grid_cols = 3
n_grid_rows = max(1, (len(df.columns) + n_grid_cols - 1) // n_grid_cols)

# Two inches of height per row keeps the same proportions as a 14x20 figure
# with 10 rows.
plt.figure(figsize=(14, 2 * n_grid_rows))

for plotnumber, column in enumerate(df.columns, start=1):
    ax = plt.subplot(n_grid_rows, n_grid_cols, plotnumber)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # sns.histplot(..., kde=True) is the suggested replacement.
    sns.distplot(df[column])
    plt.xlabel(column)

plt.tight_layout()
plt.show()

# **Checking Features Correlation**

In [None]:
# Heatmap of the pairwise correlations between columns.
# Only numeric/boolean columns are passed to corr(): with pandas >= 2.0,
# DataFrame.corr() raises if the frame still contains string columns.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 10))
sns.heatmap(numeric_df.corr(), annot=True, linewidths=1);

# **Feature Selection using Pearson Correlation**

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The correlation matrix is computed from ``dataset`` itself (not from any
    notebook-level frame), using only its numeric/boolean columns so that
    text columns cannot make ``corr()`` fail.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For each correlated pair only the
        later column (in column order) is reported.
    """
    corr_matrix = dataset.select_dtypes(include=['number', 'bool']).corr()
    columns = corr_matrix.columns
    # Row i, columns [0, i) is the strictly-lower-triangular part of the
    # matrix, i.e. the correlations of column i with every earlier column.
    # NaN correlations compare as False and are therefore never flagged.
    return {
        columns[i]
        for i in range(len(columns))
        if (corr_matrix.iloc[i, :i].abs() > threshold).any()
    }

In [None]:
# Flag the features whose absolute correlation with another column exceeds
# 0.55, then show how many were flagged.
corr_features = correlation(dataset=df, threshold=0.55)
len(corr_features)

In [None]:
# Display the set of column names flagged as highly correlated.
corr_features


In [None]:
# Remove the highly correlated features from df.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves df unchanged.
# The target column is kept even if it was flagged, otherwise it could no
# longer be separated from the features afterwards.
# NOTE(review): 'Biopsy' is assumed to be the target column — confirm.
target_column = 'Biopsy'
columns_to_drop = [name for name in corr_features if name != target_column]
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

In [None]:
# Separate the target from the features: pop() removes the column from df
# and returns it.
# 'Biopsy' is used because the frame has no 'LUNG_CANCER' column.
# NOTE(review): confirm 'Biopsy' is the intended target column.
y = df.pop('Biopsy')

# **HANDLING IMBALANCED DATASET**

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [None]:
# Implementing Oversampling for Handling Imbalanced
smk = SMOTETomek(random_state=42)
xdata,ydata=smk.fit_resample(df,y)

In [None]:
# Show the shapes of the resampled features and labels.
xdata.shape,ydata.shape

In [None]:
from collections import Counter

# Compare the class counts before and after resampling.
# The "original" line counts the labels in y; Counter(xdata) would iterate the
# resampled frame's column names rather than any class labels.
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(ydata)))

# **Train Test Split**

In [None]:
# Hold out 30% of the original rows for testing.
# random_state makes the split reproducible; stratify=y keeps the class ratio
# of the target the same in both partitions.
train_X, test_X, train_y, test_y = train_test_split(
    df, y, test_size=0.3, random_state=42, stratify=y
)

# Rebalance the training partition only. Splitting df/y without any
# resampling would train the model on the imbalanced data, while resampling
# before the split would let synthetic samples leak into the test set.
train_X, train_y = SMOTETomek(random_state=42).fit_resample(train_X, train_y)

print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)

# **Model Training**

In [None]:
# Fit a support-vector classifier (default settings) on the training split.
from sklearn.svm import SVC

SVM = SVC()
SVM.fit(train_X, train_y)
ModelSVM = SVM  # fit() returns the estimator itself, so both names refer to it

# **Model Testing**

In [None]:
# Predict labels for the held-out test features with the fitted SVM.
PredictionSVM = SVM.predict(test_X)

# **Training Accuracy**

In [None]:
# Score the fitted SVM on the training split and report it as a percentage.
print("=====================SVM Training Accuarcy=============")
tracSVM = SVM.score(train_X, train_y)
trainingAccSVM = 100 * tracSVM
print(trainingAccSVM)


# **Testing Accuracy**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy on the held-out split as a percentage, followed by the
# classification report; the confusion matrix is the cell's displayed value.
print("====================SVM Testing Accuracy============")
teacSVM = accuracy_score(test_y, PredictionSVM)
testingAccSVM = 100 * teacSVM
print(testingAccSVM)
print(classification_report(test_y, PredictionSVM))
confusion_matrix(test_y, PredictionSVM)

In [None]:
#Graphical Representation
import seaborn as sns
import matplotlib.pyplot as plt
# sns.pairplot(df)
# plt.show()