In [1]:
import numpy as np
import scipy.stats as s
import pandas as pd

In [45]:
# Dataset has features which helps us in predicting if a person should take the exam for testing cervical cancer or not
dataFrame = pd.read_csv("DataSets/cervical_cancer.csv")
dataFrame.head(10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0
5,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
6,51,3.0,17.0,6.0,1.0,34.0,3.4,0.0,0.0,1.0,...,?,?,0,0,0,0,1,1,0,1
7,26,1.0,26.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,...,?,?,0,0,0,0,0,0,0,0
8,45,1.0,20.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,1,0,1,1,0,0,0,0
9,44,3.0,15.0,?,1.0,1.266972909,2.8,0.0,0.0,?,...,?,?,0,0,0,0,0,0,0,0


In [42]:
dataFrame.drop(['STDs: Time since first diagnosis','STDs: Time since last diagnosis'],inplace=True , axis=1) # since these two columns are empty

In [40]:
# check if there exists any columns with null values
data = pd.DataFrame(dataFrame[dataFrame[dataFrame.columns] != '?']) # removing all values having '?' as their value and replacing them with NaN values
data.isna().sum() # calculate how many null values are their for each column

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

In [5]:
# check the datatype of the values in columns
data.dtypes

Age                                    int64
Number of sexual partners             object
First sexual intercourse              object
Num of pregnancies                    object
Smokes                                object
Smokes (years)                        object
Smokes (packs/year)                   object
Hormonal Contraceptives               object
Hormonal Contraceptives (years)       object
IUD                                   object
IUD (years)                           object
STDs                                  object
STDs (number)                         object
STDs:condylomatosis                   object
STDs:cervical condylomatosis          object
STDs:vaginal condylomatosis           object
STDs:vulvo-perineal condylomatosis    object
STDs:syphilis                         object
STDs:pelvic inflammatory disease      object
STDs:genital herpes                   object
STDs:molluscum contagiosum            object
STDs:AIDS                             object
STDs:HIV  

In [46]:
# convert all the data to numeric data
data = data.convert_objects(convert_numeric=True)
data.isnull().sum() 

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

In [8]:
# replace all the null values of column with mean of that column
data.fillna(data.mean(),inplace=True)
data.isnull().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


In [9]:
# Length of each class data
examsShouldBeDoneCount = len(data[data.Biopsy == 1 ])
examsShouldNotBeDoneCount = len(data[data.Biopsy == 0 ])
print("Exams Should Be Done ",examsShouldBeDoneCount)
print("Exams Should Not Be Done ",examsShouldNotBeDoneCount)

Exams Should Be Done  55
Exams Should Not Be Done  803


In [10]:
# length of training data 
trainingDataRowNumbers = int(0.11*len(data))
trainingDataRowNumbers

94

In [11]:
# Taking equal amount of both classes
examShouldBeDoneTrainingData = pd.DataFrame(data[data.Biopsy == 1 ]).iloc[0:47]
examShouldNotBeDoneTrainingData = pd.DataFrame(data[data.Biopsy == 0]).iloc[0:47]
print("Exams should be done training data ",examShouldBeDoneTrainingData.shape)
print("Exams should not be done training data ",examShouldNotBeDoneTrainingData.shape)

Exams should be done training data  (47, 34)
Exams should not be done training data  (47, 34)


In [12]:
# merge all the data 
trainingData = examShouldBeDoneTrainingData.append(examShouldNotBeDoneTrainingData,ignore_index=True)
trainingData.shape

(94, 34)

In [13]:
# drop the class column to be predicted
trainingData.drop(["Biopsy"],inplace=True,axis=1)

In [25]:
trainingData.columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology'],
      dtype='object')

In [15]:
# calculate the prior probablity (which should be equal so that they can be cancelled out)
priorProbExamShouldBeDone = len(examShouldBeDoneTrainingData)/trainingDataRowNumbers
priorProbExamShouldNotBeDone = len(examShouldNotBeDoneTrainingData)/trainingDataRowNumbers
print("Prior probablity should be done ",priorProbExamShouldBeDone)
print("Prior probablity should not be done ",priorProbExamShouldNotBeDone)

Prior probablity should be done  0.5
Prior probablity should not be done  0.5


In [16]:
# Means of each class columns
meanExamShouldBeDone = examShouldBeDoneTrainingData.mean()
meanExamShouldNotBeDone = examShouldNotBeDoneTrainingData.mean()
print("Should be done mean ",meanExamShouldBeDone.shape)
print("Should not be done mean ",meanExamShouldNotBeDone.shape)

Should be done mean  (34,)
Should not be done mean  (34,)


In [17]:
# Covariance matrix of each class 
covMatrixExamShouldBeDone = examShouldBeDoneTrainingData.cov()
covMatruxExamShouldNotBeDone = examShouldNotBeDoneTrainingData.cov()
print("should be done",covMatrixExamShouldBeDone.shape)
print("should not be done",covMatruxExamShouldNotBeDone.shape)

should be done (34, 34)
should not be done (34, 34)


In [18]:
# Calculating the determinant
print("Det should be done ",np.linalg.det(np.array(covMatrixExamShouldBeDone)))
print("Det should not be done ",np.linalg.det(np.array(covMatruxExamShouldNotBeDone)))

Det should be done  0.0
Det should not be done  0.0


In [19]:
# Calculating the pooled covariance matrix
dfShouldBeDone = len(examShouldBeDoneTrainingData) - 1
dfShouldNotBeDone = len(examShouldNotBeDoneTrainingData) - 1
pooledCovMatrix = (dfShouldBeDone * covMatrixExamShouldBeDone + dfShouldNotBeDone * covMatruxExamShouldNotBeDone ) / dfShouldBeDone + dfShouldNotBeDone
print("Pooled Cov Matrix",pooledCovMatrix.shape)

Pooled Cov Matrix (34, 34)


In [20]:
# Again caclculating the determinant
np.linalg.det(pooledCovMatrix)

0.0

In [34]:
# Using regularized analysis approach (RDA)
alpha = 0.8
regularizedCovMatrix = alpha * pooledCovMatrix + (1-alpha) * pooledCovMatrix
print("Resularized Cov Matrix ",regularizedCovMatrix.shape)

Resularized Cov Matrix  (34, 34)


In [35]:
# Again calculating determinant of RDA extracted matrix 
np.linalg.det(regularizedCovMatrix)

0.0

In [23]:
# Applying PCA
covData = data.cov() # Taking the covariance of full data Frame
a , b, c = np.linalg.svd(covData) # Applying single value decomposition
print("A",a.shape) # Orthogonal matrix
print("B" ,b.shape)# eigen value matrix containing variances in decreasing order for each column
print("C" , c.shape)# trancpose of orthogonal matrix

A (34, 34)
B (34,)
C (34, 34)


In [24]:
a

array([[-9.69643268e-01,  1.14971800e-01,  1.01415100e-01, ...,
        -2.56827077e-17,  3.03291621e-18, -7.07962901e-20],
       [-1.73613444e-02, -8.01420372e-02, -1.24704458e-02, ...,
         1.77896840e-17,  3.04764888e-18,  3.74289199e-19],
       [-1.22188097e-01,  1.64385461e-01,  2.14604165e-01, ...,
         2.38542165e-17, -5.51454152e-18,  2.37693968e-19],
       ...,
       [-3.52563991e-03, -3.74620483e-03, -4.14952449e-03, ...,
        -2.21513483e-16,  1.84242872e-17, -2.99936020e-17],
       [ 3.30564851e-04,  3.87204039e-05, -5.24930548e-03, ...,
        -8.74848446e-16, -8.25177275e-17,  1.52103621e-18],
       [-1.74554724e-03, -2.36004066e-03, -4.53510197e-03, ...,
         5.24331228e-16,  9.13990519e-17,  2.32205089e-17]])