In [46]:
import pandas
import sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [33]:
#Import dataset as Pandas df
dataset = pandas.read_csv('/Users/milesmarkey/Downloads/kag_risk_factors_cervical_cancer.csv')
dataset.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [35]:
#Data preprocessing - replace all '?' with NaN, then remove all rows containing NaN

dataset.replace('?',np.nan,inplace=True)
droppedDs = dataset.dropna(axis=0,how='any',inplace=False)
print(droppedDs.shape)
#Note the resulting dataset is very small. 

(59, 36)


The first preprocessing step 

In [41]:
#Separate independent variable from dependent variables (Here the variable to be predicted is 'Biopsy')

X = droppedDs.drop('Biopsy',axis=1)
y = droppedDs.Biopsy

In [42]:
#Use scikitlearn's train/test split to randomly split data for training/testing (80/20 split)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [43]:
#Initialize SVM classifier with Linear Kernel function:
Linear_classifier = svm.SVC(kernel='linear')

In [44]:
#Train the classifiers

Linear_classifier.fit(X_train,y_train)

SVC(kernel='linear')

In [50]:
y_pred = Linear_classifier.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test,y_pred)
print(accuracy)
print(confusion_matrix(y_test,y_pred))

0.8333333333333334
[[8 2]
 [0 2]]


We can see here that the model has a reasonably high accuracy, but the sample size is very low because all rows that contained an NaN were dropped from the dataset. Additionally, the data is biased because it only includes patients who have an STD, since all others have an 'NaN' value in their 17th and 18th columns. To address this problem, columns 17 and 18 will be dropped first, then all rows containing NaNs will be dropped. This will leave us with a much larger dataset than we previoulsy had

In [58]:
std_first = dataset['STDs: Time since first diagnosis']
stf_last = dataset['STDs: Time since last diagnosis']
newDs1 = dataset.drop('STDs: Time since first diagnosis',axis=1)
newDs2 = newDs1.drop('STDs: Time since last diagnosis',axis=1)
newDs3 = newDs2.dropna(axis=0,how='any',inplace=False)
print(newDs3.shape)

(668, 34)


In [59]:
#Separate independent/dependent variable of new dataset
X2 = newDs3.drop('Biopsy',axis=1)
y2 = newDs3.Biopsy

In [63]:
#Train/test split
X_train,X_test,y_train,y_test = train_test_split(X2,y2,test_size=0.2)

In [64]:
Linear_classifier = svm.SVC(kernel='linear')
Linear_classifier.fit(X_train,y_train)

SVC(kernel='linear')

In [65]:
y_pred = Linear_classifier.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test,y_pred)
print(accuracy)
print(confusion_matrix(y_test,y_pred))

0.9552238805970149
[[121   5]
 [  1   7]]


Not only has the accuracy improved significantly, but the model is now trained/tested on a much larger dataset, increasing the significance of the results. The final step will be to use the full dataset. This will be done by replacing all remaining NaN values with sensible values. While these values will not represent actual data points, they will allow the patients to be included in the model training. This will be done column by column

In [69]:
age = dataset.Age
test = age.dropna(axis=0,how='any',inplace=False)
print(age.shape)
print(test.shape)
#All patients have an age. This will be used to approximate other values for these patients

(858,)
(858,)


In [87]:
columnsMissingData = []
for (columnName,columnData) in newDs2.iteritems():
    varNonNanData = columnData.dropna(axis=0,how='any',inplace=False)
    if not (len(varNonNanData) == len(newDs2)):
        columnsMissingData.append(columnName)
print(columnsMissingData)

['Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV']


This shows a list of the rows that contain at least 1 NaN value. 