In [190]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [191]:
df = pd.read_csv('risk_factors_cervical_cancer_cleaned.csv')

In [192]:
# Create arrays for the features and the response variable
y = df['Biopsy'].values
X = pd.concat([df['Schiller'], 
               df['Hinselmann'], 
               df['Citology'],
               df['STDs:genital herpes'],
               df['STDs:HIV'],
               df['STDs'],
               df['STDs (number)'],
               df['Hormonal Contraceptives (years)']], axis=1).values


In [200]:
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

In [201]:
# KNeighborsClassifier

param_grid = {'n_neighbors': np.arange(1,50), 'p':[2, 3]}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)

knn_cv.fit(X_train, y_train)

print(knn_cv.best_params_)
print(knn_cv.best_score_)

{'n_neighbors': 5, 'p': 2}
0.948979591837


In [202]:
# Predict the labels of the test data: y_pred
y_pred = knn_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[155   6]
 [  7   4]]
             precision    recall  f1-score   support

          0       0.96      0.96      0.96       161
          1       0.40      0.36      0.38        11

avg / total       0.92      0.92      0.92       172



In [203]:
# DecisionTreeClassifier

param_grid = {'max_depth':[3,5,8,15,25,30,None],
              'max_features': ['auto', 'sqrt', 'log2', None],
              'min_samples_leaf': [1,2,5,10],
              'min_samples_split': [2,5,10,15,100],
              'criterion': ['gini', 'entropy']}

tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree, param_grid, cv=5)

tree_cv.fit(X_train, y_train)

print(tree_cv.best_params_)
print(tree_cv.best_score_)


{'criterion': 'entropy', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10}
0.965014577259


In [204]:
# Predict the labels of the test data: y_pred
y_pred = tree_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[157   4]
 [  2   9]]
             precision    recall  f1-score   support

          0       0.99      0.98      0.98       161
          1       0.69      0.82      0.75        11

avg / total       0.97      0.97      0.97       172



In [207]:
# RandomForestClassifier

param_grid = {"max_depth": [5,8,15,25,30,None], 
              "min_samples_leaf": [1,2,5,10],
              "min_samples_split": [2,5,10,15,100],
              'max_features': ['auto', 'sqrt', 'log2', None],
              "n_estimators": [10,20,100],
              'criterion': ['gini', 'entropy']}
rfc = RandomForestClassifier()
rfc_cv = GridSearchCV(rfc, param_grid, cv=5)

# Fit the classifier to the training data
rfc_cv.fit(X_train, y_train)

print(tree_cv.best_params_)
print(tree_cv.best_score_)


{'criterion': 'entropy', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10}
0.965014577259


In [208]:
# Predict the labels of the test data: y_pred
y_pred = rfc_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[157   4]
 [  3   8]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       161
          1       0.67      0.73      0.70        11

avg / total       0.96      0.96      0.96       172



In [209]:
# SVM

param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              "gamma": [0.1, 0.01, 0.001, 0.0001, 'auto'],
              "class_weight": ['balanced', None]}
svm = SVC()
svm_cv = GridSearchCV(svm, param_grid, cv=5)

# Fit the classifier to the training data
svm_cv.fit(X_train, y_train)

# Predict the labels of the test data: y_pred
y_pred = svm_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[157   4]
 [  3   8]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       161
          1       0.67      0.73      0.70        11

avg / total       0.96      0.96      0.96       172



In [210]:
# LogisticRegression

param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100,1000],
              'penalty': ['l1', 'l2']}

# Create the classifier: logreg
logreg = LogisticRegression()

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit the classifier to the training data
logreg_cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg_cv.predict(X_test)

# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[158   3]
 [  5   6]]
             precision    recall  f1-score   support

          0       0.97      0.98      0.98       161
          1       0.67      0.55      0.60        11

avg / total       0.95      0.95      0.95       172



In [211]:
# Pipeline_SVM
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Instantiate the GridSearchCV object: cv
cv =  GridSearchCV(pipeline, param_grid=parameters) 

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9534883720930233
             precision    recall  f1-score   support

          0       0.97      0.98      0.98       161
          1       0.67      0.55      0.60        11

avg / total       0.95      0.95      0.95       172

Tuned Model Parameters: {'SVM__C': 1, 'SVM__gamma': 0.01}
[[158   3]
 [  5   6]]
