# In [None]:
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import time

#Opening the file and naming the columns
# With `names` given and no `header` argument, pandas reads the first line of
# the file as data and labels the columns with the names below.
column_names = [
    'ID number', 'Diagnosis',
    'Radius Mean', 'Texture Mean', 'Perimeter Mean', 'Area Mean',
    'Smoothness Mean', 'Compactness Mean', 'Concavity Mean',
    'Concave Points Mean', 'Symmetry Mean', 'Fractal Dimension mean',
    'Radius SE', 'Texture SE', 'Perimeter SE', 'Area SE',
    'Smoothness SE', 'Compactness SE', 'Concavity SE',
    'Concave Points SE', 'Symmetry SE', 'Fractal Dimension SE',
    'Radius Worst', 'Texture Worst', 'Perimeter Worst', 'Area Worst',
    'Smoothness Worst', 'Compactness Worst', 'Concavity Worst',
    'Concave Points Worst', 'Symmetry Worst', 'Fractal Dimension Worst',
]
data = pd.read_csv('cancerdata.csv', names=column_names)

#Preprocessing and data cleaning
# The ID column is an identifier, not a feature, so it is dropped.
data = data.drop('ID number', axis=1)

# Encode the label as integers: 'M' -> 0, 'B' -> 1.
# The encoded column is assigned back to the frame instead of calling
# replace(..., inplace=True) on data['Diagnosis']: that chained in-place call
# works on a column selection, triggers a FutureWarning in pandas 2.x and does
# not update `data` when Copy-on-Write is enabled.
# Any value other than 'M'/'B' becomes NaN and shows up in the null counts
# printed below.
data['Diagnosis'] = data['Diagnosis'].map({'M': 0, 'B': 1})

# Feature matrix (every column except the label) and target vector.
X = data.drop('Diagnosis', axis=1)
Y = data['Diagnosis']

# Per-column count of missing values, as a quick sanity check.
print(data.isnull().sum())

#Splitting the data
# The split happens BEFORE scaling so that the held-out rows never influence
# the scaler (fitting the scaler on the full dataset leaks test-set min/max
# values into training and makes the test metrics optimistic).
# random_state pins the shuffle so the metrics printed further down are
# reproducible; stratify=Y keeps the class ratio the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

#Normalizing the data
# Learn the per-feature min/max from the training rows only, then apply that
# same transformation to the test rows. Test values may fall slightly outside
# [0, 1]; that is expected. Both results are NumPy arrays.
# NOTE: X itself is left unscaled; only x_train / x_test are normalized.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#Logistic regression: timed fit, predictions, then accuracy, confusion matrix, ROC curve, precision and F1

# Tick labels for the confusion-matrix plots, listed in the order of the
# encoded classes 0 and 1.
lbls = ['malignant', 'benign']

lr_clf = LogisticRegression()

# Wall-clock time spent in fit() only.
start = time.time()
lr_clf.fit(X=x_train, y=y_train)
end = time.time()
fit_seconds = end - start
print('time to fit the model:', fit_seconds)

# Hard class predictions on the held-out split.
y_pred_lr = lr_clf.predict(X=x_test)

lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print('accuracy score is:', lr_accuracy)

# Rows are true classes, columns are predicted classes.
lr_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(lr_cm)

# Draw the confusion matrix and the ROC curve, then show both figures.
lr_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=lr_cm,
    display_labels=lbls,
)
lr_cm_display.plot()
RocCurveDisplay.from_estimator(estimator=lr_clf, X=x_test, y=y_test)
plt.show()

# precision_score / f1_score use their default positive label (1).
precision = precision_score(y_true=y_test, y_pred=y_pred_lr)
print('precision is', precision)

f1 = f1_score(y_true=y_test, y_pred=y_pred_lr)
print('f1 score is', f1)


#Applying KNN,grid search and calculating evaluation metrics, curve, matrix and calculating fit time

# Exhaustive search over the neighbour count (1..29) and the vote weighting,
# scored by cross-validated accuracy on the training split.
parameters = {'n_neighbors': range(1, 30), 'weights': ['uniform', 'distance']}

Grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    scoring='accuracy',
    return_train_score=True,
)

Grid_search.fit(x_train, y_train)
print(Grid_search.best_params_)

# Build the final model from the parameters the search just selected rather
# than from values copied out of an earlier run: the train/test split changes
# between runs, so a hard-coded n_neighbors can disagree with the printed
# best_params_ (and the tuned 'weights' setting would be dropped entirely).
# A fresh estimator is created (instead of reusing best_estimator_) so that
# the fit below can be timed on its own.
knn_clf = KNeighborsClassifier(**Grid_search.best_params_)

# Wall-clock time spent in fit() only.
start = time.time()
knn_clf.fit(x_train, y_train)
end = time.time()
print('time to fit the model:', end - start)

# Hard class predictions on the held-out split.
y_pred_knn = knn_clf.predict(x_test)

print('accuracy score is:', accuracy_score(y_test, y_pred_knn))

# Rows are true classes, columns are predicted classes.
knn_cm = confusion_matrix(y_test, y_pred_knn)
print(knn_cm)

# Draw the confusion matrix and the ROC curve, then show both figures.
ConfusionMatrixDisplay(knn_cm, display_labels=lbls).plot()

RocCurveDisplay.from_estimator(knn_clf, x_test, y_test)
plt.show()

# precision_score / f1_score use their default positive label (1).
precision = precision_score(y_test, y_pred_knn)
print('precision is', precision)

f1 = f1_score(y_test, y_pred_knn)
print('f1 score is', f1)


#Applying random forest, randomized search and calculating evaluation metrics, curve, matrix and calculating fit time

# Candidate settings: any tree count in 1..999 and either split criterion.
parameters2 = {'n_estimators': range(1, 1000), 'criterion': ['gini', 'entropy']}

# Randomized search samples a limited number of combinations from the space
# above (n_iter is left at its default) and scores each by cross-validated
# accuracy on the training split.
rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=parameters2,
    scoring='accuracy',
    return_train_score=True,
)
rand_search.fit(x_train, y_train)

print(rand_search.best_params_)

# Build the final model from the parameters the search just selected rather
# than from values copied out of an earlier run: the sampled candidates and
# the train/test split differ between runs, so hard-coded values would not
# match the printed best_params_.
# A fresh estimator is created (instead of reusing best_estimator_) so that
# the fit below can be timed on its own.
rf_clf = RandomForestClassifier(**rand_search.best_params_)

# Wall-clock time spent in fit() only.
start = time.time()
rf_clf.fit(x_train, y_train)
end = time.time()
print('time to fit the model:', end - start)

# Hard class predictions on the held-out split.
y_pred_rf = rf_clf.predict(x_test)
print('accuracy score is:', accuracy_score(y_test, y_pred_rf))

# Rows are true classes, columns are predicted classes.
rf_cm = confusion_matrix(y_test, y_pred_rf)
print(rf_cm)

# Draw the confusion matrix and the ROC curve, then show both figures.
ConfusionMatrixDisplay(rf_cm, display_labels=lbls).plot()
RocCurveDisplay.from_estimator(rf_clf, x_test, y_test)
plt.show()

# precision_score / f1_score use their default positive label (1).
precision = precision_score(y_test, y_pred_rf)
print('precision is', precision)

f1 = f1_score(y_test, y_pred_rf)
print('f1 score is', f1)
