<a href="https://colab.research.google.com/github/nindob/breast_cancer-detection/blob/master/Science_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Classification of cancer diagnosis
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [0]:
#Import the dataset
#Prompt for a file upload in Colab, then read 'data.csv' into a DataFrame.
#NOTE(review): read_csv uses the fixed name 'data.csv' rather than the name(s)
#returned in `uploaded` - presumably the upload lands in the working directory
#under that name; confirm the uploaded file is actually called data.csv.
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('data.csv')
#Show the first 7 rows as a sanity check
df.head(7)

In [0]:
#Count the number of rows and columns in the data set: (rows, columns)
df.shape

In [0]:
#Count the number of missing values (NaN / None) in each column
df.isna().sum()

In [0]:
#Drop every column that contains at least one missing value.
#NOTE(review): the original note says this only removes the last column -
#dropna(axis=1) does not target a position, so check the isna() counts to confirm.
df = df.dropna(axis=1)

In [0]:
#Get the new (rows, columns) count after dropping the columns with missing values
df.shape

In [0]:
#Get a count of the number of Malignant (M) and Benign (B) diagnoses
df['diagnosis'].value_counts()

In [0]:
#Visualize the count of each diagnosis class
#Name the column by keyword: recent seaborn releases take `data` as the first
#positional argument, so a bare Series passed positionally is no longer read as `x`.
sns.countplot(x='diagnosis', data=df, label='count')

In [0]:
#Create a pair plot of the columns at positions 1-6, coloured by the 'diagnosis' column
sns.pairplot(df.iloc[:,1:7], hue = 'diagnosis')

In [0]:
#Get the correlation of the columns at positions 1-11
#Keep only numeric columns first: 'diagnosis' has not been encoded yet at this
#point, and DataFrame.corr() in pandas >= 2.0 raises on non-numeric data instead
#of silently skipping it. Older pandas versions give the same table either way.
df.iloc[:,1:12].select_dtypes(include='number').corr()

In [0]:
#Visualize the correlation as an annotated heatmap (cells shown as whole percentages)
plt.figure(figsize=(10,10))
#Keep only numeric columns before corr(): the not-yet-encoded 'diagnosis' column
#makes DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(df.iloc[:,1:12].select_dtypes(include='number').corr(), annot=True, fmt='.0%')

In [0]:
#Encode the categorical data values (the column at position 1) as integers
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
#Replace the whole column by label instead of `df.iloc[:,1] = ...`: on pandas >= 2.0
#the iloc form writes into the existing object-dtype column, so the values stay
#dtype=object and scikit-learn later rejects them as an unknown label type.
df[df.columns[1]] = labelencoder_Y.fit_transform(df.iloc[:,1].values)

In [0]:
#Split the data into Independent (X) and Dependent (Y) data sets
#Take every column after the target instead of a hard-coded end bound: `2:31`
#stops at position 30 and silently drops anything beyond it.
#NOTE(review): assumes all columns from position 2 onward are features - confirm
#against df.shape / df.columns.
X = df.iloc[:,2:].values
#The target is the encoded column at position 1
Y = df.iloc[:,1].values

In [0]:
# Split the dataset into the Training set (80%) and the Test set (20%);
# random_state is pinned to 0 so the split is the same on every run
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [0]:
#Scale the data (Feature Scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#Learn the mean / standard deviation from the training set only...
X_train = sc.fit_transform(X_train)
#...and reuse those statistics for the test set. Re-fitting on X_test would scale
#the two sets differently and leak test-set information into the evaluation.
X_test = sc.transform(X_test)

In [0]:
#Create a function that builds, trains and reports on every candidate model
def models(X_train, Y_train):
  """Fit seven scikit-learn classifiers on the training data.

  Each classifier is fitted on (X_train, Y_train); its accuracy on that same
  training data is then printed, one line per model.

  Returns the fitted estimators as a tuple, in this order: logistic
  regression, k-nearest neighbours, linear SVM, RBF-kernel SVM, Gaussian
  naive Bayes, decision tree, random forest.
  """
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.naive_bayes import GaussianNB
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.svm import SVC
  from sklearn.tree import DecisionTreeClassifier

  #Each entry pairs the report label with its (still unfitted) estimator.
  #The order here is the order of fitting, printing and of the returned tuple.
  candidates = (
    ('[0]Logistic Regression Training Accuracy:',
     LogisticRegression(random_state = 0)),
    ('[1]K-Nearest Neighbours Classifier Training Accuracy:',
     KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('[2]Support Vector Machine Classifier Training Accuracy:',
     SVC(kernel = 'linear', random_state = 0)),
    ('[3]K-Support Vector Machine Classifier Training Accuracy:',
     SVC(kernel = 'rbf', random_state = 0)),
    ('[4]Naive Bayes Training Accuracy:',
     GaussianNB()),
    ('[5]Decision Tree Training Accuracy:',
     DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('[6]Random Forest Classifier Training Accuracy:',
     RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)),
  )

  #Train everything first...
  for _, estimator in candidates:
    estimator.fit(X_train, Y_train)

  #...then print the models' accuracy on the training data
  for label, estimator in candidates:
    print(label, estimator.score(X_train, Y_train))

  return tuple(estimator for _, estimator in candidates)

In [0]:
#Train all of the models; `model` holds the tuple of fitted classifiers returned
#by models(), indexed in the same order as the [0]..[6] labels it prints
model = models(X_train, Y_train)

In [0]:
#Test each model's accuracy on the test data using its confusion matrix
from sklearn.metrics import confusion_matrix

for i in range(len(model)):
  print('Model', i)
  cm = confusion_matrix(Y_test, model[i].predict(X_test))

  #scikit-learn puts true labels on the rows and predicted labels on the columns,
  #with labels in sorted order, so a binary matrix reads [[TN, FP], [FN, TP]].
  #(Assumes the encoded target uses 1 for the positive class - confirm.)
  TN = cm[0][0]
  TP = cm[1][1]
  FN = cm[1][0]
  FP = cm[0][1]

  print(cm)
  #Accuracy = correctly classified samples / all samples
  print('Testing Accuracy =', (TP + TN)/ (TP + TN + FN + FP))
  print()

In [0]:
#Show another way to get metrics of the models: per-class precision / recall / F1
#plus overall accuracy on the test set
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

for i, clf in enumerate(model):
  print('Model', i)
  #Predict once and reuse the result for both metrics instead of running the
  #model over the test set twice
  Y_pred = clf.predict(X_test)
  print(classification_report(Y_test, Y_pred))
  print(accuracy_score(Y_test, Y_pred))
  print()

In [0]:
#Print the prediction of Random Forest Classifier Model
#models() returns the random forest last, at index 6; index 3 is the RBF-kernel SVM
pred = model[6].predict(X_test)
print(pred)
print()
#Print the actual labels for a side-by-side comparison
print(Y_test)