In [1]:
import re
from pathlib import Path
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import os
from glob import glob

In [2]:
# Adding path of all the images together 
imagePatches = glob('./Breast Cancer/**/**.png', recursive=True)

### Read image pathes and split classes and axis 

In [3]:
xval = []
yval = []
targetval = []

for filename in imagePatches[:]:
    x , y , target = filename.split('\\')[-1].split('_')[2:5]
    x_coor = x.split('x')[1]
    xval.append(x_coor)
    y_coor = y.split('y')[1]
    yval.append(y_coor)
    target = target.split('.')[0].split('class')[1]
    targetval.append(target)

## Exploratory Data Analysis (EDA)

### Analytics:
How many patches do we have on average per patient?

On average how many of those patches are cancerous vs non-cancerous?

Let's try and answer those questions by querying our dataframe.

In [4]:
df = {'x' : xval ,'y' : yval, 'target' : targetval}
data = pd.DataFrame(df)

In [5]:
data.info()
data.head()
data.iloc[:, :]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555048 entries, 0 to 555047
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   x       555048 non-null  object
 1   y       555048 non-null  object
 2   target  555048 non-null  object
dtypes: object(3)
memory usage: 12.7+ MB


Unnamed: 0,x,y,target
0,1001,1001,0
1,1001,1051,0
2,1001,1101,0
3,1001,1151,0
4,1001,1201,0
...,...,...,...
555043,2051,901,1
555044,2051,951,1
555045,2101,1001,1
555046,2101,901,1


### train model from x and y axis

In [6]:
from sklearn.model_selection import train_test_split

X=data[['x' , 'y']]  # Features
y=data['target']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

### Accuracy

In [8]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7146803591268054


# Fit SVM to dataset

## training

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.5,test_size=0.5,random_state=123)

In [11]:
svc_rbf = SVC(kernel = 'rbf',gamma = 'auto', verbose = True , max_iter = 50000)
svc_linear = SVC(kernel='linear',gamma = 'auto', verbose = True , max_iter = 50000)

In [12]:
# fitting our models
svc_rbf.fit(X = x_train,y = y_train);
svc_linear.fit(X = x_train,y = y_train);

[LibSVM]



[LibSVM]



In [13]:
print(svc_linear.score(x_train,y_train))
print(svc_rbf.score(x_train,y_train))

0.5672878742018709
0.7169001599861634


In [14]:
print(svc_linear.score(x_test,y_test))
print(svc_rbf.score(x_test,y_test))

0.5659042100863348
0.7132392153471412


## Confusion matrix

In [15]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_true=y_test,y_pred=svc_linear.predict(x_test)).ravel()

print(f'training set: true negatives: {tn}')
print(f'training set: true positives: {tp}')
print(f'training set: false negatives: {fn}')
print(f'training set: false positives: {fp}')

training set: true negatives: 138478
training set: true positives: 18574
training set: false negatives: 60365
training set: false positives: 60107
