## Import Libraries

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

#from sklearn.metrics import r2_score

## Load Data

In [11]:
# Read the breast-cancer CSV; the path is relative to the notebook's working directory.
csv_path = r'Breast_Cancer.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DedMalig,Breast,BreastQuad,Irradiant
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


## Check Missing Values

In [52]:
# Count NaN/None entries per column.
# NOTE(review): isna() only detects real NaN/None values. In the label-encoded
# table further down, NodeCaps 'no' maps to 1 and BreastQuad 'left_low' maps
# to 2, which suggests an extra category sorting before them (possibly a '?'
# placeholder for missing data) -- confirm with data['NodeCaps'].unique().
missing_per_column = data.isna().sum()
missing_per_column

Class               0
Age                 0
Menopause           0
TumorSize           0
InvNodes            0
NodeCaps            0
DedMalig            0
Breast              0
BreastQuad          0
Irradiant           0
Class_trns          0
Age_trans           0
Menopause_trans     0
TumorSize_trans     0
InvNodes_trans      0
NodeCaps_trans      0
Breast_trans        0
BreastQuad_trans    0
Irradiant_trans     0
dtype: int64

## Check Duplicates

In [53]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate = data.loc[data.duplicated()]
duplicate.shape

(14, 19)

In [54]:
# Keep the first occurrence of every duplicated row. The explicit .copy() makes
# `data` an independent frame, so adding columns to it later cannot trigger
# pandas' SettingWithCopyWarning (relevant on pandas versions without
# copy-on-write).
# NOTE(review): the saved output below reports 19 columns and the later split
# output totals 286 rows, so the recorded outputs come from an out-of-order
# run -- re-run the notebook top to bottom to refresh them.
data = data.drop_duplicates(keep='first').copy()
data.shape

(272, 19)

## Label Encoder

In [17]:
# Source column -> name of the integer-coded column that is added to `data`.
# (Target names, including 'Class_trns', are kept as-is because later cells
# select columns by these exact names.)
ENCODED_COLUMNS = {
    'Class': 'Class_trns',
    'Age': 'Age_trans',
    'Menopause': 'Menopause_trans',
    'TumorSize': 'TumorSize_trans',
    'InvNodes': 'InvNodes_trans',
    'NodeCaps': 'NodeCaps_trans',
    'Breast': 'Breast_trans',
    'BreastQuad': 'BreastQuad_trans',
    'Irradiant': 'Irradiant_trans',
}

# One fitted encoder per source column, so integer codes can be mapped back to
# the original labels later via encoders[col].inverse_transform(...).
# Refitting a single shared encoder would discard every mapping but the last.
encoders = {}
for source_col, encoded_col in ENCODED_COLUMNS.items():
    labelencoder = LabelEncoder()
    data[encoded_col] = labelencoder.fit_transform(data[source_col])
    encoders[source_col] = labelencoder
# After the loop `labelencoder` is the encoder fitted on 'Irradiant', matching
# the state the previous line-by-line version left behind.

data

Unnamed: 0,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DedMalig,Breast,BreastQuad,Irradiant,Class_trns,Age_trans,Menopause_trans,TumorSize_trans,InvNodes_trans,NodeCaps_trans,Breast_trans,BreastQuad_trans,Irradiant_trans
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,0,1,2,5,0,1,0,2,0
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,0,2,2,3,0,1,1,5,0
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,0,2,2,3,0,1,0,2,0
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,0,4,0,2,0,1,1,3,0
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0,2,2,0,0,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no,1,1,2,5,0,1,0,3,0
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes,1,1,2,3,0,1,0,3,1
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no,1,4,0,3,0,1,1,3,0
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no,1,2,0,5,4,1,0,2,0


## Split Data into Train and Test Sets

In [18]:
# Alternative setup (disabled): predict 'Class_trns' from the other encoded
# columns plus 'DedMalig' and 'Irradiant_trans'.
# Active setup: predict 'Irradiant_trans' from the columns listed below.
feature_columns = [
    'Class_trns',
    'Age_trans',
    'Menopause_trans',
    'TumorSize_trans',
    'InvNodes_trans',
    'NodeCaps_trans',
    'DedMalig',
    'Breast_trans',
    'BreastQuad_trans',
]
target_columns = ['Irradiant_trans']

# Both selections use a list of names, so X and y are DataFrames
# (y is a single-column frame, not a Series).
X = data[feature_columns]
y = data[target_columns]


In [49]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

# Report the shape of each partition: train features/target, then test features/target.
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

(228, 9)
(228, 1)
(58, 9)
(58, 1)


## Classification

In [50]:
# Fit a multinomial Naive Bayes classifier. The target is a single-column
# frame, so it is flattened to a 1-D array before fitting.
# NOTE(review): MultinomialNB is documented for count-like features, while X
# holds label-encoded category codes -- consider whether CategoricalNB is a
# better match for this data.
NaiveBayes = MultinomialNB()
NaiveBayes.fit(X_train, y_train.to_numpy().ravel())
print(NaiveBayes)

MultinomialNB()


In [51]:
# Predict on the held-out rows, then summarise per-class precision, recall and F1.
# classification_report is already imported in the notebook's import cell, so
# it is not re-imported here.
prediction = NaiveBayes.predict(X_test)
print(prediction)

print(classification_report(y_test, prediction))


[0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.96      0.86      0.91        51
           1       0.42      0.71      0.53         7

    accuracy                           0.84        58
   macro avg       0.69      0.79      0.72        58
weighted avg       0.89      0.84      0.86        58

