In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [24]:
dataset = pd.read_csv('Breast_cancer_data.csv')

In [25]:
x = dataset.iloc[:,0:5]
y = dataset['diagnosis']
X_train,X_test , Y_train, Y_test = train_test_split(x,y,test_size=0.25,random_state=0)
sc_X = StandardScaler()
X_std = sc_X.fit_transform(x)
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [26]:
classifier1 = GaussianNB()
classifier1.fit(X_train,Y_train)
Y_pred = classifier1.predict(X_test)
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_pred,Y_test))

[[46  7]
 [ 4 86]]
0.9230769230769231


In [27]:
cov_mat = np.cov(X_std.T)
eig_vals , eig_vecs = np.linalg.eig(cov_mat)
eig_pairs = [(np.abs(eig_vals[i]),eig_vecs[:,i]) for i in range(len(eig_vals))]
print(eig_pairs)

[(3.1779263772362496, array([0.55371341, 0.24687793, 0.55562001, 0.55179039, 0.1388039 ])), (0.00139760342213733, array([ 0.70990216,  0.00714976, -0.70359047, -0.01400078,  0.02742701])), (0.01662165450666038, array([-0.3987202 ,  0.00514554, -0.41807654,  0.81614814,  0.01048524])), (0.7899679994812961, array([-0.17451457,  0.80887797, -0.14438462, -0.17093468,  0.51497137])), (1.0228891822550665, array([-0.0026939 , -0.53356361,  0.02473754,  0.00385895,  0.84538499]))]


In [28]:
eig_pairs.sort()
eig_pairs.reverse()


In [29]:
tot = sum(eig_vals)
var_exp = [(i/tot)*100 for i in sorted(eig_vals, reverse=True)]
print(var_exp)

[63.44682538735288, 20.421829719538756, 15.771593100364715, 0.3318488492015147, 0.027902943542144225]


In [30]:
matrix_w = np.hstack((eig_pairs[0][1].reshape(5,1),
                      eig_pairs[1][1].reshape(5,1),
                      eig_pairs[2][1].reshape(5,1)))
print(matrix_w)

[[ 0.55371341 -0.0026939  -0.17451457]
 [ 0.24687793 -0.53356361  0.80887797]
 [ 0.55562001  0.02473754 -0.14438462]
 [ 0.55179039  0.00385895 -0.17093468]
 [ 0.1388039   0.84538499  0.51497137]]


In [31]:
X_reduce = np.dot((X_std),matrix_w)
print(X_reduce)
print(type(X_reduce))

[[ 1.56207679  2.4644723  -1.4124361 ]
 [ 2.8010636  -0.46627365 -1.60092817]
 [ 2.84876769  0.59363682  0.08585125]
 ...
 [ 1.46986963 -1.78499745  0.90331942]
 [ 3.86552291  0.09399861  1.77196288]
 [-2.88347973 -3.32802642  0.19359216]]
<class 'numpy.ndarray'>


In [32]:
df_reduce = pd.DataFrame(X_reduce, columns = ['Column_A','Column_B','Column_C'])
df_reduce['diagnosis'] = dataset['diagnosis']
#df_reduce.head()
#print(df_reduce)
x_reduce = df_reduce.iloc[:,0:3]
y_reduce = df_reduce['diagnosis']
df_reduce.head()
print(x_reduce)
X_train_reduce,X_test_reduce , Y_train_reduce, Y_test_reduce = train_test_split(x_reduce,y_reduce,test_size=0.25,random_state=0)
X_train_reduce = sc_X.fit_transform(X_train_reduce)
X_test_reduce = sc_X.fit_transform(X_test_reduce)


     Column_A  Column_B  Column_C
0    1.562077  2.464472 -1.412436
1    2.801064 -0.466274 -1.600928
2    2.848768  0.593637  0.085851
3   -0.658477  2.624944  2.246609
4    2.718517  0.897870 -1.661423
..        ...       ...       ...
564  3.929944  0.550143  0.053511
565  3.322039 -0.983901  0.913880
566  1.469870 -1.784997  0.903319
567  3.865523  0.093999  1.771963
568 -2.883480 -3.328026  0.193592

[569 rows x 3 columns]


In [33]:
classifier1 = GaussianNB()
classifier1.fit(X_train_reduce,Y_train_reduce)
Y_pred_reduce = classifier1.predict(X_test_reduce)
print(confusion_matrix(Y_test_reduce,Y_pred_reduce))
print(accuracy_score(Y_pred_reduce,Y_test_reduce))

[[46  7]
 [ 6 84]]
0.9090909090909091
