In [None]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# plot_confusion_matrix was removed in scikit-learn 1.2; keep the legacy name
# importable where it still exists without breaking the notebook elsewhere.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
#import data set and convert diagnosis to numerical values (M -> 1, B -> 0)
df = pd.read_csv(r'breast-cancer.csv')
# Assign the encoded column back rather than calling replace(..., inplace=True)
# on the column selection: that chained in-place call operates on a temporary
# object under pandas Copy-on-Write and would leave df['diagnosis'] untouched.
# Any label other than 'M'/'B' becomes NaN here instead of passing through.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
df.head()

In [None]:
#create correlation matrix
# The 'id' column is left out before the pairwise correlations are computed.
d1 = df.drop(columns='id')
correlation_matrix = d1.corr()
# Draw the matrix as an un-annotated heatmap.
dataplot = sns.heatmap(data=correlation_matrix, annot=False, cmap="YlGnBu")
plt.show()

In [None]:
#printing features that have important correlation with diagnosis
# Absolute value of each column's correlation with 'diagnosis'
# (the 'diagnosis' entry itself is included in this Series).
cor_target = correlation_matrix['diagnosis'].abs()
# Keep only the entries whose absolute correlation exceeds 0.1.
relevant_features = cor_target[cor_target > 0.1]
# The values come from DataFrame.corr(), so they are correlations, not covariances.
print('Relevant features and their absolute correlation with diagnosis:')
print(relevant_features)

In [None]:
#check class imbalance
# Count each diagnosis value once and reuse the result for the plot and the table.
class_counts = df['diagnosis'].value_counts()
class_counts.plot(kind='bar')
plt.ylabel('Number of Instances')
plt.xlabel('0:Benign and 1:Malignant')
plt.title('Class Imbalance')
plt.show()
# Last expression of the cell, so the notebook actually displays the counts
# (a bare expression in the middle of a cell is evaluated and discarded).
class_counts

In [None]:
x = df.drop(columns=['id', 'diagnosis'])  #setting feature space
y = df['diagnosis']  #setting target column

#creating data split (70% train / 30% test, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=109)

# Seed the tree as well as the split: an unseeded DecisionTreeClassifier can
# produce a different tree on every run, so the reported accuracy and the
# feature importances would not be reproducible.
d = DecisionTreeClassifier(random_state=109)
d.fit(x_train, y_train)

y_pred = d.predict(x_test)

#model accuracy with no pre-processing applied
# Score the predictions already computed instead of predicting a second time.
print(f'The accuracy of the model is {accuracy_score(y_test, y_pred):.3f}')

In [None]:
#confusion matrix for the model
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
ConfusionMatrixDisplay.from_estimator(d, x_test, y_test)
plt.show()

In [None]:
#feature importance for the model
importance = d.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importance)[::-1]
bar_positions = np.arange(len(importance))
# Plot the importances in the same sorted order as the tick labels; plotting
# the unsorted array against sorted labels pairs bars with the wrong features.
plt.bar(bar_positions, importance[indices])
plt.xticks(bar_positions, x_train.columns[indices], rotation=90)
plt.ylabel('Coefficient Weight')
plt.title('Feature Importance')
plt.show()

In [None]:
#script to eliminate features with low correlation
# Drop every entry that did NOT pass the "> 0.1" relevance cut-off. Using the
# complement (rather than "< 0.1") also catches a correlation of exactly 0.1
# and NaN correlations, which otherwise land in neither set.
features_to_drop = cor_target[~(cor_target > 0.1)]
row_names_list = features_to_drop.index.tolist()
# The target and the 'id' column are removed from the feature space too.
row_names_list.extend(['diagnosis', 'id'])
y = df['diagnosis'].values
X = df.drop(columns=row_names_list)

In [None]:
#new data space for the comparative model
# Same test_size and random_state as the split used for the first model.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=109)
# Seed the tree so the comparison is reproducible: an unseeded
# DecisionTreeClassifier can produce a different tree on every run.
d2 = DecisionTreeClassifier(random_state=109)
d2.fit(X_train, Y_train)
Y_pred2 = d2.predict(X_test)

# Score the predictions already computed instead of predicting a second time.
print(f'The accuracy of the model is {accuracy_score(Y_test, Y_pred2):.3f}')

In [None]:
#confusion matrix for the new model
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
ConfusionMatrixDisplay.from_estimator(d2, X_test, Y_test)
plt.show()

In [None]:
#feature importance for the model
importance2 = d2.feature_importances_
# Feature positions ordered from most to least important.
indices2 = np.argsort(importance2)[::-1]
bar_positions2 = np.arange(len(importance2))
# Plot the importances in the same sorted order as the tick labels; plotting
# the unsorted array against sorted labels pairs bars with the wrong features.
plt.bar(bar_positions2, importance2[indices2])
plt.xticks(bar_positions2, X_train.columns[indices2], rotation=90)
plt.ylabel('Coefficient Weight')
plt.title('Feature Importance')
plt.show()