In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column layout used for wdbc.data: an id, the diagnosis label, then ten
# measurements, each repeated with the suffixes _mean, _se and _worst.
measurements = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                'compactness', 'concavity', 'concave_points', 'symmetry',
                'fractal_dimension']
column_names = ['id', 'diagnosis'] + [
    measurement + '_' + suffix
    for suffix in ('mean', 'se', 'worst')
    for measurement in measurements
]

# header=None: the first line of the file is read as data, so the column
# names are supplied explicitly.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(data_url, header=None, names=column_names)
df.head()

In [None]:
# Keep only the diagnosis and the *_mean measurements: drop the id column and
# every column whose name ends in _se or _worst.
# The columns are picked from the frame as it currently is, so re-running this
# cell after the drop is a no-op instead of raising KeyError.
unused_columns = [
    name for name in df.columns
    if name == 'id' or name.endswith(('_se', '_worst'))
]
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Give each distinct diagnosis label an integer code, numbered in the sorted
# order in which np.unique returns the labels.
diagnosis_labels = np.unique(df['diagnosis'])
class_mapping = dict(zip(diagnosis_labels, range(len(diagnosis_labels))))
class_mapping

In [None]:
# Swap the diagnosis labels for the integer codes held in class_mapping.
encoded_diagnosis = df['diagnosis'].map(class_mapping)
df['diagnosis'] = encoded_diagnosis
df.head()

In [None]:

# The ten *_mean features to compare pairwise.
cols = ['radius_mean', 'texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave_points_mean','symmetry_mean','fractal_dimension_mean']

# `height` is the size of each facet in inches. It replaces the `size`
# keyword, which seaborn renamed in 0.9 and has since removed.
sns.pairplot(df[cols], height=2.5)
plt.tight_layout()
plt.show()



In [None]:
# Correlation coefficients between the columns listed in `cols`
# (rowvar=False: each column is a variable, each row an observation).
cm = np.corrcoef(df[cols].to_numpy(), rowvar=False)

heatmap_options = dict(
    cbar=True,
    annot=True,              # write each coefficient inside its cell
    square=True,
    fmt='.2f',
    annot_kws={'size': 5},   # small font so the annotations fit the cells
    yticklabels=cols,
    xticklabels=cols,
)
hm = sns.heatmap(cm, **heatmap_options)

plt.tight_layout()
plt.show()


In [None]:
# Remove perimeter_mean and area_mean from the feature set.
# NOTE(review): presumably dropped because the heatmap shows them strongly
# correlated with radius_mean — confirm.
# Only columns that are still present are dropped, so re-running this cell is
# a no-op rather than a KeyError.
redundant_columns = [
    name for name in ('perimeter_mean', 'area_mean') if name in df.columns
]
df = df.drop(columns=redundant_columns)
df.head()

In [None]:
%matplotlib inline

# select outcomes
y = df.iloc[:, 0].values

# extract mean radius and texture
X = df.iloc[0:100, [1, 2]].values

# plot data
plt.scatter(X[:50, 0], X[:50, 1],
             color='red', marker='o', label='Malignant')
plt.scatter(X[50:100, 0], X[50:100, 1],
             color='blue', marker='x', label='Benign')
plt.title("Two Feature Visualization")
plt.xlabel('Mean Radius')
plt.ylabel('Mean Texture')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Column 0 holds the target; every remaining column is a feature.
target_column = df.iloc[:, 0]
feature_columns = df.iloc[:, 1:]

# Target vector
y = target_column.to_numpy()

# Feature matrix
X = feature_columns.to_numpy()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. random_state=0 fixes the shuffle,
# and stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training set only
std_scaler = StandardScaler()
std_scaler.fit(X_train)

# Apply that same training-set scaling to both splits, so the test set is
# transformed with statistics it did not contribute to
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Fit a decision tree on the standardized training data and score it on the
# held-out test set. random_state is pinned (0, the same seed as the
# train/test split) so the fitted tree and the scores are the same on every
# run.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

# Result is the tuple (precision, recall, fscore, support).
# NOTE(review): with average='micro' on a two-class problem, precision, recall
# and F-score all equal plain accuracy; consider average='binary' if a
# per-class score for the malignant class is what is wanted.
dtree_fscore = precision_recall_fscore_support(y_test, y_pred, average='micro')
dtree_fscore

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Labels shown in the rendered tree: the feature columns in training order,
# and the class names in ascending order of the class codes.
tree_feature_names = ['radius_mean', 'texture_mean', 'smoothness_mean',
                      'compactness_mean', 'concavity_mean',
                      'concave_points_mean', 'symmetry_mean',
                      'fractal_dimension_mean']
tree_class_names = ['benign', 'malignant']

# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data = export_graphviz(
    dtree,
    out_file=None,
    feature_names=tree_feature_names,
    class_names=tree_class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the tree to disk under the given file name, then display it inline
# as the cell's output.
graph = graphviz.Source(dot_data)
graph.render("Decison Tree")
graph