In [1]:
import numpy
import pandas

from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

from sklearn import svm

In [2]:
## https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names

HEADERS = ["Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape",
           "Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin",
           "Normal Nucleoli","Mitoses","Class"]

dataset = pandas.read_csv("breast-cancer-wisconsin.data", names=HEADERS, low_memory=False)

## Import data

In [3]:
dataset.dtypes

Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [4]:
dataset.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
dataset.shape

(699, 11)

In [6]:
dataset.describe()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


## Split data into InputSet and OutputSet

In [7]:
xset = dataset.iloc[:,:10]
yset = dataset.iloc[:,10]

## Categories weighting

In [8]:
def categories_weighting(inputSet, column):
    categories = inputSet[column].copy()
    for ind in range(len(categories)):
        categories[ind] = int(categories[ind]) if categories[ind] != '?' else -1
    inputSet = inputSet.drop(columns=[column])
    inputSet[column] = categories
    return inputSet

In [9]:
outputSet = categories_weighting(xset, 'Bare Nuclei')
outputSet.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Bare Nuclei
0,1000025,5,1,1,1,2,3,1,1,1
1,1002945,5,4,4,5,7,3,2,1,10
2,1015425,3,1,1,1,2,3,1,1,2
3,1016277,6,8,8,1,3,3,7,1,4
4,1017023,4,1,1,3,2,3,1,1,1


## Split Data into training set and test set

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(outputSet, yset, test_size=0.5, random_state=2)

## Deleting one row on test set in order to make both train and test rows size equal

In [11]:
xtest = xtest.drop(xtest.index[-1])
ytest = ytest.drop(ytest.index[-1])

In [32]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn import tree

def prf(ytest, predicted, zipResult=True):
    prf1 = precision_recall_fscore_support(ytest, predicted)[:3]
    if (zipResult):
        return zip(["Precision", "Recal", "F1"], [numpy.mean(row) for row in prf1])
    return [numpy.mean(row) for row in prf1]

def get_dot_tree(tree_struct, file_name):
    tree.export_graphviz(tree_struct, out_file=file_name,
                     feature_names=HEADERS[:-1],  
                     class_names=["benign","malignant"],
                     filled=True, rounded=True)

## Classification: Decision tree

In [33]:
from sklearn.tree import DecisionTreeClassifier
decisionTree = DecisionTreeClassifier().fit(xtrain,ytrain)
decisionTreePredicted = decisionTree.predict(xtest)
print("Score: %f" % decisionTree.score(xtest, ytest))

Score: 0.916905


In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytrain, decisionTreePredicted)

array([[160,  73],
       [ 71,  45]])

In [35]:
decision_prf = prf(ytest, decisionTreePredicted, zipResult=False)
decision_prf

[0.9144287915474356, 0.9034464285714285, 0.908461086238864]

In [36]:
decision_tree_dot = get_dot_tree(decisionTree, "decision_tree_dotFile.dot")

![Decision tree representation](dtree.jpg)

## Classification: Regression Tree

In [50]:
from sklearn.linear_model import LogisticRegression
regressionTree = LogisticRegression().fit(X=xtrain,y=ytrain)
regressionTreePredicted = regressionTree.predict(xtest)
print("Score: %f" % regressionTree.score(xtest, ytest))

Score: 0.641834


In [51]:
confusion_matrix(ytrain, regressionTreePredicted)

array([[233,   0],
       [116,   0]])

In [52]:
regression_prf = prf(ytest, regressionTreePredicted, zipResult=False)
regression_prf

[0.3209169054441261, 0.5, 0.3909249563699826]

In [53]:
## regression_tree_dot = get_dot_tree(regressionTree, "regression_tree_dotFile.dot")

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

## Clustering: Kneighbors

In [20]:
from sklearn.neighbors import KNeighborsClassifier 
Nn = range(1,10)

def forN_kn(xtrain, xtest, ytrain, ytest, N=Nn):
    resultSet = []
    for n in N:
        predicted = KNeighborsClassifier(n_neighbors=n).fit(xtrain, ytrain).predict(xtest)
        resultSet.append(prf(ytest, predicted, zipResult=False))
    return resultSet

In [21]:
kn_results = forN_kn(xtrain, xtest, ytrain, ytest)
kn_results

[[0.5431249541419033, 0.5419821428571429, 0.5423054311943202],
 [0.5838983050847457, 0.5477321428571429, 0.530414096726623],
 [0.5655711677383504, 0.5589999999999999, 0.5593310698953393],
 [0.5795819935691318, 0.5335892857142857, 0.499741987271372],
 [0.5554408807028687, 0.5374107142857143, 0.5258152173913043],
 [0.5875132998936008, 0.541125, 0.5139712488769093],
 [0.558141447368421, 0.537875, 0.5247554599224541],
 [0.6007751937984496, 0.5473571428571429, 0.5218104222821204],
 [0.5868421052631578, 0.5565714285714285, 0.5465223854221889]]

In [22]:
clustering_prf = pandas.DataFrame({
    'Index': ["Clustering N=%d" % (i) for i in Nn],
    'Precision': [line[0] for line in kn_results],
    'Recal': [line[1] for line in kn_results],
    'F1': [line[2] for line in kn_results]
})

In [23]:
merged_resultSet = clustering_prf.set_index('Index')
merged_resultSet

Unnamed: 0_level_0,F1,Precision,Recal
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clustering N=1,0.542305,0.543125,0.541982
Clustering N=2,0.530414,0.583898,0.547732
Clustering N=3,0.559331,0.565571,0.559
Clustering N=4,0.499742,0.579582,0.533589
Clustering N=5,0.525815,0.555441,0.537411
Clustering N=6,0.513971,0.587513,0.541125
Clustering N=7,0.524755,0.558141,0.537875
Clustering N=8,0.52181,0.600775,0.547357
Clustering N=9,0.546522,0.586842,0.556571


## Gather all resultSets

In [24]:
def new_pd_single_row(index_name, prf):
    return  pandas.DataFrame({
        'Index': [index_name], 
        "F1": [prf[2]], 
        "Recal": [prf[1]], 
        "Precision": [prf[0]]
    }).set_index('Index')

In [25]:
merged_resultSet = merged_resultSet.append(new_pd_single_row('Decision Tree', decision_prf))
merged_resultSet = merged_resultSet.append(new_pd_single_row('Regression Tree', regression_prf))
merged_resultSet

Unnamed: 0_level_0,F1,Precision,Recal
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clustering N=1,0.542305,0.543125,0.541982
Clustering N=2,0.530414,0.583898,0.547732
Clustering N=3,0.559331,0.565571,0.559
Clustering N=4,0.499742,0.579582,0.533589
Clustering N=5,0.525815,0.555441,0.537411
Clustering N=6,0.513971,0.587513,0.541125
Clustering N=7,0.524755,0.558141,0.537875
Clustering N=8,0.52181,0.600775,0.547357
Clustering N=9,0.546522,0.586842,0.556571
Decision Tree,0.937679,0.937679,0.937679
