In [1]:
import numpy
import pandas

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

from sklearn import svm

In [2]:
HEADERS = ["Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape",
           "Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin",
           "Normal Nucleoli","Mitoses","Class"]

dataset = pandas.read_csv("breast-cancer-wisconsin.data", names=HEADERS, low_memory=False)

## Import data

In [3]:
dataset.dtypes

Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [5]:
dataset.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
dataset.shape

(699, 11)

In [7]:
dataset.describe()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


## Split data into InputSet and OutputSet

In [15]:
xset = dataset.iloc[:,:10]
yset = dataset.iloc[:,10]

## Categories weighting

In [16]:
def categories_weighting(inputSet, column):
    categories = inputSet[column].copy()
    for ind in range(len(categories)):
        categories[ind] = categories[ind] if categories[ind] != '?' else -1
    inputSet = inputSet.drop(columns=[column])
    inputSet[column] = categories
    return inputSet

In [18]:
outputSet = categories_weighting(xset, 'Bare Nuclei')

## Split Data into training set and test set

In [19]:
xtrain, xtest, ytrain, ytest = train_test_split(xset, yset, test_size=0.1, random_state=2)

## Classification and Decision tree

In [20]:
decisionTree = DecisionTreeClassifier().fit(xtrain,ytrain)
decisionTreePredicted = decisionTree.predict(xtest)
decisionTree.score(xtest, ytest)

ValueError: could not convert string to float: ?

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytrain, decisionTreePredicted)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(ytest, decisionTreePredicted)[:3]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

## Rregression tree

In [17]:
from sklearn.linear_model import LogisticRegression
regressionTree = LogisticRegression().fit(X=xtrain,y=ytrain)
regressionTreePredicted = regressionTree.predict(xtest)
regressionTree.score(xtest, ytest)

0.0

## Clusterisation kneighbors

In [14]:
cluster_inputSet = outputSet.drop(columns=['Round'])

In [15]:
xxtrain, xxtest, yytrain, yytest = train_test_split(cluster_inputSet, yIndSet, test_size=0.1, random_state=2)

from sklearn.neighbors import KNeighborsClassifier as kn
cd = kn(n_neighbors=2).fit(xxtrain, yytrain)