# Prepare Problem


In [1]:
import numpy
import pandas

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

from sklearn import datasets
from sklearn import svm


In [2]:
HEADER = ["number", "Clump Thickness", 
"Uniformity of Cell Size", "Uniformity of Cell Shape", 
"Marginal Adhesion", "Single Epithelial Cell Size", 
"Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", 
"Mitoses", "Class"]

dataset = pandas.read_csv('dataset.csv',names=HEADER)

In [3]:
dataset.dtypes

number                         int64
Clump Thickness                int64
Uniformity of Cell Size        int64
Uniformity of Cell Shape       int64
Marginal Adhesion              int64
Single Epithelial Cell Size    int64
Bare Nuclei                    int64
Bland Chromatin                int64
Normal Nucleoli                int64
Mitoses                        int64
Class                          int64
dtype: object

In [4]:
dataset.describe()

Unnamed: 0,number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,1076720.0,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,620644.0,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,877617.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171795.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238705.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [5]:
dataset.shape

(683, 11)

In [6]:
dataset.head()

Unnamed: 0,number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Evaluate algorithm


In [7]:
xset = dataset.iloc[:, :-1].values
yset = dataset.iloc[:,  -1].values

### split out validation dataset

In [8]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=False)
xtrain, xtest, ytrain, ytest = list(),list(),list(),list()
for trainI, testI in kf.split(xset):
    xtrain = xset[trainI]
    xtest = xset[testI]
    ytrain = yset[trainI]
    ytest = yset[testI]

## xtrain,xtest,ytrain,ytest = train_test_split(xset,yset, test_size=0.5, random_state=0)

### test option and evaluation metrics

In [9]:
from sklearn.metrics import r2_score
## r2_score(ytrain, ytest, multioutput='variance_weighted')

### spot check algorithm

In [10]:
resultDict = dict()

- Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
c = LogisticRegression().fit(X=xtrain,y=ytrain)
resultDict['LR'] = c.predict(xtest)
c.score(xtest, ytest)

0.8088235294117647

- Linear discriminant Analisys


In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
c = lda().fit(xtrain, ytrain)
resultDict['LDA'] = c.predict(xtest)
c.score(xtest, ytest)

1.0

- K Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier as kn
c = kn(n_neighbors=2).fit(xtrain, ytrain)
resultDict['KN'] = c.predict(xtest)
c.score(xtest, ytest)

0.7205882352941176

- Classifiaction and regression trees

In [14]:
c = DecisionTreeClassifier().fit(xtrain,ytrain)
resultDict['RT'] = c.predict(xtest)
c.score(xtest, ytest)

0.9852941176470589

- Support vector Machines

In [15]:
from sklearn.svm import SVC
c = SVC().fit(xtrain,ytrain)
resultDict['SVC'] = c.predict(xtest)
c.score(xtest, ytest)

0.8088235294117647

### Compare algorithm


In [16]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

prfDict = dict(resultDict)
for algo in prfDict.keys():
    prfDict[algo] = precision_recall_fscore_support(ytest, prfDict[algo], average='macro')[:3]
    
prf = pandas.DataFrame({
    'Algorithm': prfDict.keys(),
    'Precision': [item[0] for item in prfDict.values()],
    'Recall': [item[1] for item in prfDict.values()],
    'f1': [item[2] for item in prfDict.values()]
})

prf


  'precision', 'predicted', average, warn_for)


Unnamed: 0,Algorithm,Precision,Recall,f1
0,RT,0.964286,0.990909,0.976894
1,LDA,1.0,1.0,1.0
2,LR,0.404412,0.5,0.447154
3,KN,0.395161,0.445455,0.418803
4,SVC,0.404412,0.5,0.447154


# Improve Accuracy

### Algorithm Tunning

In [26]:
def get_f1_from_svm(xtrain, ytrain, xtest, ytest, C):
    c = SVC(C=C).fit(xtrain,ytrain)
    return precision_recall_fscore_support(ytest, c.predict(xtest))[2][0]

def foreach_C(xtrain,ytrain,xtunning,ytunning,C=[0.1,1,10,100]):
    f1s = list()
    for c in C:
        f1s.append(get_f1_from_svm(xtrain,ytrain,xtunning,ytunning, c))
    return f1s

def foreachKfold(xset,yset,kfold=10):
    meanList = list()
    kf = KFold(n_splits=10, shuffle=False)
    for trainI, testI in kf.split(xset):
        xxtrain = xset[trainI]
        yytrain = yset[trainI]
        xxtrain2, xxtunning, yytrain2, yytunning = train_test_split(xxtrain, yytrain, test_size=0.1, random_state=0)
        meanList.append(foreach_C(xxtrain2,yytrain2,xxtunning,yytunning))
        
    return meanList

In [32]:
f1List = foreachKfold(xtrain,ytrain)
f1Dict = pandas.DataFrame({
    'C=0.1': [line[0] for line in f1List],
    'C=1': [line[1] for line in f1List],
    'C=10': [line[2] for line in f1List],
    'C=100': [line[3] for line in f1List]
})

f1Dict

Unnamed: 0,C=0.1,C=1,C=10,C=100
0,0.741573,0.741573,0.741573,0.741573
1,0.741573,0.741573,0.741573,0.741573
2,0.727273,0.727273,0.727273,0.727273
3,0.782609,0.782609,0.782609,0.782609
4,0.769231,0.769231,0.769231,0.769231
5,0.782609,0.782609,0.782609,0.782609
6,0.769231,0.769231,0.769231,0.769231
7,0.769231,0.769231,0.769231,0.769231
8,0.755556,0.755556,0.755556,0.755556
9,0.769231,0.769231,0.769231,0.769231


In [41]:
{key:numpy.mean(f1Dict[key]) for key in f1Dict.keys()}



{'C=0.1': 0.7608114818471438,
 'C=1': 0.7608114818471438,
 'C=10': 0.7608114818471438,
 'C=100': 0.7608114818471438}