# Predicting Malignancy of Breast Cancer

*Vladimir STEFAN, Maxime, Ana*

## Data Preprocessing

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
import sklearn



In [2]:
from sklearn import svm

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
from sklearn.neural_network import MLPClassifier

In [4]:
#from sklearn.preprocessing import normalize

dataset=np.genfromtxt('wdbc.csv', delimiter=',', converters ={0:lambda x: 1.0*int(x[0] == 77)})

x = dataset[:,1:]
y = dataset[:, 0]

In [5]:
#normalize(x) 
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y,  test_size = .2, shuffle = False)

## Logistic Regression

**Questions:**
- **We get lower TRAIN accuracy for optimized parameters.**
- **We get same  TEST accuracy for optimized parameters.**

In [20]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
print('default log reg score on train ', lr.score(x_train, y_train))

default log reg score on train  0.984615384615


In [21]:
param_grid=[{'C':[0.001, 0.003, 0.01,0.03, 0.1, 0.3, 1], 'max_iter':[2,3,4,5,10,20,30]}]
lrgrid = GridSearchCV(lr, param_grid, cv=10, scoring='accuracy')
lrgrid.fit(x_train, y_train)
print('10-fold CV log reg score on train', lrgrid.best_score_)

10-fold CV log reg score on train 0.9802197802197802


In [22]:
print('best parameters are: ', lrgrid.best_params_)

best parameters are:  {'C': 0.1, 'max_iter': 4}


In [23]:
lrbest = LogisticRegression(C = 0.1, max_iter = 4)
lrbest.fit(x_train, y_train)
print('default log reg on test ', lr.score(x_test, y_test))
print('10-fold CV log reg score on test: ', lrbest.score(x_test, y_test))

default log reg on test  0.982456140351
10-fold CV log reg score on test:  0.982456140351


## SVM

#### Linear SVM

**Questions:**
- **We get lower TRAIN accuracy for optimized parameters.**
- **We get same  TEST accuracy for optimized parameters.**

In [8]:
lin_svm = svm.LinearSVC()

In [10]:
lin_svm.fit(x_train, y_train)
print('default lin svm score on train ', lin_svm.score(x_train, y_train))

default lin svm score on train  0.984615384615


In [11]:
print('default parameters are: ', lin_svm.get_params())

default parameters are:  {'C': 1.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [12]:
lin_svm_param_grid = [
  {'C': [1, 10, 100, 1000]}
 ]
lin_svmgrid = GridSearchCV(lin_svm, lin_svm_param_grid, cv=10, scoring='accuracy')
lin_svmgrid.fit(x_train, y_train)
print('10-fold CV lin svm score on train', lin_svmgrid.best_score_)

10-fold CV lin svm score on train 0.9560439560439561


In [13]:
print('best parameters are: ', lin_svmgrid.best_params_)

best parameters are:  {'C': 1}


In [7]:
lin_svmbest = svm.LinearSVC(C=1)

In [19]:
lin_svmbest.fit(x_train, y_train)
print('default lin svm on test ', lin_svm.score(x_test, y_test))
print('10-fold CV lin svm score on test: ', lin_svmbest.score(x_test, y_test))

default lin svm on test  0.964912280702
10-fold CV lin svm score on test:  0.964912280702


#### SVM SVC

**Questions:** 
- **We get lower TRAIN and TEST accuracy for optimized parameters. **
- **What parameters other than C and kernel to optimize?**

In [9]:
svm=svm.SVC()

In [14]:
svm.fit(x_train, y_train)
print('default svm score on train ', svm.score(x_train, y_train))

default svm score on train  0.984615384615


In [15]:
print('default parameters are: ', svm.get_params())

default parameters are:  {'C': 1.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [16]:
svm_param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'kernel': ['rbf']},
 ]
svmgrid = GridSearchCV(svm, svm_param_grid, cv=10, scoring='accuracy')
svmgrid.fit(x_train, y_train)
print('10-fold CV svm score on train', svmgrid.best_score_)

10-fold CV svm score on train 0.9758241758241758


In [17]:
print('best parameters are: ', svmgrid.best_params_)

best parameters are:  {'C': 1, 'kernel': 'linear'}


In [6]:
svmbest = svm.SVC(C = 1, kernel = 'linear')

In [18]:
svmbest.fit(x_train, y_train)
print('default svm on test ', svm.score(x_test, y_test))
print('10-fold CV svm score on test: ', svmbest.score(x_test, y_test))

default svm on test  0.973684210526
10-fold CV svm score on test:  0.964912280702


## Random Forest

**Questions:**
- **We get lower TRAIN accuracy for optimized parameters.**
- **We get same TEST accuracy for optimized parameters.**

In [25]:
randfor=RandomForestClassifier()
randfor.fit(x_train, y_train)
print('default rand for on train ', randfor.score(x_train, y_train))

default rand ofr on train  0.997802197802


In [39]:
#print('default parameters are: ', randfor.get_params())

In [28]:
randfor_param_grid=[{'n_estimators':[10, 500, 1000], 'max_features': ['auto','log2', None], 'criterion': ['gini', 'entropy']}]
randforgrid = GridSearchCV(randfor, randfor_param_grid, cv=10, scoring='accuracy')
randforgrid.fit(x_train, y_train)
print('10-fold CV rand for score on train', randforgrid.best_score_)

10-fold CV rand for score on train 0.9626373626373627


In [29]:
print('best parameters are: ', randforgrid.best_params_)

best parameters are:  {'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 500}


In [30]:
randforbest = RandomForestClassifier(criterion = 'entropy', max_features = 'auto', n_estimators = 500)

In [32]:
randforbest.fit(x_train, y_train)
print('default rand for on test ', randfor.score(x_test, y_test))
print('10-fold CV rand for score on test: ', randforbest.score(x_test, y_test))

default rand for on test  0.973684210526
10-fold CV rand for score on test:  0.973684210526


## Neural Network

**Questions:**
- **Lot of pink errors**
- ** Did I use the correct function (MLPClassifier())? I used it because it is actually NeuralNetworks.MLPClassifier() in the documentation.**
- ** We get lower TRAIN accuracy for optimized parameters.**
- **We get same TEST accuracy for optimized parameters.**
- **TEST accuracy is EQUAL to the one of Random Forest. How likely is that?!**

In [37]:
nn=MLPClassifier()
nn.fit(x_train, y_train)
print('default nn on train ', nn.score(x_train, y_train))

default nn on train  0.993406593407




In [43]:
print('default parameters are: ', nn.get_params())

default parameters are:  {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [48]:
nn_param_grid=[{'learning_rate' : ['constant', 'invscaling', 'adaptive'], 
                'momentum':[0.1,0.3,0.9], 
                'activation':['identity', 'logistic', 'tanh', 'relu']}]
nngrid = GridSearchCV(nn, nn_param_grid, cv=10, scoring='accuracy')
nngrid.fit(x_train, y_train)
print('10-fold CV nn score on train', nngrid.best_score_)



10-fold CV rand for score on train 0.9758241758241758


In [49]:
print('best parameters are: ', nngrid.best_params_)

best parameters are:  {'activation': 'identity', 'learning_rate': 'constant', 'momentum': 0.9}


In [51]:
nnbest=MLPClassifier(activation = 'identity', learning_rate = 'constant', momentum = 0.9)

In [54]:
nnbest.fit(x_train, y_train)
print('default nn on test ', nn.score(x_test, y_test))
print('10-fold CV nn score on test: ', nnbest.score(x_test, y_test))

default nn on test  0.973684210526
10-fold CV nn score on test:  0.973684210526
