In [27]:
import numpy as np
import pandas as pd
from skimage.io import imread, imshow
import cv2
py.init_notebook_mode(connected=True)

from subprocess import check_output
print(check_output(["ls", "../input/train"]).decode("utf8"))

Type_1
Type_2
Type_3



In [4]:
from glob import glob
basepath = '../input/train/'

all_cervix_images = []

for path in sorted(glob(basepath + "*")):
    cervix_type = path.split("/")[-1]
    cervix_images = sorted(glob(basepath + cervix_type + "/*"))
    all_cervix_images = all_cervix_images + cervix_images

all_cervix_images = pd.DataFrame({'imagepath': all_cervix_images})
all_cervix_images['filetype'] = all_cervix_images.apply(lambda row: row.imagepath.split(".")[-1], axis=1)
all_cervix_images['type'] = all_cervix_images.apply(lambda row: row.imagepath.split("/")[-2], axis=1)

In [5]:
from collections import defaultdict

images = defaultdict(list)

for t in all_cervix_images['type'].unique():
    sample_counter = 0
    for _, row in all_cervix_images[all_cervix_images['type'] == t].iterrows():
        #print('reading image {}'.format(row.imagepath))
        try:
            img = imread(row.imagepath)
            sample_counter +=1
            images[t].append(img)
        except:
            print('image read failed for {}'.format(row.imagepath))
        if sample_counter > 35:
            break
           

In [6]:
dfs = []
for t in all_cervix_images['type'].unique():
    t_ = pd.DataFrame(
        {
            'nrows': list(map(lambda i: i.shape[0], images[t])),
            'ncols': list(map(lambda i: i.shape[1], images[t])),
            'nchans': list(map(lambda i: i.shape[2], images[t])),
            'type': t
        }
    )
    dfs.append(t_)

shapes_df = pd.concat(dfs, axis=0)
shapes_df_grouped = shapes_df.groupby(by=['nchans', 'ncols', 'nrows', 'type']).size().reset_index().sort_values(['type', 0], ascending=False)

In [7]:
def transform_image(img, rescaled_dim, to_gray=False):
    resized = cv2.resize(img, (rescaled_dim, rescaled_dim), cv2.INTER_LINEAR)

    if to_gray:
        resized = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY).astype('float')
    else:
        resized = resized.astype('float')

    normalized = cv2.normalize(resized, None, 0.0, 1.0, cv2.NORM_MINMAX)
    timg = normalized.reshape(1, np.prod(normalized.shape))

    return timg/np.linalg.norm(timg)

rescaled_dim = 100

all_images = []
all_image_types = []

for t in all_cervix_images['type'].unique():
    all_images = all_images + images[t]
    all_image_types = all_image_types + len(images[t])*[t]

# - normalize each uint8 image to the value interval [0, 1] as float image
# - rgb to gray
# - downsample image to rescaled_dim X rescaled_dim
# - L2 norm of each sample = 1
gray_all_images_as_vecs = [transform_image(img, rescaled_dim) for img in all_images]

gray_imgs_mat = np.array(gray_all_images_as_vecs).squeeze()
all_image_types = np.array(all_image_types)
gray_imgs_mat.shape, all_image_types.shape

((108, 30000), (108,))

In [8]:
gray_imgs_mat.shape

(108, 30000)

----------

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
y = LabelEncoder().fit_transform(all_image_types).reshape(-1)
X = gray_imgs_mat # no need for normalizing, we already did this earlier Normalizer().fit_transform(gray_imgs_mat)
X.shape, y.shape

((108, 30000), (108,))

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((72, 30000), (36, 30000), (72,), (36,))

In [12]:
y_train, y_test

(array([0, 1, 0, 1, 2, 1, 0, 0, 2, 0, 1, 0, 1, 1, 2, 0, 0, 2, 2, 0, 1, 1, 0,
        0, 0, 0, 1, 0, 2, 0, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1,
        2, 0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0, 1, 1, 0,
        2, 1, 2]),
 array([2, 0, 0, 2, 1, 1, 0, 1, 2, 0, 2, 2, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 1,
        0, 0, 0, 1, 2, 0, 1, 2, 2, 0, 0, 1, 2]))


Logistic Regression
===================


In [13]:
clf = LogisticRegression()
grid = {
    'C': [1e-9, 1e-6, 1e-3, 1e0],
    'penalty': ['l1', 'l2']
}
cv = GridSearchCV(clf, grid, scoring='neg_log_loss', n_jobs=-1, verbose=1)
cv.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:   13.7s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   15.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [1e-09, 1e-06, 0.001, 1.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=1)

In [14]:
for i in range(1, len(cv.cv_results_['params'])+1):
    rank = cv.cv_results_['rank_test_score'][i-1]
    s = cv.cv_results_['mean_test_score'][i-1]
    sd = cv.cv_results_['std_test_score'][i-1]
    params = cv.cv_results_['params'][i-1]
    print("{0}. Mean validation neg log loss: {1:.6f} (std: {2:.6f}) - {3}".format(
        rank,
        s,
        sd,
        params
    ))

5. Mean validation neg log loss: -1.098612 (std: 0.000000) - {'C': 1e-09, 'penalty': 'l1'}
4. Mean validation neg log loss: -1.098612 (std: 0.000000) - {'C': 1e-09, 'penalty': 'l2'}
5. Mean validation neg log loss: -1.098612 (std: 0.000000) - {'C': 1e-06, 'penalty': 'l1'}
3. Mean validation neg log loss: -1.098612 (std: 0.000000) - {'C': 1e-06, 'penalty': 'l2'}
5. Mean validation neg log loss: -1.098612 (std: 0.000000) - {'C': 0.001, 'penalty': 'l1'}
2. Mean validation neg log loss: -1.098585 (std: 0.000032) - {'C': 0.001, 'penalty': 'l2'}
8. Mean validation neg log loss: -1.098710 (std: 0.000749) - {'C': 1.0, 'penalty': 'l1'}
1. Mean validation neg log loss: -1.091674 (std: 0.028704) - {'C': 1.0, 'penalty': 'l2'}


**Neural Network**
------------------

In [23]:
from sklearn.neural_network import MLPClassifier
cv = MLPClassifier()


cv = GridSearchCV(clf, param_grid={
    'activation': ["logistic", 'relu', 'tanh']} , scoring = 'neg_log_loss',verbose = 1)
cv.fit(X_train, y_train)

for i in range(1, len(cv.cv_results_['params'])+1):
    rank = cv.cv_results_['rank_test_score'][i-1]
    s = cv.cv_results_['mean_test_score'][i-1]
    sd = cv.cv_results_['std_test_score'][i-1]
    params = cv.cv_results_['params'][i-1]
    print("{0}. Mean validation neg log loss: {1:.6f} (std: {2:.6f}) - {3}".format(
        rank,
        s,
        sd,
        params
    ))

Fitting 3 folds for each of 3 candidates, totalling 9 fits



Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  8.6min finished


1. Mean validation neg log loss: -1.961057 (std: 0.506868) - {'activation': 'logistic'}
2. Mean validation neg log loss: -2.524468 (std: 0.748185) - {'activation': 'relu'}
3. Mean validation neg log loss: -2.739268 (std: 0.762039) - {'activation': 'tanh'}



Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.



Random Forest
=======

In [24]:
from sklearn.ensemble import RandomForestClassifier

clf_RF = RandomForestClassifier()

param_grid = {
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
cv = GridSearchCV(clf_RF, param_grid=param_grid, cv = 10, scoring = 'neg_log_loss')
cv.fit(X_train, y_train)

for i in range(1, len(cv.cv_results_['params'])+1):
    rank = cv.cv_results_['rank_test_score'][i-1]
    s = cv.cv_results_['mean_test_score'][i-1]
    sd = cv.cv_results_['std_test_score'][i-1]
    params = cv.cv_results_['params'][i-1]
    print("{0}. Mean validation neg log loss: {1:.6f} (std: {2:.6f}) - {3}".format(
        rank,
        s,
        sd,
        params
    ))

1. Mean validation neg log loss: -2.059936 (std: 1.733276) - {'bootstrap': True, 'criterion': 'gini'}
4. Mean validation neg log loss: -3.862147 (std: 3.467242) - {'bootstrap': True, 'criterion': 'entropy'}
3. Mean validation neg log loss: -3.855064 (std: 3.761134) - {'bootstrap': False, 'criterion': 'gini'}
2. Mean validation neg log loss: -2.478046 (std: 2.043563) - {'bootstrap': False, 'criterion': 'entropy'}
