# Libraries

In [163]:
import operator
import os
import ast
import glob

import numpy as np
import pandas as pd
import holoviews as hv
import hvplot.pandas
import holoviews

from time import time
from functools import reduce

from scipy import stats
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence, partial_dependence
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from sklearn.model_selection import KFold, ParameterSampler, RandomizedSearchCV
from sklearn.decomposition import PCA,KernelPCA
from sklearn.svm import SVC
from sklearn import metrics

from Tools.labelling import pseudo_fit, SemiSup_RandomizedSearchCV, write_out

In [164]:
np.random.seed(42)
hv.extension('bokeh')

# Data

In [3]:
# import data
path = os.path.join('.','Data','AnnotatedCells.csv')
data = pd.read_csv(path, index_col='Unnamed: 0')

# Recode Data
data.Human.loc[data.Human == 9] = 1

# Data must have variance
data = data.drop(columns=data.describe().columns[(data.var() == 0)])

# Must exist in both datasets or be our label
data = data.loc[:,['Human','FEATURE', 'SHAPE.angularity', 'SHAPE.angularity.amplitude',
                   'SHAPE.angularity.count', 'SHAPE.angularity.max',
                   'SHAPE.angularity.mean', 'SHAPE.angularity.median',
                   'SHAPE.angularity.mid', 'SHAPE.angularity.stdev',
                   'SHAPE.angularity.variation', 'SHAPE.area', 'SHAPE.aspectRatio',
                   'SHAPE.circularity', 'SHAPE.curvature', 'SHAPE.feret',
                   'SHAPE.feret.max', 'SHAPE.feret.min', 'SHAPE.length',
                   'SHAPE.orientation', 'SHAPE.perimeter', 'SHAPE.roundness',
                   'SHAPE.sinuosity', 'SHAPE.solidity', 'SHAPE.width',
                   'SHAPE.width.amplitude', 'SHAPE.width.count', 'SHAPE.width.max',
                   'SHAPE.width.mean', 'SHAPE.width.median', 'SHAPE.width.mid',
                   'SHAPE.width.min', 'SHAPE.width.stdev', 'SHAPE.width.variation']].dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
class_balance = data.Human.replace((1-data.Human.value_counts()/data.shape[0]).to_dict())
(data.Human.value_counts()/data.shape[0]).hvplot.bar(label='Label Imbalance')

## Exploratory Analysis

In [5]:
# We will explore kernel PCA to uncover 
#  possible mapping between data and the labels
kernels = ['linear','rbf','poly','cosine']

def pca_plot(kernel='linear'):
    pca_features = make_pipeline(RobustScaler(), 
                                 KernelPCA(kernel=f'{kernel}'))\
                        .fit_transform(data.drop(columns=['Human']))

    return pd.concat([pd.DataFrame(pca_features).iloc[:,:2],
               data.loc[:,['Human']]], axis=1)\
                    .hvplot.scatter(x='0', y='1', 
                                    color='Human',
                                    label=f'{kernel} Kernel PCA')\
                    .options(cmap='Plasma', color_levels=2)

pca_dict = {f:pca_plot(f) for f in kernels}

hmap = hv.HoloMap(pca_dict, kdims='kernels')
hmap

# Benchmark

SVM

In [6]:
# svc = SVC(kernel='poly')

# # specify parameters and distributions to sample from
# svc_param_dist = {"C": stats.uniform(0.5, 1.5),
#               "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}

Random Forest

In [7]:
# build a classifier
rf = RandomForestClassifier()

# specify parameters and distributions to sample from
rf_param_dist = {"max_depth": stats.randint(3, 12),
              "max_features": stats.randint(3, 22),
              "min_samples_split": stats.randint(2, 11),
                 "n_estimators": stats.randint(10, 100),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
              "n_jobs":[-1],

Adaptive Boosting

In [8]:
# build a classifier
ab = AdaBoostClassifier()

# specify parameters and distributions to sample from
ab_param_dist = {"base_estimator":[DecisionTreeClassifier(max_depth=1)],
                 "base_estimator__max_depth": stats.randint(1, 12),
                 "base_estimator__min_samples_split": stats.randint(2, 11),
                 "base_estimator__criterion": ["gini", "entropy"],
                 "n_estimators":stats.randint(5, 75), 
                 "learning_rate":stats.beta(a=2,b=5)}

GBM

In [9]:
# build a classifier
gb = GradientBoostingClassifier()

# specify parameters and distributions to sample from
gb_param_dist = {"max_depth": stats.randint(1, 12),
                 "min_samples_split": stats.randint(2, 11),
                 "loss": ["exponential", "deviance"],
                 "n_estimators":stats.randint(5, 75), 
                 "learning_rate":stats.beta(a=2,b=5)}

## Supervised

In [10]:
models = dict(rf={'estimator':RandomForestClassifier(), 'param_distributions':rf_param_dist},
              ab={'estimator':AdaBoostClassifier(), 'param_distributions':ab_param_dist},
              gb={'estimator':GradientBoostingClassifier(), 'param_distributions':gb_param_dist},)

In [11]:
search = {f:RandomizedSearchCV(n_iter=50, cv=5, n_jobs=-1, return_train_score=False, **models[f])\
          .fit(X=data.drop(columns=['Human']), y=data.Human, sample_weight=class_balance) for f in models.keys()}

for model in search.keys():
    write_out(pd.DataFrame(search[model].cv_results_), model)

## Semi-supervised

__Pseudo Labelling__: Transductive

In [12]:
models = dict(rf={'estimator':RandomForestClassifier(), 'param_distributions':rf_param_dist},
              ab={'estimator':AdaBoostClassifier(), 'param_distributions':ab_param_dist},
              gb={'estimator':GradientBoostingClassifier(), 'param_distributions':gb_param_dist})

In [13]:
search = {f:SemiSup_RandomizedSearchCV(n_iter=50, cv=5, pseudo=True, **models[f])\
          .fit(X=data.drop(columns=['Human']), y=data.Human, sample_weight=class_balance) for f in models.keys()}

for model in search.keys():
    write_out(search[model].cv_results_, 'pseudo__'+model)

__Label Spreading__: Inductive

In [17]:
%%capture
ls_pipe = make_pipeline(RobustScaler(), LabelSpreading())

ls_param_dist = {'labelspreading__kernel':['knn','rbf'], 
              'labelspreading__gamma':stats.norm(loc=20, scale=10), 
              'labelspreading__n_neighbors':stats.randint(3, 40), 
              'labelspreading__alpha':stats.beta(a=2,b=5), 
              'labelspreading__max_iter':[30], 
              'labelspreading__tol':[0.001]}

ls_search = SemiSup_RandomizedSearchCV(estimator=ls_pipe, param_distributions=ls_param_dist, n_iter=100)

ls_search.fit(data.drop(columns=['Human']), data.Human)

# write out results to csv and serialized feature format
write_out(pd.DataFrame(ls_search.cv_results_), 'ls')

__Label Propogation__: Inductive

In [16]:
%%capture
lp_pipe = make_pipeline(RobustScaler(), LabelPropagation())

lp_param_dist = {'labelpropagation__kernel':['knn','rbf'], 
                 'labelpropagation__gamma':stats.norm(loc=20, scale=10), 
                 'labelpropagation__n_neighbors':stats.randint(3, 40), 
                 'labelpropagation__alpha':stats.beta(a=2,b=5), 
                 'labelpropagation__max_iter':[30], 
                 'labelpropagation__tol':[0.001]}

lp_search = SemiSup_RandomizedSearchCV(estimator=lp_pipe, param_distributions=lp_param_dist, n_iter=100)

lp_search.fit(data.drop(columns=['Human']), data.Human)

# write out results to csv and serialized feature format
write_out(pd.DataFrame(lp_search.cv_results_), 'lp')

__Naive Bayes__

In [16]:
# %%opts Histogram [width=300 height=200]

# #Plot of distributions
# reduce(operator.add, [ x[0] for x in data.astype('f8').apply(lambda x: [pd.Series(x).hvplot.hist()],0)])

In [17]:
# # Varitation Semi-supervised models

# from pomegranate import NaiveBayes, GammaDistribution, PoissonDistribution, UniformDistribution, NormalDistribution, BetaDistribution, BernoulliDistribution, LogNormalDistribution
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Human']).loc[:,:], data.Human, test_size=0.2)
# model = NaiveBayes.from_samples(distributions=GammaDistribution, X=pd.concat([X_train, X_test],0), y=pd.concat([y_train, pd.Series(-1).repeat(y_test.shape[0])],0))
# metrics.accuracy_score(y_test, model.predict(X=X_test))

# Model Selection

In [18]:
path = os.path.join('.','Data','Models','*.csv')
store = pd.concat(map(pd.read_csv, glob.glob(path)))

In [19]:
store['mean-variance'] = ((store.mean_test_score**1)/np.sqrt(store.std_test_score))

In [20]:
store['model_name'] = store['model'].replace({"model": dict(ab='AdaBoost', rf='RandomForest',gb='Gradient Boosting Machine', 
                                       pab='Pseudo-Labelling AdaBoost', prf='Pseudo-Labelling RF',pgb='Pseudo-Labelling GBM',
                                       lp='LabelPropogation',ls='LabelSpreading')})

In [22]:
%%output filename='./Media/LabellingModelSearchMeanVariance' fig='png'
%%opts Scatter [width=800 height=400 toolbar=None]
store.sort_values('mean-variance', ascending=False).head(100).hvplot.scatter(x='std_test_score', 
                      y='mean_test_score', 
                      color='model_name', size=15, 
                      alpha=0.8, 
                      label='Model Mean-Variance Plot')\
                        .redim.label(std_test_score='Standard Deviation of Test Score', 
                                     mean_test_score='Mean of Test Score') 



In [53]:
store.loc[~store.model.str.startswith('pseudo__'),:].groupby('model').apply(lambda x: pd.DataFrame(x)\
                                          .nlargest(1, ['mean-variance'])\
                                          .loc[:,~store.columns.str.startswith('params__')]).nlargest(3, ['mean_test_score'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,mean_test_score,std_test_score,mean_score_time,std_score_time,params,model,mean-variance,model_name
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rf,25,25,0.821156,0.013562,0.164577,0.059986,"{'bootstrap': False, 'criterion': 'gini', 'max...",rf,7.051276,rf
gb,36,36,0.816141,0.00623,0.003853,7.3e-05,"{'learning_rate': 0.17702691343177382, 'loss':...",gb,10.339917,gb
ab,38,38,0.796084,0.00674,0.002579,0.000286,{'base_estimator': DecisionTreeClassifier(clas...,ab,9.696974,ab


In [131]:
lookup = dict(rf=RandomForestClassifier(),
             gb=GradientBoostingClassifier(),
             ab=AdaBoostClassifier())

In [168]:
models = store.loc[~store.model.str.startswith('pseudo__'),:].groupby('model').apply(lambda x: pd.DataFrame(x)\
                                          .nlargest(1, ['mean-variance'])\
                                          .loc[:,~store.columns.str.startswith('params__')])\
                                          .nlargest(3, ['mean_test_score'])\
                                          .loc[:,['model','params']]\
                                          .apply(lambda x: (x[0],lookup[x[0]].set_params(**eval(x[1]))),1).tolist()

# Labelling

__Data__

In [135]:
# We import our unlabelled data
path = os.path.join('.','Data','AllCells.csv')
all_data = pd.read_csv(path, index_col='Unnamed: 0').dropna()

__Ensemble__

In [142]:
d = pd.concat([data, all_data], join='outer')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [169]:
# create ensemble
ensemble = VotingClassifier(estimators=models, voting='soft')

# We create a new classifier with out best parameters
ensemble.pseudo_fit = pseudo_fit.__get__(ensemble)

In [170]:
ensemble.fit(X=data.drop(columns=['Human']), y=data.Human)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=9, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fr...'best'),
          learning_rate=0.17281385282196798, n_estimators=5,
          random_state=None))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [224]:
ensemble.pseudo_fit = pseudo_fit.__get__(ensemble)

In [227]:
ensemble.pseudo_fit(X=d.loc[:,data.drop(columns=['Human']).columns], y=d.Human.fillna(-1))

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=9, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fr...'best'),
          learning_rate=0.17281385282196798, n_estimators=5,
          random_state=None))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

__Serializing__

In [228]:
from joblib import dump, load
dump(ensemble, './Data/Models/labelling_ensemble.joblib') 

['./Data/Models/labelling_ensemble.joblib']

__Output__

In [229]:
# Predict
pr = ensemble.predict_proba(X=all_data.loc[:,data.drop(columns=['Human']).columns.tolist()])
print(f'Pseudo-Labelling Presents a Yield of {(pd.DataFrame(pr).iloc[:,1]>0.75).sum()}')

Pseudo-Labelling Presents a Yield of 29223


In [230]:
# wrote to data
labelled_data = all_data
labelled_data['Machine'] = round(pd.DataFrame(pr).iloc[:,1]>0.75)

In [231]:
# We write out our results to disk
write_out(labelled_data, 'LabelledData', parent_dir=True, scores=False, feather=False)