In [None]:
from lightjob.cli import load_db
from lightjob.db import SUCCESS, RUNNING
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from pyearth import Earth
from sklearn.gaussian_process import GaussianProcess
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score, KFold, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.gaussian_process import GaussianProcess
from sklearn.base import clone
from sklearn.cross_validation import train_test_split
import json
import collections
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
%matplotlib inline

class EarthOneVsRestClassifier(BaseEstimator):
    """One-vs-rest classifier whose per-class model is ``Earth`` -> ``LogisticRegression``.

    All keyword arguments given to the constructor are forwarded unchanged to
    ``Earth``.  They are also kept on the instance and exposed through
    ``get_params`` / ``set_params``: the default ``BaseEstimator`` introspection
    ignores ``**params`` in ``__init__``, so without these overrides
    ``sklearn.base.clone`` would rebuild the classifier with no arguments and
    the configured Earth settings would be lost.
    """

    def __init__(self, **params):
        # Shallow copy: the values themselves must stay the very same objects
        # so that clone()'s parameter sanity check succeeds.
        self.params = dict(params)
        self._build()

    def _build(self):
        """(Re)create the wrapped one-vs-rest pipeline from ``self.params``."""
        pipeline = Pipeline([
                ('earth', (Earth(**self.params))),
                ('logistic', LogisticRegression())
            ])
        self.clf = OneVsRestClassifier(pipeline)

    def get_params(self, deep=True):
        """Return the Earth keyword arguments this classifier was built with."""
        return dict(self.params)

    def set_params(self, **params):
        """Update the Earth keyword arguments and rebuild the (unfitted) pipeline."""
        self.params.update(params)
        self._build()
        return self

    def fit(self, X, y):
        """Fit the wrapped one-vs-rest model; returns whatever its ``fit`` returns."""
        return self.clf.fit(X, y)

    def predict(self, X):
        """Predict labels for ``X`` with the wrapped one-vs-rest model."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for ``X`` with the wrapped one-vs-rest model."""
        return self.clf.predict_proba(X)

class EnsembleRegressor(object):
    """Average the predictions of a collection of regressors.

    ``regs`` holds the member regressors.  Each member must provide
    ``predict(X)`` and, when a standard deviation is requested,
    ``predict(X, return_std=True)`` returning a ``(mean, std)`` pair.
    """

    def __init__(self, regs=None):
        self.regs = regs

    def fit(self, X, y):
        """Do nothing (the members are used exactly as given) and return ``self``."""
        return self

    def predict(self, X, return_std=False):
        """Return the member-averaged prediction for ``X``.

        With ``return_std=True`` a ``(mean, std)`` pair is returned, where
        ``std`` is the square root of the summed squared member deviations
        divided by the number of members.
        """
        if not return_std:
            stacked = np.vstack([member.predict(X) for member in self.regs]).T
            return np.mean(stacked, axis=1)

        outputs = [member.predict(X, return_std=True) for member in self.regs]
        # One column per member, one row per sample.
        member_means = np.vstack([mean for mean, _ in outputs]).T
        member_stds = np.vstack([std for _, std in outputs]).T
        n_members = member_stds.shape[1]
        combined_std = np.sqrt((member_stds ** 2).sum(axis=1)) / n_members
        return np.mean(member_means, axis=1), combined_std


def plot_imp(names, values):
    """Draw a bar chart of ``values`` on the current axes, labelled with ``names``.

    ``names`` and ``values`` may be any iterables of equal length (lists,
    arrays, pandas indexes, ``map`` objects, ...).
    """
    # Materialise the inputs so that one-shot iterators have a length.
    names = list(names)
    values = list(values)
    ind = (np.arange(len(names)))
    # The bars are one unit wide and explicitly left-aligned at ``ind``, so
    # ``ind + 0.5`` is the centre of each bar whatever matplotlib's default
    # ``align`` happens to be.
    plt.xticks(ind + 0.5, names)
    plt.bar(ind, values, width=1, align='edge')
    plt.xlabel('variable index')
    plt.ylabel('importance')

    
def mse(model, X, y):
    """Mean squared error of ``model.predict(X)`` against the targets ``y``.

    Predictions and targets that hold the same number of elements but differ
    in shape (for example predictions of shape ``(n,)`` against a column of
    shape ``(n, 1)``) are aligned before subtracting.
    """
    pred = np.asarray(model.predict(X))
    target = np.asarray(y)
    if pred.shape != target.shape and pred.size == target.size:
        # Subtracting (n,) from (n, 1) would broadcast to an (n, n) matrix and
        # silently produce a wrong error value.
        pred = pred.reshape(target.shape)
    return ((pred - target) ** 2).mean()

def acc(model, X, y):
    """Fraction of entries where ``model.predict(X)`` equals ``y``."""
    predicted = model.predict(X)
    matches = predicted == y
    return matches.mean()

def evaluate(model, X, y, score=mse):
    """Cross-validate ``model`` on ``(X, y)`` with a shuffled 2-fold split (seed 4).

    For every fold a fresh clone of ``model`` is fitted on the training part
    and scored with ``score(model, X, y)`` on both parts.

    Returns ``(models, train_scores, test_scores)``: the list of fitted
    clones and two arrays holding one score per fold.
    """
    fitted = []
    scores_in = []
    scores_out = []
    folds = KFold(X.shape[0], n_folds=2, shuffle=True, random_state=4)
    for train_idx, test_idx in folds:
        fold_model = clone(model)
        fold_model.fit(X[train_idx], y[train_idx])
        scores_in.append(score(fold_model, X[train_idx], y[train_idx]))
        scores_out.append(score(fold_model, X[test_idx], y[test_idx]))
        fitted.append(fold_model)
    return fitted, np.array(scores_in), np.array(scores_out)

from sklearn.svm import SVR

def smooth_image(x, y, z, w=100, h=100, model=None):
    """Fit ``model`` on scattered samples ``(x, y) -> z`` and evaluate it on a grid.

    Parameters
    ----------
    x, y : arrays with the sample coordinates (must provide ``min``/``max``).
    z : values observed at those coordinates.
    w, h : number of grid points along x and along y.
    model : object with ``fit(X, z)`` and ``predict(X)``; a new ``SVR()`` is
        created for each call when omitted.

    Returns
    -------
    Array of shape ``(h, w)``: row ``i`` belongs to the ``i``-th y grid value,
    column ``j`` to the ``j``-th x grid value.
    """
    if model is None:
        # A default instance in the signature would be created once and then
        # shared (and refitted) by every call.
        model = SVR()
    X = np.vstack((x, y)).T
    model.fit(X, z)
    grid_x, grid_y = np.meshgrid(
        np.linspace(x.min(), x.max(), w),
        np.linspace(y.min(), y.max(), h)
    )
    xs = np.vstack((grid_x.flatten(), grid_y.flatten())).T
    zs = model.predict(xs)
    # meshgrid with its default indexing returns (h, w) arrays, so the
    # flattened predictions have to be folded back into that same shape.
    zs = zs.reshape((h, w))
    return zs

def flatten_dict(l):
    """Flatten a nested mapping into a single-level ``dict``.

    * Nested mappings are merged into the result; their keys are NOT prefixed
      with the parent key, so equal keys at different levels overwrite each
      other.
    * A list or tuple stored under key ``k`` is expanded into ``k_0``,
      ``k_1``, ... (its elements are stored as they are, without recursion).
    * Every other value is copied unchanged.
    """
    # ``collections.Mapping`` no longer exists on Python >= 3.10; the fallback
    # keeps the function usable on Python 2.
    try:
        from collections.abc import Mapping
    except ImportError:
        from collections import Mapping

    d = {}
    for k, v in l.items():
        if isinstance(v, Mapping):
            d.update(flatten_dict(v))
        elif isinstance(v, (list, tuple)):
            for i, item in enumerate(v):
                d[k+'_{}'.format(i)] = item
        else:
            d[k] = v
    return d

In [None]:
# Open the lightjob database; `load_db` is called without arguments.
db = load_db()

In [None]:
# Keep the successfully finished jobs whose dataset is MNIST.
# A list comprehension (instead of `filter`) guarantees that `jobs` is a real
# list, so later cells can iterate over it more than once on Python 3 too.
jobs = [j for j in db.jobs_with(state=SUCCESS) if j['content']['dataset'] == 'mnist']

In [1]:
# Build one row per job: the model settings, one series per history key, and
# the job's `summary` / `where` fields.
stats = defaultdict(list)
for j in jobs:
    # A job with no (or an empty) history would leave the history columns
    # shorter than the other ones, and pd.DataFrame rejects ragged columns.
    if not j['hist']:
        continue

    for k, v in j['content']['model'].items():
        stats[k].append(v)

    # `hist` is a sequence of dicts; regroup it into one list per key.
    for h in j['hist'][0].keys():
        stats[h].append([a[h] for a in j['hist']])
    stats['summary'].append(j['summary'])
    stats['where'].append(j['where'])
df = pd.DataFrame(stats)

# Scalar summaries of the loss curves.
df['last_g_loss'] = df['g_loss'].apply(lambda l:l[-1])
df['last_d_loss'] = df['d_loss'].apply(lambda l:l[-1])
df['min_g_loss'] = df['g_loss'].apply(lambda l:np.min(l))
# d_loss taken at the position where g_loss is smallest.
df['min_d_loss'] = df.apply(lambda c:c['d_loss'][np.argmin(c['g_loss'])], axis=1)
df['ratio_loss'] = df['last_g_loss'] / df['last_d_loss']

NameError: name 'defaultdict' is not defined

In [None]:
# Show every column when a frame is displayed, then order the rows by
# `ratio_loss`, smallest first.
pd.options.display.max_columns = None
df = df.sort_values('ratio_loss')

In [None]:
# Display the image stored for each row of `df`, in the frame's current order.
from IPython.display import Image, display
summaries = df['summary']
for s in summaries:
    # The `summary` value is used as the directory name under results/.
    filename = 'results/{}/samples00100.png'.format(s)
    print(filename)
    display(Image(filename))

In [None]:
# Columns used as model inputs; get_dummies one-hot encodes the non-numeric ones.
feature_columns = [
    'num_filters_d', 'num_filters_g', 'scale', 'do_batch_norm',
    'start_w', 'start_h', 'filter_size', 'where',
]
df_x = pd.get_dummies(df[feature_columns])
colnames = df_x.columns
df_x.head()

In [None]:
# Feature matrix and regression target.  The double brackets select a
# one-column frame, so `y` is a 2-D column of shape (n, 1).
X = df_x.values
y = df[['last_d_loss']].values
# Fixed seed: the held-out 25% is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

In [None]:
# Candidate regressors, keyed by the name under which their fitted fold
# models are stored in `result`.
models = {
   'Earth': Earth(max_degree=2, max_terms=10,
                  smooth=False, thresh=0, minspan=1,
                  check_every=1,
                  verbose=0,
                  feature_importance_type='rss',
                  endspan=1),
    'RandomForestRegressor': RandomForestRegressor(max_depth=20, n_estimators=10),
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'DummyRegressor': DummyRegressor(),
}
result = {}
cols = defaultdict(list)
for name, m in models.items():
    # The fitted fold models get their own name: rebinding `models` here
    # would replace the dict of candidates with one candidate's fold list.
    fold_models, train, valid = evaluate(m, X_train, y_train)
    # Store the dict key, so looking a row of `results` up in `result`
    # always works, whatever the estimator's class is called.
    cols['model'].append(name)
    cols['train_mean'].append(train.mean())
    cols['train_std'].append(train.std())
    cols['valid_mean'].append(valid.mean())
    cols['valid_std'].append(valid.std())
    result[name] = fold_models
# Lowest validation error first.
results = pd.DataFrame(cols)
results = results.sort_values(by='valid_mean')
results

In [None]:
# `results` is sorted by validation error, so its first row names the best
# candidate; take that candidate's first fold model and score it on the
# held-out data.
best_regressor_name = results.iloc[0]['model']
last_model = result[best_regressor_name][0]
mse(last_model, X_test, y_test)

## Linear regression

In [None]:
# First of the fold models that were fitted for the 'LinearRegression' entry.
lin = result['LinearRegression'][0]

In [None]:
# For each row of coefficients, plot the four smallest and the four largest
# weights next to the names of their columns.
# np.atleast_2d lets the loop handle a 1-D `coef_` (single row) as well.
for imp in np.atleast_2d(lin.coef_):
    indices = list(range(len(imp)))
    low = sorted(indices, key=lambda i: imp[i])[:4]
    high = sorted(indices, key=lambda i: -imp[i])[:4]
    selected = low + high
    # Real lists instead of `map`: on Python 3 `map` is lazy, has no length,
    # and the original lambda would have indexed the map object itself.
    selected_names = [colnames[i] for i in selected]
    selected_values = [imp[i] for i in selected]
    fig = plt.figure(figsize=(12, 8))
    plot_imp(selected_names, selected_values)
    plt.show()

## Earth

In [None]:
# `result` is keyed by the names used in the `models` dict, where the Earth
# entry is spelled 'Earth' (a lower-case 'earth' raises KeyError).
earth = result['Earth'][0]
print(earth.summary())

In [None]:
# Bar chart of the Earth model's feature importances, one bar per input column.
fig = plt.figure(figsize=(20, 5))
plot_imp(colnames, earth.feature_importances_)

## By using images

In [None]:
# Load the exported table from a service on the local machine (port 20000).
# NOTE(review): this rebinds `df`, which the cells above use for the lightjob
# run table.
df = pd.read_csv('http://127.0.0.1:20000/export_data?type=classification&class=gan')

In [None]:
# Peek at the first rows of the loaded table.
df.head()

In [None]:
# Decode and flatten each row's JSON `hypers` string exactly once.
hypers = [flatten_dict(json.loads(h)) for h in df['hypers']]
# Union of all hyper-parameter names, sorted so the column order is the same
# on every run.
colnames = sorted(set().union(*hypers))
print(colnames)
# One column per hyper-parameter; rows that lack it get a missing value.
for col in colnames:
    df[col] = pd.Series([h.get(col) for h in hypers], index=df.index)

In [None]:
# Inputs: the hyper-parameter columns.  Target: a copy of the `label` column.
cols = colnames
x_df = df[cols]
y_df = df['label'].copy()

# The string literal below is never assigned or used, so it has no effect.
# It holds a disabled relabelling that would map 'excellent'/'good'/'okay'
# to '+' and 'bad'/'very_bad' to '-'.
"""
y_df[y_df == 'excellent'] = '+'
y_df[y_df == 'good'] = '+'
y_df[y_df == 'okay'] = '+'
y_df[y_df == 'bad'] = '-'
y_df[y_df == 'very_bad'] = '-'
"""

# `columns=cols` asks get_dummies to encode every column listed in `cols`.
x_df = pd.get_dummies(x_df, columns=cols)
x_df.head()

In [None]:
# Plain arrays for scikit-learn: encoded hyper-parameters and their labels.
X = x_df.values
y = y_df.values

In [None]:
# Fixed seed: the held-out 25% is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

In [None]:
# Candidate classifiers, keyed by the name under which their fitted fold
# models are stored in `result`.
models = {
   'EarthOneVsRestClassifier': EarthOneVsRestClassifier(max_degree=2, max_terms=10,
                  smooth=False, thresh=0, minspan=1,
                  check_every=1,
                  verbose=0,
                  feature_importance_type='rss',
                  endspan=1),
    'RandomForestClassifier': RandomForestClassifier(max_depth=20, n_estimators=10),
    'LogisticRegression': LogisticRegression(),
    'DummyClassifier': DummyClassifier(),
}
result = {}
cols = defaultdict(list)
for name, m in models.items():
    # The fitted fold models get their own name: rebinding `models` here
    # would replace the dict of candidates with one candidate's fold list.
    fold_models, train, valid = evaluate(m, X_train, y_train, score=acc)
    # Store the dict key, so looking a row of `results` up in `result`
    # always works, whatever the estimator's class is called.
    cols['model'].append(name)
    cols['train_mean'].append(train.mean())
    cols['train_std'].append(train.std())
    cols['valid_mean'].append(valid.mean())
    cols['valid_std'].append(valid.std())
    result[name] = fold_models
# Highest validation accuracy first.
results = pd.DataFrame(cols)
results = results.sort_values(by='valid_mean', ascending=False)
results

In [None]:
# Look the best model up here so this cell runs in top-to-bottom order:
# first fold model of the top row of `results`.
best_model = result[results.iloc[0]['model']][0]
# classification_report takes the true labels first and the predictions
# second; swapping them exchanges precision and recall in the report.
print(classification_report(y_test, best_model.predict(X_test)))

In [None]:
# Number of samples per label.
sns.countplot(x=y_df)

In [None]:
# `results` is sorted by validation accuracy (best first); take the first fold
# model of the top-ranked candidate and measure its accuracy on the held-out data.
best_classifier_name = results.iloc[0]['model']
best_model = result[best_classifier_name][0]
acc(best_model, X_test, y_test)

In [None]:
# classification_report takes the true labels first and the predictions
# second; swapping them exchanges precision and recall in the report.
print(classification_report(y_test, best_model.predict(X_test)))