### Avoid hard-coding file path
* Saves the effort of typing long paths.
* Saves the effort of dealing with the different path conventions of different operating systems.

In [4]:
%%writefile filepath.py
import os
from flask import Flask

# Absolute path of the directory that contains this script.
basedir = os.path.abspath(os.path.dirname(__file__))

# Build the SQLite URI once so the configured value and the printed value
# are guaranteed to be the same string.
database_uri = 'sqlite:///' + os.path.join(basedir, 'data.sqlite')

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = database_uri

print(basedir)
print(database_uri)

Overwriting filepath.py


In [5]:
!python filepath.py

Traceback (most recent call last):
  File "filepath.py", line 2, in <module>
    from flask import Flask
ModuleNotFoundError: No module named 'flask'


In [1]:
If the code in the cell above is run directly in the notebook (that is, without `%%writefile filepath.py` followed by `python filepath.py`), then `__file__` will be undefined.

Writing filepath.py


### Split train and test data from skewed and multi-label dataset 
* Normally, for non-skewed, single-label data, we can use an option such as `stratify=y` in the scikit-learn splitting functions to split `y` according to a predetermined ratio. However, for multi-label and skewed data sets, it is not easy to make sure that rare labels appear in both the training and the test sets.

* The following function multilabel_train_test_split() is developed to handle the case above. The code is from the course "Machine Learning with the Experts School Budgets" of DataCamp. 

In [None]:
from warnings import warn

import numpy as np
import pandas as pd

def multilabel_sample(y, size=1000, min_count=1, seed=None):  # min_count = 5 is temporarily changed to 1. Same for the functions below
    """Return row indices of a random sample in which every label is present.

    Parameters
    ----------
    y : numpy.ndarray or pandas.DataFrame
        Two-dimensional binary indicator matrix (rows are examples, columns
        are labels). Every value must be 0 or 1 (booleans are accepted).
    size : int or float
        If ``size > 1`` it is the number of rows to sample. If ``size <= 1``
        it is treated as a fraction and ``floor(len(y) * size)`` rows are
        sampled.
    min_count : int
        Minimum number of sampled rows that must carry each label.
    seed : int or None
        Seed for the random number generator. ``None`` gives a different
        sample on every call.

    Returns
    -------
    numpy.ndarray
        Sampled row identifiers: index labels when `y` is a DataFrame,
        otherwise integer row positions. Each label appears in at least
        `min_count` of the sampled rows.

    Raises
    ------
    ValueError
        If `y` contains values other than 0 and 1, or if some label has
        fewer than `min_count` positive examples.
    """
    # Membership test instead of an element-wise comparison against [0, 1]:
    # the comparison let matrices such as {-1, 1} or {0.5, 1} slip through.
    try:
        is_binary = bool(np.isin(np.unique(y), [0, 1]).all())
    except (TypeError, ValueError):
        is_binary = False
    if not is_binary:
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        # Fractional size: convert to an absolute (integer) number of rows.
        size = int(np.floor(y.shape[0] * size))

    # Guaranteeing min_count rows per label needs at most this many rows.
    min_required = y.shape[1] * min_count
    if min_required > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(min_required, size))
        size = min_required

    # RandomState(None) seeds itself from fresh entropy. The previous
    # fallback, np.random.randint(1), always evaluated to 0 and therefore
    # made the "unseeded" sample identical on every call.
    rng = np.random.RandomState(seed)

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    # First, guarantee at least min_count rows for each label.
    guaranteed = [np.array([], dtype=choices.dtype)]
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        guaranteed.append(rng.choice(label_choices, size=min_count, replace=False))

    # The same row may have been drawn for several labels; keep it once.
    sample_idxs = np.unique(np.concatenate(guaranteed))

    # Every label is covered now, so the rest can be drawn uniformly from
    # the rows that have not been picked yet.
    sample_count = int(size - sample_idxs.shape[0])
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=1, seed=None):
    """Sample rows of `df` so that every class in `labels` is represented.

    The row labels are chosen by ``multilabel_sample`` from the binary
    matrix `labels` (each class appears at least `min_count` times in a
    sample of size `size`), and the matching rows of `df` are returned via
    label-based lookup.
    """
    chosen_rows = multilabel_sample(
        labels,
        size=size,
        min_count=min_count,
        seed=seed,
    )
    sampled_df = df.loc[chosen_rows]
    return sampled_df


def multilabel_train_test_split(X, Y, size, min_count=1, seed=None):
    """Split `X` and `Y` into train and test parts for multi-label data.

    The test rows are chosen by ``multilabel_sample(Y, ...)``; every
    remaining row goes to the training set.

    Parameters
    ----------
    X : features, indexable with a boolean row mask (DataFrame or ndarray).
    Y : pandas.DataFrame or numpy.ndarray
        Binary label matrix with one row per row of `X`.
    size, min_count, seed
        Forwarded unchanged to ``multilabel_sample``; `size` therefore
        describes the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)

    # np.isin accepts both a pandas Index and a plain ndarray. The previous
    # `index.isin(...)` only existed on a pandas Index, so the function
    # raised AttributeError whenever Y was a NumPy array.
    test_set_mask = np.isin(index, test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])

# Keep only the numeric feature columns; missing values become the sentinel -1000.
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Turn the label columns into dummy (indicator) columns.
label_dummies = pd.get_dummies(df[LABELS])

# size=0.2 is passed through as the test-set size; seed=123 makes the split repeatable.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    numeric_data_only,
    label_dummies,
    size=0.2,
    seed=123,
)

### DataFrame tricks

In [None]:
data_frame.columns.tolist()  # column labels as a plain Python list
data_frame.apply(" ".join, axis=1)  # per row, join all text items with a single space between them

### itertools.chain vs itertools.chain.from_iterable(iterables) 
* The first takes zero or more arguments, each of which is an iterable; the second takes a single argument that is expected to produce the iterables. That single argument does not have to be a list: it can be any iterable, including a generator that yields the iterables one at a time.

In [None]:
# chain(): each iterable is passed as its own positional argument.
itertools.chain(list1, list2, list3)

# chain.from_iterable(): ONE argument, an iterable that yields the iterables.
iterables = [list1, list2, list3]
itertools.chain.from_iterable(iterables)

In [None]:
def generate_iterables():
    """Yield range(0), range(1), ..., range(9), one range object at a time."""
    yield from map(range, range(10))

# The generator is consumed lazily: each range is requested only when needed.
itertools.chain.from_iterable(generate_iterables())

Using the second form is usually a matter of convenience, but because it loops over the input iterables lazily, it is also the only way to chain an infinite number of finite iterators.

### Plot decision boundary
This function is from the deep learning course for planar data classification.  

https://stackoverflow.com/questions/10894323/what-does-the-c-underscore-expression-c-do-exactly 
About np.c_ function, better explanation than the official document

**Remove the testing code when applying this function**.  

In [None]:
def plot_decision_boundary(pred_func, X, Y):
    """Draw the decision regions of a 2-feature classifier with the data on top.

    Relies on `np` and `plt` being available in the enclosing namespace
    (presumably NumPy and matplotlib.pyplot; confirm against the imports).

    Parameters
    ----------
    pred_func : callable
        Called once with an array of shape (n_grid_points, 2) whose two
        columns are the grid coordinates. Its result is reshaped to the
        grid, so it must return one value per grid point.
    X : array
        Indexed as ``X[0, :]`` and ``X[1, :]``: row 0 holds the first
        feature and row 1 the second, one column per example.
    Y : array
        Labels; squeezed and used to colour the scattered points.
    """
    # Set min and max values and give them some padding.
    x_min, x_max = X[0, :].min() - .5, X[0, :].max() + .5
    y_min, y_max = X[1, :].min() - .5, X[1, :].max() + .5
    h = 0.01

    # Generate a grid of points with distance h between them.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predict the function value for the whole grid: np.c_ pairs the
    # flattened coordinates into one (x, y) row per grid point.
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the filled contour first, then the examples on top of it.
    plt.contourf(xx, yy, Z, cmap='summer')  # alternative: cmap=plt.cm.Spectral
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(Y), cmap=plt.cm.Spectral)
    
# Fit a cross-validated logistic regression; X is transposed because
# fit() is given one row per example here.
clf = sklearn.linear_model.LogisticRegressionCV()
clf.fit(X.T, np.squeeze(Y))

# The bound method is passed directly as the prediction function.
plot_decision_boundary(clf.predict, X, Y)
plt.title("Logistic Regression")
    

### Verifying data type, array or DataFrame shape

In [None]:
import numpy as np
b = 0.5
# Type check: fails immediately if b is not a float.
assert isinstance(b, float)

# Shape check: W1 must have exactly n_h rows and n_x columns.
assert W1.shape == (n_h, n_x)