### 随机追踪（Random Pursuit）

In [1]:
import random
import warnings
import numpy as np
from sklearn import linear_model
from multiprocessing import Pool
from functools import partial

In [2]:
# Silence DeprecationWarning messages for the rest of the run.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Size of the multiprocessing.Pool created in random_pursuit. Despite the
# name, Pool starts worker processes, not threads.
thread_num = 6

In [3]:
def random_pursuit(data_x, data_y, inx_h, l_inx, N, inx_p = -1, n_pred = 1, s_size = -1, percent_value = -1, method = 'inverse'):
    """Predict points with a loss-weighted average of linear fits on random subsets.

    The last ``l_inx`` history points (ending at index ``inx_h``) are shuffled
    and split into ``l_inx // N`` groups of ``N`` points. One linear regression
    is fitted per group (in a process pool), every line predicts the target
    point, and the final prediction is the average of those predictions
    weighted by ``calculate_weight`` applied to the lines' training losses.
    When sliding is enabled, the window then moves forward by one point: the
    oldest index is replaced by the newest one, only the affected lines are
    re-fitted, and all other lines are re-evaluated at the new target point.

    # Arguments
        data_x: A ndarray, a series of x values (one feature per point).
        data_y: A ndarray, the corresponding y values. May be shorter than
            data_x when the trailing x values have no known y yet.
        inx_h: Integer, max index of history data.
        l_inx: Integer, length of history data used for pursuit, i.e. the
            size of the sliding window.
        N: Integer, number of points in each regression. If l_inx is not a
            multiple of N, the leftover (l_inx % N) shuffled points are not
            used by any line.
        inx_p: Integer, the index of the data point to predict. If set to -1,
            the point with index (inx_h + 1) will be predicted.
        n_pred: Integer, number of points to predict with the sliding window.
        s_size: Integer, only its sign is used: sliding prediction is enabled
            when s_size > 0 (and n_pred > 0). The window always moves one
            point per iteration.
        percent_value: Float, currently unused.
        method: String, 'inverse' or 'softmax', how weights are calculated
            from losses.
    # Returns
        A dict mapping the 0-based iteration number to the prediction made in
        that iteration.
    # Raises
        ValueError: Input error OR calculation error.
    """

    if len(data_x) == 0 or len(data_y) == 0 or len(data_x) < len(data_y): # len(data_x) may exceed len(data_y): x values to predict whose y is unknown
        raise ValueError('Input data is invalid, Exit.')
    if inx_p >= len(data_x) or (inx_p == -1 and inx_h + 1 >= len(data_x)): # Make sure data_x[inx_h + 1] exists
        raise ValueError('inx_p provided is out of the bound of data_x, Exit.')
    if inx_h - l_inx + 1 < 0: # Make sure data_x[inx_h - l_inx + 1] exists
        raise ValueError('l_inx provided is out of the bound of data_x, Exit.')
    if N <= 0 or l_inx < N: # At least one full group of N points is required
        raise ValueError('N must be positive and not larger than l_inx, Exit.')
    if inx_p == -1: # Default target is the point right after the history window
        inx_p = inx_h + 1
    if inx_p + n_pred - 1 >= len(data_x): # Make sure data_x[inx_p + n_pred - 1] exists
        raise ValueError('n_pred provided is out of the bound of data_x, Exit.')

    separator = "===================================================="

    # The number of lines is driven by the window length, not by the total
    # data length; each ROW of shuffle_inx holds the N indices of one line.
    n_lines = l_inx // N
    history_inx = list(range(inx_h - l_inx + 1, inx_h + 1))
    random.shuffle(history_inx)
    shuffle_inx = np.array(history_inx[:n_lines * N]).reshape(n_lines, N)
    print("Shuffled inx is: ")
    print(shuffle_inx)
    print(separator)

    if s_size > 0 and n_pred > 0: # Do slide prediction
        n_iter = n_pred
    else: # Predict once
        n_iter = 1

    reg_results = []
    pred_results = {}

    pool = Pool(thread_num)
    try:
        for slide_time in range(n_iter):
            print("... ... This is iteration %s ... ..." % (slide_time + 1))
            print(separator)

            if slide_time == 0: # The 1st time, run all n_lines regressions
                rows_to_fit = list(range(n_lines))
            else: # Window slides: only lines that lose a point need a re-fit
                # inx_h was already advanced at the end of the previous iteration.
                if inx_h + 1 >= len(data_x):
                    raise ValueError('After window slides, and inx_h is out of the bound of data_x, Exit.')
                if inx_h + 1 >= len(data_y):
                    raise ValueError("WARN: After window slides, no real y value available for index inx_h.") # TODO: may use predicted y value here

                inx_in = inx_h
                inx_out = inx_h - l_inx
                print("After window slides, point at %s is involved, point at %s is eliminated" % (inx_in, inx_out))
                print(separator)
                inx_replace = np.where(shuffle_inx == inx_out)
                shuffle_inx[inx_replace] = inx_in
                print("After window slides, Shuffled inx changes to: ")
                print(shuffle_inx)
                print(separator)
                # Zero, one or several rows may contain the eliminated index.
                rows_to_fit = [int(row) for row in np.unique(inx_replace[0])]

            # Fit the selected lines in parallel and predict the target point.
            train_sets = [(data_x[list(shuffle_inx[i])], data_y[list(shuffle_inx[i])], i) for i in rows_to_fit]
            test_y = data_y[inx_p] if inx_p < len(data_y) else None
            test_set = (data_x[inx_p], test_y)
            fresh_results = pool.map(partial(fit_linear_model, test_set), train_sets)
            print("Regression result - inx, (regr.coef_[0,0], regr.intercept_[0]), regr._residues[0], pred_y[0,0], pred_resid:")
            print(fresh_results) # It looks like: [(0, (0.99, 2.00), 2.07e-12, 51.00, None), (...), (...)]
            print(separator)

            if slide_time == 0:
                reg_results = fresh_results
            else:
                # Re-fitted lines replace their old entry; every other line keeps
                # its coefficients but must be re-evaluated at the NEW target
                # point, otherwise stale predictions would be averaged in.
                refitted = {result[0]: result for result in fresh_results}
                reg_results = [
                    refitted[old[0]] if old[0] in refitted else _repredict_line(old, test_set)
                    for old in reg_results
                ]
                print("Refactor Previous Regression result to: ")
                print(reg_results)
                print(separator)

            # Weights come from the training (forward) loss of every line.
            forward_losses = [reg_result[2] for reg_result in reg_results]
            weights = calculate_weight(forward_losses, method)
            print("Weights are %s" % str(weights))

            # Final prediction is the weighted average over all lines.
            preds = [reg_result[3] for reg_result in reg_results]
            pred = np.average(preds, weights = weights)
            pred_results[slide_time] = pred
            print("Prediction is %f" % pred)
            print(separator)

            # Move both the history end and the target one point forward.
            inx_h += 1
            inx_p += 1
    finally:
        # Always release the worker processes, even when an iteration raises.
        pool.close()
        pool.join()
    return pred_results


def _repredict_line(reg_result, test_set):
    """Re-evaluate an already fitted single-feature line at a new test point."""
    inx, (coef, intercept), forward_loss = reg_result[0], reg_result[1], reg_result[2]
    test_x, test_y = test_set
    pred_y = coef * test_x + intercept
    # Keep the (1, 1) array shape that fit_linear_model uses for the residual.
    pred_resid = None if test_y is None else np.array([[(pred_y - test_y) ** 2]])
    return inx, (coef, intercept), forward_loss, pred_y, pred_resid

In [4]:
def fit_linear_model(test_set, train_set):
    """Fit one single-feature linear regression and predict the test point.

    # Arguments
        test_set: Tuple (test_x, test_y). test_y is None when the real y value
            of the test point is unknown.
        train_set: Tuple (train_x, train_y, inx). inx is the uid of this line,
            corresponding to its row id in shuffle_inx.
    # Returns
        A tuple (inx, (coef, intercept), forward_loss, pred_y, pred_resid):
        forward_loss is the sum of squared residuals on the training points,
        pred_y is the scalar prediction for test_x, and pred_resid is the
        squared prediction error as a (1, 1) array, or None if test_y is None.
        It looks like: (0, (0.99, 2.00), 2.07e-12, 51.00, None)
    """
    regr = linear_model.LinearRegression()
    # reshape (not np.resize): np.resize does not accept -1 as "infer this
    # dimension" and would either raise or drop a sample depending on version.
    train_x = np.asarray(train_set[0]).reshape(-1, 1)
    train_y = np.asarray(train_set[1]).reshape(-1, 1)
    inx = train_set[2] # The uid of this line, corresponding to its row id in shuffle_inx
    test_x = np.asarray(test_set[0]).reshape(-1, 1) # predict() needs a 2D input

    regr.fit(train_x, train_y) # training
    pred_y = regr.predict(test_x)

    # Training loss computed from the fit itself instead of the private
    # regr._residues attribute, which is not part of the public estimator API.
    forward_loss = float(np.sum((regr.predict(train_x) - train_y) ** 2))

    if test_set[1] is not None: # If know the real y value
        test_y = np.asarray(test_set[1]).reshape(-1, 1)
        pred_resid = (pred_y - test_y) ** 2
    else: # If do not know the real y value
        pred_resid = None
    return inx, (regr.coef_[0, 0], regr.intercept_[0]), forward_loss, pred_y[0, 0], pred_resid

In [5]:
def calculate_weight(forward_losses, method='inverse'):
    """Turn per-line losses into averaging weights (smaller loss, larger weight).

    # Arguments
        forward_losses: Sequence or ndarray of non-negative losses, one per line.
        method: String, 'inverse' (weights proportional to 1 / loss) or
            'softmax' (softmax over the negated losses).
    # Returns
        A ndarray of weights that sums to one along axis 0.
    # Raises
        ValueError: If method is neither 'inverse' nor 'softmax'.
    """
    forward_losses = np.asarray(forward_losses, dtype=float)
    if method == 'inverse':
        inverse_losses = (1.0 + 1e-50) / (forward_losses + 1e-50) # avoid divided by 0
        weights = scale_one(inverse_losses) # scale to one
    elif method == 'softmax':
        # Negate so that a lower loss yields a higher weight, consistent with
        # the 'inverse' method.
        scores = -forward_losses
        if scores.size:
            # Shifting by the max leaves the softmax unchanged but keeps
            # np.exp from overflowing on large losses.
            scores = scores - np.max(scores, axis=0)
        exp_scores = np.exp(scores)
        weights = exp_scores / np.sum(exp_scores, axis=0)
    else:
        raise ValueError('Weight calculation method is invalid, Exit.')
    return weights

In [6]:
def scale_one(x):
    """Normalize x so that its entries sum to one along axis 0."""
    total = np.sum(x, axis=0)
    return x / total

In [7]:
############################
##          Main          ##
############################
if __name__ == '__main__':
    # Demo data: 55 x values, with y = x + 2 known for the first 54 only.
    data_x = np.arange(55)
    data_y = np.arange(54) + 2.0

    # Window of 49 points ending at index 48, 7 points per regression,
    # 3 sliding predictions.
    inx_h = 48
    l_inx = 49
    N = 7
    n_pred = 3
    s_size = 1
    method = 'softmax'

    pred_results = random_pursuit(
        data_x, data_y, inx_h, l_inx, N,
        inx_p=-1, n_pred=n_pred, s_size=s_size, percent_value=-1, method=method,
    )
    print(pred_results)

Shuffled inx is: 
[[44 13 23 32 34 15 39]
 [38 45 36 43  0 40 10]
 [29  5 35 22  7 30 48]
 [17 14  3 28 24  4 42]
 [37  8 31 27  6 19 33]
 [18 11 25 16 21 26 41]
 [ 9 46  1  2 20 12 47]]
... ... This is iteration 1 ... ...
Regression result - inx, (regr.coef_[0,0], regr.intercept_[0]), regr._residues[0], pred_y[0,0], pred_resid:
[(0, (0.9999999999999999, 2.0000000000000036), 7.099748146989106e-30, 51.0, array([[0.]])), (1, (1.0, 2.0), 5.383975678133406e-29, 51.0, array([[0.]])), (2, (1.0000000000000004, 1.9999999999999893), 3.1603740015416785e-29, 51.000000000000014, array([[2.01948392e-28]])), (3, (0.9999999999999999, 2.0000000000000018), 2.2137409152764644e-29, 50.99999999999999, array([[5.04870979e-29]])), (4, (1.0000000000000004, 1.9999999999999893), 7.888609052210118e-31, 51.000000000000014, array([[2.01948392e-28]])), (5, (0.9999999999999998, 2.0000000000000036), 4.979684464207637e-30, 50.999999999999986, array([[2.01948392e-28]])), (6, (1.0000000000000002, 1.9999999999999964), 1