# Task Sequencer

This tool will calculate the optimal order to attempt tasks in a project where each task has some probability of failing.

When we are uncertain about the feasibility of a project, we should strive to get as much information about the difficulty as quickly as possible. This suggests that we should start with tasks with a high failure rate.

For a deeper discussion of this approach and related techniques see <a href="https://cs.stanford.edu/~jsteinhardt/ResearchasaStochasticDecisionProcess.html">Research as a Stochastic Decision Process</a>.


In [47]:
import ipywidgets
import ipysheet
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from itertools import permutations
from IPython.display import Markdown, display, clear_output, FileLink
from symbulate import RV, Exponential
# Notebook-wide matplotlib defaults: axes title font size and plotted line width.
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['lines.linewidth'] = 2.0



def printmd(string):
    '''Displays string as rendered Markdown in the
    notebook output'''
    rendered = Markdown(string)
    display(rendered)

def df_from_sheet(sheet):
    '''Reads the sheet widget into a dataframe and passes it
    through every clean-up step, in order'''
    cleanup_steps = (
        fill_blanks,        # NaN / "" / "0" -> 0
        trim_df,            # cut off the all-zero rows and cols
        switch_from_names,  # dependency names -> row numbers
        nums_to_float,      # every col after the first -> float
        switch_to_names,    # row numbers -> dependency names
    )
    frame = ipysheet.to_dataframe(sheet)
    for step in cleanup_steps:
        frame = step(frame)
    return frame

def switch_to_names(df):
    '''Replaces the row numbers stored in the dependency cols
    (fourth col onward) with the matching task names.
    Modifies df in place and returns it.'''
    number_to_name = row_to_name_dict(df)
    dependency_cols = df.iloc[:, 3:]
    df.iloc[:, 3:] = dependency_cols.replace(number_to_name)
    return df

def switch_from_names(df):
    '''Replaces the task names stored in the dependency cols
    (fourth col onward) with the matching 1-based row numbers.
    Modifies df in place and returns it.'''
    name_to_number = name_to_row_dict(df)
    dependency_cols = df.iloc[:, 3:]
    df.iloc[:, 3:] = dependency_cols.replace(name_to_number)
    return df

def name_to_row_dict(df):
    '''Maps each entry of the 'Task' col to its row
    number (index label + 1)'''
    task_names = df['Task']
    row_numbers = df.index + 1
    return dict(zip(task_names, row_numbers))

def row_to_name_dict(df):
    '''Maps each row number (index label + 1) to the
    entry of the 'Task' col on that row'''
    row_numbers = df.index + 1
    task_names = df['Task']
    return dict(zip(row_numbers, task_names))

def fill_blanks(df):
    '''Turns every missing value, empty string and "0" string
    into the number 0. Modifies df in place and returns it.'''
    blank_markers = ["", "0"]
    df.fillna(value=0, inplace=True)
    df.replace(to_replace=blank_markers, value=0, inplace=True)
    return df

def trim_df(df):
    '''Cuts the dataframe off at its first all-zero row and its
    first all-zero column, keeping the block above / left of them.

    When there is no all-zero row (or column) the full extent along
    that axis is kept. The limits are used as exclusive slice bounds,
    so the full length must be used, not length - 1, otherwise the
    last row or column of a completely filled sheet is lost.'''
    is_zero = df.values == 0
    limits = []
    # axis=1 collapses the columns and flags all-zero rows;
    # axis=0 collapses the rows and flags all-zero columns.
    for axis, full_length in zip((1, 0), df.shape):
        zero_locs = np.flatnonzero(is_zero.all(axis=axis))
        limits.append(zero_locs[0] if zero_locs.size else full_length)
    row_stop, col_stop = limits
    return df.iloc[:row_stop, :col_stop]

def nums_to_float(df):
    '''Casts every column after the first one to float.
    Modifies df in place and returns it.'''
    numeric_part = df.iloc[:, 1:]
    df.iloc[:, 1:] = numeric_part.astype(float)
    return df

def build_sheet(task_report="task_sheet.csv",
               rows=10, cols=10):
    '''Creates the task dependency sheet.

    The headers are 'Task', 'Time', 'Probability' followed by
    D1, D2, ... for the remaining columns. The sheet is filled
    from task_report when sheet_from_df can provide one;
    otherwise an empty sheet is built.'''
    dependency_labels = ["D" + str(n) for n in range(1, cols - 2)]
    labels = ['Task', 'Time', 'Probability'] + dependency_labels
    loaded = sheet_from_df(task_report, rows, cols, labels)
    return loaded or sheet_from_scratch(rows, cols, labels)

def sheet_from_df(task_report, rows,
                  cols, labels):
    '''Builds a rows x cols sheet pre-filled with the contents
    of the csv file task_report and returns it. Returns None
    when reading the file raises OSError (e.g. the file is not
    in the working directory).'''
    try:
        sheet = ipysheet.sheet(rows=rows, columns=cols,
                               column_headers=labels)
        saved = rid_zeros(pd.read_csv(task_report))
        n_rows, n_cols = saved.shape
        # Cells outside the saved block stay None (empty object array)
        grid = np.empty((rows, cols), dtype=object)
        grid[:n_rows, :n_cols] = saved.values
        ipysheet.cell_range(grid)
    except OSError:
        return None
    return sheet

def rid_zeros(df):
    '''Replaces the '0.0' strings in the dependency cols
    (fourth col onward) with None. Modifies df in place
    and returns it.'''
    placeholder_map = {'0.0' : None}
    dependency_cols = df.iloc[:, 3:]
    df.iloc[:, 3:] = dependency_cols.replace(placeholder_map)
    return df
    
    
def sheet_from_scratch(rows, cols, labels):
    '''Builds a rows x cols sheet in which every cell holds
    an empty string'''
    sheet = ipysheet.sheet(rows=rows, columns=cols,
                           column_headers=labels)
    blank_grid = [[''] * cols for _ in range(rows)]
    ipysheet.cell_range(blank_grid)
    return sheet

def gen_perms(df):
    '''Returns an iterator over every ordering of the row
    numbers of the dataframe (numbering starts at 1)'''
    task_count = df.shape[0]
    row_numbers = range(1, task_count + 1)
    return permutations(row_numbers)

def validate_perms(perms, deps):
    '''Returns a list with the orderings from perms that
    check_perm accepts, i.e. that violate no dependency'''
    return [order for order in perms if check_perm(order, deps)]

def check_perm(perm, deps):
    '''Returns True when every task in perm appears no earlier
    than all the tasks listed on its row of deps, else False.
    A dependency entry of 0 means "no dependency".'''
    seen = [0]  # 0 is always satisfied
    for task in perm:
        # The task itself counts as seen before its row is checked
        seen.append(task)
        for needed in deps[task - 1, :]:
            if needed not in seen:
                return False
    return True

def sort_rates(times, probs):
    '''Returns the 1-based task positions ordered from the
    highest failure rate (called lambda in the appendix)
    to the lowest'''
    rates = 1/times*np.log(1/(1-probs))
    ascending = np.argsort(rates)
    return np.flip(ascending) + 1
        
def get_expected_times(perms, df):
    '''Returns a list with the expected project time of every
    task order in perms, computed with the equation in
    appendix A'''
    expected = []
    for order in perms:
        rows = np.array(order) - 1  # 1-based row numbers -> 0-based labels
        prob_col = df['Probability'][rows].values.astype(float)
        time_col = df['Time'][rows].values.astype(float)
        prob_comp = 1 - prob_col
        # P(L = i) for every position i in this order
        prod_arr = prob_prod_arr(expand_to_lower_tri_mat(prob_comp),
                                 prob_col)
        # Time spent on the tasks before position i
        sum_arr = sum_times(expand_to_lower_tri_mat(time_col))
        expected.append(expected_time_eq(prod_arr, sum_arr, time_col,
                                         prob_col, prob_comp))
    return expected


def expected_time_eq(prod_arr, sum_arr, time_col, prob_col,
                     prob_comp):
    '''Evaluates the final expression for E[T] in appendix A.

    prod_arr  -- P(L = i), probability of failing on each task
    sum_arr   -- total time of the tasks preceding each task
    time_col  -- time estimate of each task
    prob_col  -- failure probability of each task
    prob_comp -- 1 - prob_col'''
    # Success branch: P(L = 0) * E[T | L = 0]
    success_term = np.prod(prob_comp) * np.sum(time_col)
    # t_i / ln(1 / (1 - p_i)), i.e. 1 / lambda_i
    inverse_rate = time_col / np.log(1 / (1 - prob_col))
    # E[T | L = i] for every task i
    time_if_failed = (sum_arr - time_col / prob_col
                      + inverse_rate + time_col)
    failure_term = np.sum(prod_arr * time_if_failed)
    return success_term + failure_term



def expand_to_lower_tri_mat(arr):
    '''Stacks arr on top of itself len(arr) times and zeroes
    everything above the main diagonal.
    e.g. [1,2,3] => [[1,0,0]
                     [1,2,0]
                     [1,2,3]]'''
    size = arr.shape[0]
    stacked = np.broadcast_to(arr, (size,) + arr.shape)
    # np.tril returns a fresh array, so the result is safe to modify
    return np.tril(stacked)
    
def prob_prod_arr(prob_mat, prob_col):
    '''Returns, for every row i, the product
    (1 - p_1) ... (1 - p_{i-1}) * p_i used by the equation in
    the appendix. The diagonal of prob_mat is zeroed in place.
    e.g.
    prob_mat = [[0.8, 0.0, 0.0],    prob_col = [0.2, 0.3, 0.4]
                [0.8, 0.7, 0.0],
                [0.8, 0.7, 0.6]]
    => row products of [[0.2, 1.0, 1.0],
                        [0.8, 0.3, 1.0],
                        [0.8, 0.7, 0.4]]
    which equals [0.2  , 0.24 , 0.224]'''
    np.fill_diagonal(prob_mat, 0)
    # Ones strictly above the diagonal keep those entries neutral
    # in the row product
    above_diagonal = np.triu(np.ones(np.shape(prob_mat)), k=1)
    factors = above_diagonal + prob_mat + np.diag(prob_col)
    return np.prod(factors, axis=1)

def sum_times(time_mat):
    '''Zeroes the diagonal of time_mat in place and returns
    the sum of each of its rows'''
    np.fill_diagonal(time_mat, 0)
    row_totals = time_mat.sum(axis=1)
    return row_totals

def present_task_names(arr, df):
    '''Builds a one-column ("Task") DataFrame holding the task
    name of every row number in arr, in the same order'''
    lookup = row_to_name_dict(df)
    names = [lookup[row] for row in arr]
    return pd.DataFrame(names, columns=["Task"])

    
def plot_fig(sorted_sums, hist_vals):
    '''Shows two plots side by side: the values of sorted_sums
    as a dashed line (left) and a 30-bin histogram of
    hist_vals (right)'''
    fig, axs = plt.subplots(1, 2, figsize=(20, 8))
    curve_ax, hist_ax = axs

    # Left panel: one point per entry of sorted_sums
    positions = np.arange(len(sorted_sums))
    curve_ax.plot(positions, sorted_sums, color='black', ls='--')
    curve_ax.set_xticks([])
    curve_ax.set_ylabel('Time')
    curve_ax.set_title('Expected time with respect to \n order of tasks')

    # Right panel: histogram of hist_vals
    hist_ax.hist(hist_vals, bins=30)
    hist_ax.set_title('Approximate distribution of occurence of failure')
    hist_ax.set_xlabel("Time")
    hist_ax.set_yticks([])

    plt.show()

def simulate_failures(df, index):
    '''Simulates when failures occur if the tasks are attempted
    in the order given by index (0-based row positions) and
    returns the simulated failure times as one flat array.

    Starts with 50000 simulated projects. For each task an
    Exponential draw with that task's failure rate is made per
    remaining project; the draws kept by filter_leq for the task
    time (presumably those not exceeding it) are recorded as
    failures, offset by the time of the tasks before it, and
    only the other projects move on to the next task.'''
    prob = df['Probability'].values.astype(float)[index]
    time = df['Time'].values.astype(float)[index]
    lambdas = get_lambdas(time, prob)
    failure_times = []
    remaining = 50000
    elapsed = 0
    for rate, duration in zip(lambdas, time):
        trials = RV(Exponential(rate=rate)).sim(remaining)
        failed = trials.filter_leq(duration)
        failure_times.append(np.array(list(failed)) + elapsed)
        remaining = len(trials) - len(failed)
        elapsed += duration
    return np.hstack(failure_times)

def get_lambdas(time, prob):
    '''Returns the failure rate of each task,
    1/t * ln(1/(1 - p)) (lambda in the appendix)'''
    inverse_time = 1 / time
    return inverse_time * np.log(1 / (1 - prob))

def main(df):
    '''Finds the task order with the smallest expected time and
    displays the report.

    df is the cleaned task dataframe ('Task', 'Time',
    'Probability' and the D dependency cols). Every order that
    respects the dependencies is scored with the equation in
    appendix A; the best order, its expected time, the overall
    success probability and two plots are displayed.

    When no order satisfies the dependencies (e.g. two tasks
    depend on each other) a short message is displayed instead,
    because there is nothing to rank.'''
    df = switch_from_names(df)
    deps = df.filter(like='D').values
    probs = df['Probability'].values.astype(float)
    valid_perms = validate_perms(gen_perms(df), deps)
    if not valid_perms:
        # np.argmin would raise ValueError on an empty list
        printmd("## Report")
        printmd("##### No task order satisfies the dependencies. "
                "Check the D columns for circular or unknown "
                "dependencies.")
        return
    sums = get_expected_times(valid_perms, df)
    sorted_sums = np.sort(sums)
    best_order = valid_perms[np.argmin(sums)]
    best_df = present_task_names(best_order, df)
    # best_order already holds 1-based row numbers; convert them
    # straight to 0-based positions instead of mapping the names back
    # to rows, which breaks when task names repeat.
    best_index = np.array(best_order) - 1
    hist_vals = simulate_failures(df, best_index)
    printmd("## Report")
    printmd("##### The optimal order is ")
    display(best_df)
    printmd("##### The expected time you will work on this project is")
    printmd("##### " + str(sorted_sums[0]))
    printmd("##### and the probability you will succeed is")
    printmd("##### " + str(np.prod(1 - probs)) + ".")
    plot_fig(sorted_sums, hist_vals)

## Data Entry

How it works:
1. If you have previously used this app to build a sheet and you have the file saved, you can pick up where you left off by uploading it. Simply upload the file and click Build Sheet.
2. To start with a blank sheet, just click Build Sheet.
3. Fill the Task column with short names for each task, the Time column with your estimation for how long each task will take, and the Probability column with your estimation of the probability that each task will fail.
    1. Probabilities need to be numbers strictly greater than 0 and less than 1!
4. Let's say you put task A in row 1. You would then put the names of any tasks that you must complete before you can attempt task A in the D columns on row 1.
5. Click Run to get a full report!
6. Click the link at the bottom of the report to download your sheet.

In [13]:
# Upload widget limited to a single .csv file. The Build Sheet callback
# reads the uploaded content from myupload.value.
myupload = ipywidgets.FileUpload(accept='.csv', multiple=False)
display(myupload)

FileUpload(value={}, accept='.csv', description='Upload')

In [60]:
sheet_butt = ipywidgets.Button(description='Build Sheet')
# This cell needs its own uniquely named output area: the Run cell
# rebinds the global name `out`, and a callback that looked `out` up at
# click time would draw the sheet inside the report area.
sheet_out = ipywidgets.Output()
sheet = build_sheet()

def sheet_butt_func(_):
    '''Click handler for Build Sheet.

    Saves the uploaded file (if there is one) as task_sheet.csv,
    then rebuilds the global sheet and displays it. Without an
    upload the sheet is built from whatever build_sheet finds.'''
    global sheet
    with sheet_out:
        clear_output()
        try:
            uploaded_filename = next(iter(myupload.value))
            content = myupload.value[uploaded_filename]['content']
        except (StopIteration, KeyError, TypeError):
            # Nothing uploaded, or the value is not laid out as
            # {filename: {'content': ...}}: continue without a file
            content = None
        if content is not None:
            try:
                with open('task_sheet.csv', 'wb') as f:
                    f.write(content)
            except OSError as err:
                # Still build a sheet, but say why the upload was skipped
                print("Could not save the uploaded file:", err)
        sheet = build_sheet()
        display(sheet)
sheet_butt.on_click(sheet_butt_func)
display(ipywidgets.VBox([sheet_butt, sheet_out]))

VBox(children=(Button(description='Build Sheet', style=ButtonStyle()), Output()))

In [61]:
run = ipywidgets.Button(description='Run')
out = ipywidgets.Output()
def run_program(_):
    '''Click handler for Run: reads the sheet, saves it as
    task_sheet.csv, shows the report and then a download link
    for the saved file'''
    with out:
        df = df_from_sheet(sheet)
        df.to_csv('task_sheet.csv', index=False)
        local_file = FileLink(
            './task_sheet.csv',
            result_html_prefix="Click here to download the sheet: ")
        report = main(df)
        shown_link = display(local_file)
        return report, shown_link

run.on_click(run_program)
buttons = ipywidgets.HBox([run])
ipywidgets.VBox([buttons,out])

VBox(children=(HBox(children=(Button(description='Run', style=ButtonStyle()),)), Output()))

## Appendix A

We begin by letting $T$ be a random variable representing the time we spend on the project, from the beginning to the point where we either succeed or fail. Our goal is to minimize $\text{E}[T]$.

Let $L$ be a random variable indicating the task we fail on. We define $L$ as

$$
L = \begin{cases} 
     i & \text{we fail on task }i \\
     0 & \text{we succeed}. \\ 
   \end{cases}
$$

We can now invoke the law of total expectation.

$$
\text{E}[T] = \text{E}[\text{E}[T \vert L]] = \text{P}(L = 0)\text{E}[T \vert L = 0] + \text{P}(L = 1)\text{E}[T \vert L = 1] + \dots + \text{P}(L = n)\text{E}[T \vert L = n].
$$

The user supplies us with their estimates of the probability of failure on each task, and their estimate for the time each task will take. We denote these as $p_1, \dots p_n$ and $t_1, \dots t_n$. For our purposes these are constant known values. It follows that

$$
\text{P}(L = 0) = (1 - p_1) \dots (1 - p_n) = \prod_{i = 1}^n (1 - p_i) \text{ and}
$$
$$
\text{E}[T \vert L = 0] = t_1 + \dots t_n = \sum_{i = 1}^n t_i.
$$
Take $ i \in [n]$. It's easy to see that
$$
\text{P}(L = i) = (1 - p_1) \dots (1 - p_{i - 1}) p_i = \prod_{k = 1}^ {i - 1} (1 - p_k)p_i.
$$

It is not obvious what $\text{E}[T \vert L = i]$ should be, and that is because we lack enough information about the distribution of $T$. We have to make an assumption, and the one we choose is that the probability of failing during the next minute of doing a task is independent of how long we have been doing the task (see <a href="https://cs.stanford.edu/~jsteinhardt/ResearchasaStochasticDecisionProcess.html">Research as a Stochastic Decision Process</a>). This implies that $f_T$ is given by

$$
f_T(t) = \begin{cases}
            \lambda_1 e^{-\lambda_1 t} & 0 \leq t < t_1 \\
            (1 - p_1) \lambda_2 e^{-\lambda_2 (t - t_1)} & t_1 \leq t < t_1 + t_2 \\
            \vdots & \vdots \\
           (1 - p_{i - 1}) \dots (1 - p_1) \lambda_i e^{-\lambda_i( t -  t_1 - \dots - t_{i - 1})} & t_1 + \dots + t_{i - 1} \leq t < t_1 + \dots + t_{i} \\
            \vdots & \vdots \\
            (1 - p_{n - 1}) \dots (1 - p_{i}) \dots (1 - p_1) \lambda_n e^{-\lambda_n (t - t_1 - \dots -t_{n - 1})} & t_1 + \dots + t_{i} + \dots + t_{n - 1} \leq t < t_1  + \dots + t_{i} + \dots + t_{n},
       \end{cases}
$$

where $\lambda_{i}$ is the failure rate of task $i$ given by
$$
\lambda_{i} = \frac{1}{t_i}\ln \Big{(}\frac{1}{1 - p_{i}}\Big{)}.
$$

From this we have
$$
f_{T \vert \{L = i\}}(t) = \frac{f_T(t)}{\text{P}(L = i)} = \frac{(1 - p_{i - 1}) \dots (1 - p_1) \lambda_i e^{-\lambda_i (t -  t_1 - \dots - t_{i - 1})}}{(1 - p_{i - 1}) \dots (1 - p_1)p_i} = \frac{ \lambda_i e^{-\lambda_i (t -  t_1 - \dots - t_{i - 1})}}{p_i} \text{ when } t_1 + \dots + t_{i - 1} \leq t < t_1 + \dots + t_{i}.
$$

Let $t_0 = t_1 + \dots + t_{i - 1}$. By performing a change of variables $ t \rightarrow t - t_0$ we have
$$
\text{E}[T \vert L = i] = \frac{1}{p_i} \int_0^{t_i} (t + t_0) \lambda_i e^{-\lambda_i t} dt,
$$

which after some simplification yields

$$
\text{E}[T \vert L = i] = \frac{1 + \lambda_i t_0 - e^{- \lambda_i t_i } - \lambda_i e^{ -\lambda_i t_i } (t_0 + t_i)}{ \lambda_i p_i}.
$$

Substituting in our value for $\lambda_i$ and simplifying we have

$$
\text{E}[T \vert L = i] = t_0 - \frac{t_i}{p_i} + \frac{t_i}{\ln \big{(} \frac{1}{1 - p_i} \big{)} } + t_i.
$$

Combining this with what we found for $\text{P}(L = i)$ and recalling that $t_0 = \sum_{j = 1}^{i - 1} t_j$ we have

$$
\text{P}(L = i)\text{E}[T \vert L = i] =  \prod_{k = 1}^ {i - 1} (1 - p_k)p_i \Bigg(\sum_{j = 1}^{i - 1} t_j - \frac{t_i}{p_i} + \frac{t_i}{\ln \big{(} \frac{1}{1 - p_i} \big{)} } + t_i \Bigg).
$$

Thus, our final expression for $ \text{E}[T]$ is

$$
\text{E}[T] = \prod_{i = 1}^n (1 - p_i)\sum_{i = 1}^n t_i + \sum_{i = 1}^n \Bigg( \prod_{k = 1}^ {i - 1} (1 - p_k)p_i \Bigg(\sum_{j = 1}^{i - 1} t_j - \frac{t_i}{p_i} + \frac{t_i}{\ln \big{(} \frac{1}{1 - p_i} \big{)} } + t_i \Bigg) \Bigg).
$$