## 110-2 Operations Research Case 2

Algorithm 1: based on Latest Start Time 

## Loading data

In [15]:
import pandas as pd 
import numpy as np

In [16]:
# !git clone https://github.com/DanielStutzbach/heapdict

In [17]:
from heapdict import heapdict
# see https://github.com/DanielStutzbach/heapdict
# or directly !pip install HeapDict
Q = heapdict()

In [18]:
datadir = './data'
instances = []
for i in range(5):
    name = f'instance_{i+1}.csv'
    fullpath = datadir+'/'+name
    instances.append(pd.read_csv(fullpath))

In [386]:
df

Unnamed: 0,Job ID,Stage-1 Processing Time,Stage-2 Processing Time,Stage-1 Machines,Stage-2 Machines,Due Time
0,1,2.7,1.3,12345,2345.0,5
1,2,1.6,1.4,2345,12345.0,5
2,3,0.7,1.9,12345,2345.0,5
3,4,0.5,0.7,2345,12345.0,5
4,5,0.8,1.0,12345,2345.0,5
5,6,2.5,0.0,12345,,5
6,7,1.4,2.0,2345,2345.0,5
7,8,1.1,1.1,2345,2345.0,10
8,9,0.8,0.7,12345,2345.0,10
9,10,1.0,0.5,2345,12345.0,10


## Data Structures

In [372]:
# job structure 
class Job:
    '''structure for 1 job '''
    def __init__(self, row):
        '''input := df.iloc[idx, :]'''
        self.id = row['Job ID']
        self.due = row['Due Time']
        self.next_op = 0 # True as complete, False as not yet processed
        self.stage_pt = [row['Stage-1 Processing Time'], row['Stage-2 Processing Time']]
        mfor1 = list(map(int, row['Stage-1 Machines'].split(',')))
        if row['Stage-2 Machines'] is not np.nan:
            mfor2 = list(map(int, row['Stage-2 Machines'].split(',')))
        else: mfor2 = [] 
        self.stage_mach = [mfor1, mfor2]
        self.assign_mach = [None for _ in range(2)]
        self.start_time = [-1 for _ in range(2)]
        self.end_time = [-1 for _ in range(2)]
    
    def __repr__(self):
        return f'\
          * Job id: {self.id}\n\
          * Due time:{self.due}\n\
          stage 1: {self.assign_mach[0]}\n\
                   {self.stage_pt[0]}, {self.stage_mach[0]}\n\
          stage 2: {self.assign_mach[1]}\n\
                   {self.stage_pt[1]}, {self.stage_mach[1]}'
    __str__ = __repr__
        

In [470]:
class Jobs:
    '''structure for multiple jobs' management'''
    def __init__(self, n):
        self.completion_times = np.zeros(n)
        self.tardiness = np.zeros(n)
        self.is_completed = np.full(n, False)
        self.residual_times = np.zeros(n)
        self.jobs = []
        
        
    def get_RRDD(self):
        if getattr(self, 'RRDD', None) is None:
            self.RRDD = self.due_dates - np.min(self.due_dates)
        return self.RRDD # static

    def get_LST(self):
        '''latest start time'''
        self.LST = self.due_dates - self.residual_times
        return self.LST
    
    def add_jobs(self, data):
        self.due_dates = data['Due Time'].to_numpy()
        for i in range(len(data)):
            row = data.iloc[i, :]
            jobi = Job(row)
            self.residual_times[i] = sum(jobi.stage_pt)
            self.jobs.append(jobi)
            
    
    def assign(self, job_name, mach, st):
        '''job_name = (2, 0) means job 3 and op 1
        note that job and op is 0-indexed as well as machines
        op
        '''
        
        i = 0 
        jobidx, op = job_name 
        job = self.jobs[jobidx]
        J.completion_times[jobidx] = st + job.stage_pt[op]
        J.residual_times[jobidx] -= job.stage_pt[op]
        job.assign_mach[op] = mach
        job.start_time[op] = st
        job.end_time[op] = J.completion_times[jobidx]
        job.next_op = op+1
        if op == 1:
            self.is_completed[jobidx] = True

In [471]:
class Machines:
    def __init__(self, df):
        '''pass the stage1, stage2 machine lists'''
        mfor1 = df['Stage-1 Machines'].values.tolist()
        mfor2 = df['Stage-2 Machines'].values.tolist()
        mfor1 = [list(map(int, x.split(','))) for x in mfor1]
        mfor2 = [list(map(int, x.split(','))) for x in mfor2 if x is not np.nan]
        mfor1 = sum(mfor1, [])
        mfor2 = sum(mfor2, [])
        self.number = max(max(mfor1), max(mfor2))
        self.versatile = [mfor1.count(i+1) + mfor2.count(i+1) for i in range(self.number)]
        self.schedule = [[] for _ in range(self.number)]
        self.span = [[] for _ in range(self.number)]
        self.fintime = [0 for _ in range(self.number)]
        
    def _schedule(self, mach, job_name, st, proc_time):
        '''mach is 0-indexed'''
        display_name = tuple([x+1 for x in job_name])
        self.schedule[mach].append(f'{display_name} {proc_time:.2f}') 
        self.span[mach].append(proc_time)
        self.fintime[mach] = st + proc_time
    
    def add_idle(self, mach, idle_time):
        self.schedule[mach].append(f'idle {idle_time:.2f}') 
        self.span[mach].append(idle_time)
        self.fintime[mach] += idle_time
        

## Which instance to test?

In [496]:
INSTANCEIDX = 0
df = instances[INSTANCEIDX]

## Preprocessing 
Usage:
1. Read the dataframe into `Machines()` as `M`.
2. Init by giving the length of jobs to `Jobs()` as `J`.
    Initialize it by calling `add_jobs()`.
3. Call our heuristic algorithm. 
4. Get the result from `M, J`. No need to return them. 

In [497]:
M = Machines(df)
J = Jobs(len(df))
J.add_jobs(df)
# print(J.residual_times)
# print(J.get_LST())

[4.  3.  2.6 1.2 1.8 2.5 3.4 2.2 1.5 1.5 4.4 2. ]
[1.  2.  2.4 3.8 3.2 2.5 1.6 7.8 8.5 8.5 5.6 8. ]


## Algorithm

#### Warnings:

1. Re-run the code from **preprcoessing section** otherwise the data stuctures will keep accumulating repetitive datas. 

2. All the indexing is 0-indexed for coding convenience, but when storing back to `M.schedule` for displaying purpose, it is changed into 1-indexed. 


#### Steps:

1. ```LST = self.due_dates - self.residual_times ```
    `self.due_dates[j]` is the deadline for job ${j}$.
    `residual_times[j]` is the remaining processing times for job ${j}$.
    `LST[j]` is very similar to the idea of slackness/remaining time for you to procrastinate. 
    The less is LST, the sooner this job is to be scheduled (otherwise it's too late for it to catch up the deadline).
    
    *Note*: there's one little thing that can be optimized: if a job is definitely tardy now, consider associating it with a very big `LST[j]`
    so that it will not block other jobs for catching up their deadlines. 
    However, as I code this feature and test, it makes no difference in terms of our 2 objectives


2. Jobs are put into a priority queue ${Q}$, which are ordered by their associated **LST**.


3. ```Extract_min()``` from ${Q}$ as `curr_job`, and get its corresponding operation.
    That is, if curr_job's first operation is not done yet, do first op;
    otherwise do second op. This value of which op to execute is stored in `job.next_op` attribute.

4. Calculate the best machine to schedule `curr_job`'s `curr_op`.
    The idea is to get its associated subset of machines, 
    and then order them by (1) their current finished times (for the sake of makespan), and then 
    (2) their `versatility`. Versatility is a static vector, `M.versatile[m]` specifies **the number of operations machine ${m}$ is capable of executing.** 
    To sum up, the best machine `curr_machine` is computed by: 
    ```curr_machine = min(avail_machines_idx, key = lambda m: (M.fintime[m], M.versatile[m], m))```
   Then put ```curr_job``` onto this machine.


5. Update values, especially update the completion_times, residual_times and **LST**.
   For debugging purpose, I add time of idleness into M.schedule so that it is easier to read the schedule, calculate times and check feasibility. 


6. Check if all job operations are scheduled, if yes, stop the algorithm, if no, continue the iteration (go back to step 3).


#### Result:

1. `M.fintime` is the finishing time of machines, you can get the makespan by `max(M.fintime)`.
2. `M.schedule` is the schedule of machines, similar to gantt chart. 
3. `J.completion_times` is the completion times for all jobs. Comparing it with `J.due_dates` using
   ```tardies = list(np.where(J.completion_times > J.due_dates)[0])```
   gives you the tardy jobs (it's 0-indexed!!!). Turn it to 1-indexed by 
   ```[x+1 for x in tardies]```. 
   

In [498]:
# Priority Queue structure 
from heapq import heappush, heappop, heapify
Jobs_keys = [(key, job_index) for job_index, key in enumerate(J.get_LST())]
Q = Jobs_keys[:]
heapify(Q)  # sort first by slack, then by index, so it is breaking ties with smallest index 
Q[0]
# curr_job = heappop(Q)
# curr_job

(1.0, 0)

## Algorithm

In [574]:
# while not all operations in all jobs are scheduled

def heuristic(J, M): 
    # step 1. call get_LST()
    Q = [(lst, job_index) for job_index, lst in enumerate(J.get_LST())]
    # step 2. make priority queue 
    heapify(Q)
    # step 6. check if all jobs are completed 
    while not np.all(J.is_completed):
        # step 3. extract_min() to get the job with minimal LST and its other attributes
        _, curr_job_index = heappop(Q)
        curr_job = J.jobs[curr_job_index]
        curr_op = curr_job.next_op
        op_proc_time = curr_job.stage_pt[curr_op]
        job_name = (curr_job_index, curr_op)
         
        avail_machines_idx = [x-1 for x in curr_job.stage_mach[curr_op]]
        # if curr_job has no second operation 
        if op_proc_time <= 0:
            # update job 
            J.assign(job_name = job_name, 
                    mach = -1,
                    st = curr_job.end_time[curr_op-1]) 
            # note that it's only possible for second operation to have proc time = 0
            # so this doesn't trigger index error
            continue 
        # step 4. calculate the best machine
        # find the machine that has the smallest finished time, and then smallest versatility 
        curr_machine = min(avail_machines_idx, key = lambda x: (M.fintime[x], M.versatile[x], x))
        
        # step 5. update values
        if M.fintime[curr_machine] < J.completion_times[curr_job_index]:
            M.add_idle( 
                mach = curr_machine, 
                idle_time = J.completion_times[curr_job_index] - M.fintime[curr_machine])
        
        J.assign(job_name = job_name, 
                mach = curr_machine, 
                 st = M.fintime[curr_machine]
                )
        # schedule the operation  
        M._schedule(job_name = job_name, 
                   mach = curr_machine, 
                    proc_time = op_proc_time,
                   st = M.fintime[curr_machine])
        
        # update the key variable 
        curr_new_value = J.get_LST()[curr_job_index]
        if J.completion_times[curr_job_index] > J.due_dates[curr_job_index]:
            print(f'[INFO] Job {curr_job_index+1} is already tardy, postpone it.')
            curr_new_value = float('inf')
        # update the LST value and push it back to Q if the job has its second operation that hasn't been done
        if curr_op == 0:
            heappush(Q, (curr_new_value, curr_job_index))
            # it maintains the heap invariant, no need to heapify

In [575]:
heuristic(J, M)
J.is_completed

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [576]:
M.span

[[6.4, 3.1, 0.5, 5.5, 0.2, 0.9, 2.6],
 [6.8, 6.6, 6.5, 0.4],
 [9.6, 6.2],
 [4.8, 3.5, 4.3, 4.9],
 [9.6, 5.0, 4.8],
 [7.6, 1.0, 3.5, 6.9],
 [9.0, 8.7, 5.0],
 [5.3, 1.7, 0.5, 5.8, 6.6],
 [8.9, 1.2, 5.4, 2.6, 0.6, 0.6999999999999957, 0.6]]

In [577]:
J.completion_times

array([22.7,  9.6, 10.1,  8.9,  8.6, 14.6, 15.7, 16.6, 20.3, 19.2, 18.1,
       13.3, 12.6, 17.7, 20. , 18.7, 19. ,  7. , 10. , 13.4])

In [578]:
np.where(J.completion_times > J.due_dates)

(array([ 0,  2,  5,  6,  7, 10, 12, 14, 15, 16]),)

In [579]:
Tardy_jobs = list(np.where(J.completion_times > J.due_dates)[0])
Makespan = max(M.fintime)
print(f'Instance {i+1}:')
print('First objective (# tardy):', len(Tardy_jobs), Tardy_jobs)
print('Second objective (makespan):', Makespan)

Instance 12:
First objective (# tardy): 10 [0, 2, 5, 6, 7, 10, 12, 14, 15, 16]
Second objective (makespan): 22.7


In [580]:
M.fintime

[19.2, 20.299999999999997, 15.8, 17.5, 19.4, 19.0, 22.7, 19.9, 20.0]

In [581]:
# 類似Gantt Chart
# () is 1-indexed job name, the attached value is its associated proc time
# idle is the gap appears to preserve precedence between the operations within the same job 
print(*M.schedule, sep = '\n')

['(5, 1) 6.40', '(19, 1) 3.10', '(19, 2) 0.50', '(8, 1) 5.50', '(7, 2) 0.20', '(8, 2) 0.90', '(10, 2) 2.60']
['(20, 1) 6.80', '(20, 2) 6.60', 'idle 6.50', '(9, 2) 0.40']
['(2, 1) 9.60', '(16, 1) 6.20']
['(3, 1) 4.80', '(6, 1) 3.50', '(13, 2) 4.30', '(1, 1) 4.90']
['(7, 1) 9.60', '(6, 2) 5.00', '(15, 1) 4.80']
['(13, 1) 7.60', '(5, 2) 1.00', '(11, 1) 3.50', '(17, 2) 6.90']
['(17, 1) 9.00', '(14, 1) 8.70', '(1, 2) 5.00']
['(18, 1) 5.30', '(18, 2) 1.70', '(12, 1) 0.50', '(12, 2) 5.80', '(9, 1) 6.60']
['(4, 1) 8.90', '(3, 2) 1.20', '(10, 1) 5.40', '(11, 2) 2.60', '(16, 2) 0.60', 'idle 0.70', '(15, 2) 0.60']


## Testing all instances

In [586]:
Results = []
for instidx in range(5):
    data = instances[instidx]
    M = Machines(data)
    J = Jobs(len(data))
    J.add_jobs(data)
    heuristic(J = J, M = M)
    print(f'[INFO] Summary \ninstance {instidx}:')
    Tardy_jobs = list(np.where(J.completion_times > J.due_dates)[0])
    Tardy_jobs = [x+1 for x in Tardy_jobs]
    Makespan = max(M.fintime)
    print(f'Number of jobs: {len(data)}')
    print('First objective (# tardy):', len(Tardy_jobs), Tardy_jobs)
    print('Second objective (makespan):', Makespan)
    print('==================================')
    Results.append({'J':J, 'M':M})

[INFO] Summary 
instance 0:
Number of jobs: 12
First objective (# tardy): 0 []
Second objective (makespan): 8.2
[INFO] Job 7 is already tardy, postpone it.
[INFO] Summary 
instance 1:
Number of jobs: 11
First objective (# tardy): 1 [7]
Second objective (makespan): 8.7
[INFO] Job 3 is already tardy, postpone it.
[INFO] Job 6 is already tardy, postpone it.
[INFO] Job 5 is already tardy, postpone it.
[INFO] Job 4 is already tardy, postpone it.
[INFO] Summary 
instance 2:
Number of jobs: 10
First objective (# tardy): 4 [3, 4, 5, 6]
Second objective (makespan): 9.0
[INFO] Job 11 is already tardy, postpone it.
[INFO] Job 15 is already tardy, postpone it.
[INFO] Job 2 is already tardy, postpone it.
[INFO] Job 10 is already tardy, postpone it.
[INFO] Job 6 is already tardy, postpone it.
[INFO] Job 3 is already tardy, postpone it.
[INFO] Job 12 is already tardy, postpone it.
[INFO] Job 4 is already tardy, postpone it.
[INFO] Job 14 is already tardy, postpone it.
[INFO] Job 7 is already tardy, p

In [583]:
print(*Results[4]['M'].schedule, sep = '\n')

['(5, 1) 6.40', '(19, 1) 3.10', '(19, 2) 0.50', '(8, 1) 5.50', '(7, 2) 0.20', '(8, 2) 0.90', '(10, 2) 2.60']
['(20, 1) 6.80', '(20, 2) 6.60', 'idle 6.50', '(9, 2) 0.40']
['(2, 1) 9.60', '(16, 1) 6.20']
['(3, 1) 4.80', '(6, 1) 3.50', '(13, 2) 4.30', '(1, 1) 4.90']
['(7, 1) 9.60', '(6, 2) 5.00', '(15, 1) 4.80']
['(13, 1) 7.60', '(5, 2) 1.00', '(11, 1) 3.50', '(17, 2) 6.90']
['(17, 1) 9.00', '(14, 1) 8.70', '(1, 2) 5.00']
['(18, 1) 5.30', '(18, 2) 1.70', '(12, 1) 0.50', '(12, 2) 5.80', '(9, 1) 6.60']
['(4, 1) 8.90', '(3, 2) 1.20', '(10, 1) 5.40', '(11, 2) 2.60', '(16, 2) 0.60', 'idle 0.70', '(15, 2) 0.60']


In [584]:
print(*Results[4]['J'].due_dates, sep = ', ')
print('====')
print(*Results[4]['J'].completion_times, sep = ', ')

21.9, 10.1, 8.9, 10.9, 9.0, 14.4, 14.5, 16.1, 20.4, 19.2, 14.9, 15.2, 12.0, 17.7, 19.0, 18.1, 18.4, 7.7, 10.0, 13.4
====
22.7, 9.6, 10.1, 8.9, 8.6, 14.6, 15.7, 16.599999999999998, 20.299999999999997, 19.2, 18.1, 13.3, 12.600000000000001, 17.7, 20.0, 18.700000000000003, 19.0, 7.0, 10.0, 13.399999999999999
