diff --git a/src/supervisor2.erl b/src/supervisor2.erl index 0044eda719..7b9421eb3e 100644 --- a/src/supervisor2.erl +++ b/src/supervisor2.erl @@ -1,14 +1,18 @@ -%% This file is a copy of supervisor.erl from the R13B-3 Erlang/OTP +%% This file is a copy of supervisor.erl from the R16B Erlang/OTP %% distribution, with the following modifications: %% %% 1) the module name is supervisor2 %% -%% 2) there is a new strategy called -%% simple_one_for_one_terminate. This is exactly the same as for -%% simple_one_for_one, except that children *are* explicitly -%% terminated as per the shutdown component of the child_spec. +%% 2) a find_child/2 utility function has been added %% -%% 3) child specifications can contain, as the restart type, a tuple +%% 3) Added an 'intrinsic' restart type. Like the transient type, this +%% type means the child should only be restarted if the child exits +%% abnormally. Unlike the transient type, if the child exits +%% normally, the supervisor itself also exits normally. If the +%% child is a supervisor and it exits normally (i.e. with reason of +%% 'shutdown') then the child's parent also exits normally. +%% +%% 4) child specifications can contain, as the restart type, a tuple %% {permanent, Delay} | {transient, Delay} | {intrinsic, Delay} %% where Delay >= 0 (see point (4) below for intrinsic). The delay, %% in seconds, indicates what should happen if a child, upon being @@ -19,17 +23,9 @@ %% stopping the supervisor, the supervisor instead continues and %% tries to start up the child again, Delay seconds later. %% -%% Note that you can never restart more frequently than the MaxT -%% and MaxR parameters allow: i.e. you must wait until *both* the -%% Delay has passed *and* the MaxT and MaxR parameters allow the -%% child to be restarted. -%% -%% Also note that the Delay is a *minimum*. There is no guarantee -%% that the child will be restarted within that time, especially if -%% other processes are dying and being restarted at the same time - -%% essentially we have to wait for the delay to have passed and for -%% the MaxT and MaxR parameters to permit the child to be -%% restarted. This may require waiting for longer than Delay. +%% Note that if a child is delay-restarted this will reset the +%% count of restarts towrds MaxR and MaxT. This matters if MaxT > +%% Delay, since otherwise we would fail to restart after the delay. %% %% Sometimes, you may wish for a transient or intrinsic child to %% exit abnormally so that it gets restarted, but still log @@ -41,21 +37,14 @@ %% perspective it's a normal exit, whilst from supervisor's %% perspective, it's an abnormal exit. %% -%% 4) Added an 'intrinsic' restart type. Like the transient type, this -%% type means the child should only be restarted if the child exits -%% abnormally. Unlike the transient type, if the child exits -%% normally, the supervisor itself also exits normally. If the -%% child is a supervisor and it exits normally (i.e. with reason of -%% 'shutdown') then the child's parent also exits normally. -%% %% 5) normal, and {shutdown, _} exit reasons are all treated the same %% (i.e. are regarded as normal exits) %% -%% All modifications are (C) 2010-2012 VMware, Inc. +%% All modifications are (C) 2010-2013 GoPivotal, Inc. %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% Copyright Ericsson AB 1996-2012. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -75,61 +64,34 @@ -behaviour(gen_server). %% External exports --export([start_link/2,start_link/3, +-export([start_link/2, start_link/3, start_child/2, restart_child/2, delete_child/2, terminate_child/2, - which_children/1, find_child/2, - check_childspecs/1]). + which_children/1, count_children/1, + find_child/2, check_childspecs/1]). %% Internal exports --export([init/1, handle_call/3, handle_info/2, terminate/2, code_change/3]). --export([handle_cast/2]). - --define(DICT, dict). - --record(state, {name, - strategy, - children = [], - dynamics = ?DICT:new(), - intensity, - period, - restarts = [], - module, - args}). - --record(child, {pid = undefined, % pid is undefined when child is not running - name, - mfa, - restart_type, - shutdown, - child_type, - modules = []}). - --define(is_simple(State), State#state.strategy =:= simple_one_for_one orelse - State#state.strategy =:= simple_one_for_one_terminate). --define(is_terminate_simple(State), - State#state.strategy =:= simple_one_for_one_terminate). - --ifdef(use_specs). +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). +-export([try_again_restart/3]). %%-------------------------------------------------------------------------- -%% Types -%%-------------------------------------------------------------------------- - +-ifdef(use_specs). -export_type([child_spec/0, startchild_ret/0, strategy/0, sup_name/0]). +-endif. +%%-------------------------------------------------------------------------- --type child() :: 'undefined' | pid(). +-ifdef(use_specs). +-type child() :: 'undefined' | pid(). -type child_id() :: term(). --type mfargs() :: {M :: module(), F :: atom(), A :: [term()] | undefined}. --type modules() :: [module()] | 'dynamic'. --type delay() :: non_neg_integer(). --type restart() :: 'permanent' | 'transient' | 'temporary' | 'intrinsic' - | {'permanent', delay()} | {'transient', delay()} - | {'intrinsic', delay()}. +-type mfargs() :: {M :: module(), F :: atom(), A :: [term()] | undefined}. +-type modules() :: [module()] | 'dynamic'. +-type delay() :: non_neg_integer(). +-type restart() :: 'permanent' | 'transient' | 'temporary' | 'intrinsic' | {'permanent', delay()} | {'transient', delay()} | {'intrinsic', delay()}. -type shutdown() :: 'brutal_kill' | timeout(). --type worker() :: 'worker' | 'supervisor'. +-type worker() :: 'worker' | 'supervisor'. -type sup_name() :: {'local', Name :: atom()} | {'global', Name :: atom()}. --type sup_ref() :: (Name :: atom()) +-type sup_ref() :: (Name :: atom()) | {Name :: atom(), Node :: node()} | {'global', Name :: atom()} | pid(). @@ -140,103 +102,75 @@ Type :: worker(), Modules :: modules()}. - -type strategy() :: 'one_for_all' | 'one_for_one' - | 'rest_for_one' | 'simple_one_for_one' - | 'simple_one_for_one_terminate'. - --type child_rec() :: #child{pid :: child() | {restarting,pid()} | [pid()], - name :: child_id(), - mfa :: mfargs(), - restart_type :: restart(), - shutdown :: shutdown(), - child_type :: worker(), - modules :: modules()}. - --type state() :: #state{strategy :: strategy(), - children :: [child_rec()], - dynamics :: ?DICT(), - intensity :: non_neg_integer(), - period :: pos_integer()}. - -%%-------------------------------------------------------------------------- -%% Callback behaviour -%%-------------------------------------------------------------------------- - --callback init(Args :: term()) -> - {ok, {{RestartStrategy :: strategy(), - MaxR :: non_neg_integer(), - MaxT :: non_neg_integer()}, - [ChildSpec :: child_spec()]}} - | ignore. + | 'rest_for_one' | 'simple_one_for_one'. +-endif. %%-------------------------------------------------------------------------- -%% Specs -%%-------------------------------------------------------------------------- - --type startchild_err() :: 'already_present' - | {'already_started', Child :: child()} | term(). --type startchild_ret() :: {'ok', Child :: child()} - | {'ok', Child :: child(), Info :: term()} - | {'error', startchild_err()}. - --spec start_child(SupRef, ChildSpec) -> startchild_ret() when - SupRef :: sup_ref(), - ChildSpec :: child_spec() | (List :: [term()]). - --spec restart_child(SupRef, Id) -> Result when - SupRef :: sup_ref(), - Id :: child_id(), - Result :: {'ok', Child :: child()} - | {'ok', Child :: child(), Info :: term()} - | {'error', Error}, - Error :: 'running' | 'not_found' | 'simple_one_for_one' | term(). - --spec delete_child(SupRef, Id) -> Result when - SupRef :: sup_ref(), - Id :: child_id(), - Result :: 'ok' | {'error', Error}, - Error :: 'running' | 'not_found' | 'simple_one_for_one'. - --spec terminate_child(SupRef, Id) -> Result when - SupRef :: sup_ref(), - Id :: pid() | child_id(), - Result :: 'ok' | {'error', Error}, - Error :: 'not_found' | 'simple_one_for_one'. - --spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when - SupRef :: sup_ref(), - Id :: child_id() | 'undefined', - Child :: child(), - Type :: worker(), - Modules :: modules(). - --spec check_childspecs(ChildSpecs) -> Result when - ChildSpecs :: [child_spec()], - Result :: 'ok' | {'error', Error :: term()}. --type init_sup_name() :: sup_name() | 'self'. - --type stop_rsn() :: 'shutdown' | {'bad_return', {module(),'init', term()}} - | {'bad_start_spec', term()} | {'start_spec', term()} - | {'supervisor_data', term()}. - --spec init({init_sup_name(), module(), [term()]}) -> - {'ok', state()} | 'ignore' | {'stop', stop_rsn()}. - --type call() :: 'which_children'. --spec handle_call(call(), term(), state()) -> {'reply', term(), state()}. - --spec handle_cast('null', state()) -> {'noreply', state()}. +-ifdef(use_specs). +-record(child, {% pid is undefined when child is not running + pid = undefined :: child() | {restarting,pid()} | [pid()], + name :: child_id(), + mfargs :: mfargs(), + restart_type :: restart(), + shutdown :: shutdown(), + child_type :: worker(), + modules = [] :: modules()}). +-type child_rec() :: #child{}. +-else. +-record(child, { + pid = undefined, + name, + mfargs, + restart_type, + shutdown, + child_type, + modules = []}). +-endif. --spec handle_info(term(), state()) -> - {'noreply', state()} | {'stop', 'shutdown', state()}. +-define(DICT, dict). +-define(SETS, sets). +-define(SET, set). --spec terminate(term(), state()) -> 'ok'. +-ifdef(use_specs). +-record(state, {name, + strategy :: strategy(), + children = [] :: [child_rec()], + dynamics :: ?DICT:?DICT() | ?SETS:?SET(), + intensity :: non_neg_integer(), + period :: pos_integer(), + restarts = [], + module, + args}). +-type state() :: #state{}. +-else. +-record(state, {name, + strategy, + children = [], + dynamics, + intensity, + period, + restarts = [], + module, + args}). +-endif. --spec code_change(term(), state(), term()) -> - {'ok', state()} | {'error', term()}. +-define(is_simple(State), State#state.strategy =:= simple_one_for_one). +-define(is_permanent(R), ((R =:= permanent) orelse + (is_tuple(R) andalso + tuple_size(R) == 2 andalso + element(1, R) =:= permanent))). +-define(is_explicit_restart(R), + R == {shutdown, restart}). +-ifdef(use_specs). +-callback init(Args :: term()) -> + {ok, {{RestartStrategy :: strategy(), + MaxR :: non_neg_integer(), + MaxT :: non_neg_integer()}, + [ChildSpec :: child_spec()]}} + | ignore. -else. -export([behaviour_info/1]). @@ -247,27 +181,73 @@ behaviour_info(_Other) -> undefined. -endif. +-define(restarting(_Pid_), {restarting,_Pid_}). %%% --------------------------------------------------- %%% This is a general process supervisor built upon gen_server.erl. %%% Servers/processes should/could also be built using gen_server.erl. %%% SupName = {local, atom()} | {global, atom()}. %%% --------------------------------------------------- +-ifdef(use_specs). +-type startlink_err() :: {'already_started', pid()} + | {'shutdown', term()} + | term(). +-type startlink_ret() :: {'ok', pid()} | 'ignore' | {'error', startlink_err()}. + +-spec start_link(Module, Args) -> startlink_ret() when + Module :: module(), + Args :: term(). + +-endif. start_link(Mod, Args) -> gen_server:start_link(?MODULE, {self, Mod, Args}, []). +-ifdef(use_specs). +-spec start_link(SupName, Module, Args) -> startlink_ret() when + SupName :: sup_name(), + Module :: module(), + Args :: term(). +-endif. start_link(SupName, Mod, Args) -> gen_server:start_link(SupName, ?MODULE, {SupName, Mod, Args}, []). %%% --------------------------------------------------- %%% Interface functions. %%% --------------------------------------------------- +-ifdef(use_specs). +-type startchild_err() :: 'already_present' + | {'already_started', Child :: child()} | term(). +-type startchild_ret() :: {'ok', Child :: child()} + | {'ok', Child :: child(), Info :: term()} + | {'error', startchild_err()}. + +-spec start_child(SupRef, ChildSpec) -> startchild_ret() when + SupRef :: sup_ref(), + ChildSpec :: child_spec() | (List :: [term()]). +-endif. start_child(Supervisor, ChildSpec) -> call(Supervisor, {start_child, ChildSpec}). +-ifdef(use_specs). +-spec restart_child(SupRef, Id) -> Result when + SupRef :: sup_ref(), + Id :: child_id(), + Result :: {'ok', Child :: child()} + | {'ok', Child :: child(), Info :: term()} + | {'error', Error}, + Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one' | + term(). +-endif. restart_child(Supervisor, Name) -> call(Supervisor, {restart_child, Name}). +-ifdef(use_specs). +-spec delete_child(SupRef, Id) -> Result when + SupRef :: sup_ref(), + Id :: child_id(), + Result :: 'ok' | {'error', Error}, + Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one'. +-endif. delete_child(Supervisor, Name) -> call(Supervisor, {delete_child, Name}). @@ -277,12 +257,44 @@ delete_child(Supervisor, Name) -> %% Note that the child is *always* terminated in some %% way (maybe killed). %%----------------------------------------------------------------- +-ifdef(use_specs). +-spec terminate_child(SupRef, Id) -> Result when + SupRef :: sup_ref(), + Id :: pid() | child_id(), + Result :: 'ok' | {'error', Error}, + Error :: 'not_found' | 'simple_one_for_one'. +-endif. terminate_child(Supervisor, Name) -> call(Supervisor, {terminate_child, Name}). +-ifdef(use_specs). +-spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when + SupRef :: sup_ref(), + Id :: child_id() | undefined, + Child :: child() | 'restarting', + Type :: worker(), + Modules :: modules(). +-endif. which_children(Supervisor) -> call(Supervisor, which_children). +-ifdef(use_specs). +-spec count_children(SupRef) -> PropListOfCounts when + SupRef :: sup_ref(), + PropListOfCounts :: [Count], + Count :: {specs, ChildSpecCount :: non_neg_integer()} + | {active, ActiveProcessCount :: non_neg_integer()} + | {supervisors, ChildSupervisorCount :: non_neg_integer()} + |{workers, ChildWorkerCount :: non_neg_integer()}. +-endif. +count_children(Supervisor) -> + call(Supervisor, count_children). + +-ifdef(use_specs). +-spec find_child(Supervisor, Name) -> [pid()] when + Supervisor :: sup_ref(), + Name :: child_id(). +-endif. find_child(Supervisor, Name) -> [Pid || {Name1, Pid, _Type, _Modules} <- which_children(Supervisor), Name1 =:= Name]. @@ -290,6 +302,11 @@ find_child(Supervisor, Name) -> call(Supervisor, Req) -> gen_server:call(Supervisor, Req, infinity). +-ifdef(use_specs). +-spec check_childspecs(ChildSpecs) -> Result when + ChildSpecs :: [child_spec()], + Result :: 'ok' | {'error', Error :: term()}. +-endif. check_childspecs(ChildSpecs) when is_list(ChildSpecs) -> case check_startspec(ChildSpecs) of {ok, _} -> ok; @@ -297,11 +314,37 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) -> end; check_childspecs(X) -> {error, {badarg, X}}. +%%%----------------------------------------------------------------- +%%% Called by timer:apply_after from restart/2 +-ifdef(use_specs). +-spec try_again_restart(SupRef, Child, Reason) -> ok when + SupRef :: sup_ref(), + Child :: child_id() | pid(), + Reason :: term(). +-endif. +try_again_restart(Supervisor, Child, Reason) -> + cast(Supervisor, {try_again_restart, Child, Reason}). + +cast(Supervisor, Req) -> + gen_server:cast(Supervisor, Req). + %%% --------------------------------------------------- %%% %%% Initialize the supervisor. %%% %%% --------------------------------------------------- +-ifdef(use_specs). +-type init_sup_name() :: sup_name() | 'self'. + +-type stop_rsn() :: {'shutdown', term()} + | {'bad_return', {module(),'init', term()}} + | {'bad_start_spec', term()} + | {'start_spec', term()} + | {'supervisor_data', term()}. + +-spec init({init_sup_name(), module(), [term()]}) -> + {'ok', state()} | 'ignore' | {'stop', stop_rsn()}. +-endif. init({SupName, Mod, Args}) -> process_flag(trap_exit, true), case Mod:init(Args) of @@ -319,7 +362,7 @@ init({SupName, Mod, Args}) -> Error -> {stop, {bad_return, {Mod, init, Error}}} end. - + init_children(State, StartSpec) -> SupName = State#state.name, case check_startspec(StartSpec) of @@ -327,9 +370,9 @@ init_children(State, StartSpec) -> case start_children(Children, SupName) of {ok, NChildren} -> {ok, State#state{children = NChildren}}; - {error, NChildren} -> + {error, NChildren, Reason} -> terminate_children(NChildren, SupName), - {stop, shutdown} + {stop, {shutdown, Reason}} end; Error -> {stop, {start_spec, Error}} @@ -347,32 +390,35 @@ init_dynamic(_State, StartSpec) -> %%----------------------------------------------------------------- %% Func: start_children/2 -%% Args: Children = [#child] in start order -%% SupName = {local, atom()} | {global, atom()} | {pid(),Mod} -%% Purpose: Start all children. The new list contains #child's +%% Args: Children = [child_rec()] in start order +%% SupName = {local, atom()} | {global, atom()} | {pid(), Mod} +%% Purpose: Start all children. The new list contains #child's %% with pids. -%% Returns: {ok, NChildren} | {error, NChildren} -%% NChildren = [#child] in termination order (reversed +%% Returns: {ok, NChildren} | {error, NChildren, Reason} +%% NChildren = [child_rec()] in termination order (reversed %% start order) %%----------------------------------------------------------------- start_children(Children, SupName) -> start_children(Children, [], SupName). start_children([Child|Chs], NChildren, SupName) -> case do_start_child(SupName, Child) of + {ok, undefined} when Child#child.restart_type =:= temporary -> + start_children(Chs, NChildren, SupName); {ok, Pid} -> start_children(Chs, [Child#child{pid = Pid}|NChildren], SupName); {ok, Pid, _Extra} -> start_children(Chs, [Child#child{pid = Pid}|NChildren], SupName); {error, Reason} -> report_error(start_error, Reason, Child, SupName), - {error, lists:reverse(Chs) ++ [Child | NChildren]} + {error, lists:reverse(Chs) ++ [Child | NChildren], + {failed_to_start_child,Child#child.name,Reason}} end; start_children([], NChildren, _SupName) -> {ok, NChildren}. do_start_child(SupName, Child) -> - #child{mfa = {M, F, A}} = Child, - case catch apply(M, F, A) of + #child{mfargs = {M, F, Args}} = Child, + case catch apply(M, F, Args) of {ok, Pid} when is_pid(Pid) -> NChild = Child#child{pid = Pid}, report_progress(NChild, SupName), @@ -381,7 +427,7 @@ do_start_child(SupName, Child) -> NChild = Child#child{pid = Pid}, report_progress(NChild, SupName), {ok, Pid, Extra}; - ignore -> + ignore -> {ok, undefined}; {error, What} -> {error, What}; What -> {error, What} @@ -400,36 +446,55 @@ do_start_child_i(M, F, A) -> What -> {error, What} end. - %%% --------------------------------------------------- %%% %%% Callback functions. %%% %%% --------------------------------------------------- +-ifdef(use_specs). +-type call() :: 'which_children' | 'count_children' | {_, _}. % XXX: refine +-spec handle_call(call(), term(), state()) -> {'reply', term(), state()}. +-endif. handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) -> - #child{mfa = {M, F, A}} = hd(State#state.children), + Child = hd(State#state.children), + #child{mfargs = {M, F, A}} = Child, Args = A ++ EArgs, case do_start_child_i(M, F, Args) of - {ok, undefined} -> - {reply, {ok, undefined}, State}; + {ok, undefined} when Child#child.restart_type =:= temporary -> + {reply, {ok, undefined}, State}; {ok, Pid} -> - NState = State#state{dynamics = - ?DICT:store(Pid, Args, State#state.dynamics)}, + NState = save_dynamic_child(Child#child.restart_type, Pid, Args, State), {reply, {ok, Pid}, NState}; {ok, Pid, Extra} -> - NState = State#state{dynamics = - ?DICT:store(Pid, Args, State#state.dynamics)}, + NState = save_dynamic_child(Child#child.restart_type, Pid, Args, State), {reply, {ok, Pid, Extra}, NState}; What -> {reply, What, State} end; -%%% The requests terminate_child, delete_child and restart_child are -%%% invalid for simple_one_for_one and simple_one_for_one_terminate -%%% supervisors. +%% terminate_child for simple_one_for_one can only be done with pid +handle_call({terminate_child, Name}, _From, State) when not is_pid(Name), + ?is_simple(State) -> + {reply, {error, simple_one_for_one}, State}; + +handle_call({terminate_child, Name}, _From, State) -> + case get_child(Name, State, ?is_simple(State)) of + {value, Child} -> + case do_terminate(Child, State#state.name) of + #child{restart_type=RT} when RT=:=temporary; ?is_simple(State) -> + {reply, ok, state_del_child(Child, State)}; + NChild -> + {reply, ok, replace_child(NChild, State)} + end; + false -> + {reply, {error, not_found}, State} + end; + +%%% The requests delete_child and restart_child are invalid for +%%% simple_one_for_one supervisors. handle_call({_Req, _Data}, _From, State) when ?is_simple(State) -> - {reply, {error, State#state.strategy}, State}; + {reply, {error, simple_one_for_one}, State}; handle_call({start_child, ChildSpec}, _From, State) -> case check_childspec(ChildSpec) of @@ -453,6 +518,8 @@ handle_call({restart_child, Name}, _From, State) -> Error -> {reply, Error, State} end; + {value, #child{pid=?restarting(_)}} -> + {reply, {error, restarting}, State}; {value, _} -> {reply, {error, running}, State}; _ -> @@ -464,60 +531,146 @@ handle_call({delete_child, Name}, _From, State) -> {value, Child} when Child#child.pid =:= undefined -> NState = remove_child(Child, State), {reply, ok, NState}; + {value, #child{pid=?restarting(_)}} -> + {reply, {error, restarting}, State}; {value, _} -> {reply, {error, running}, State}; _ -> {reply, {error, not_found}, State} end; -handle_call({terminate_child, Name}, _From, State) -> - case get_child(Name, State) of - {value, Child} -> - NChild = do_terminate(Child, State#state.name), - {reply, ok, replace_child(NChild, State)}; - _ -> - {reply, {error, not_found}, State} - end; +handle_call(which_children, _From, #state{children = [#child{restart_type = temporary, + child_type = CT, + modules = Mods}]} = + State) when ?is_simple(State) -> + Reply = lists:map(fun(Pid) -> {undefined, Pid, CT, Mods} end, + ?SETS:to_list(dynamics_db(temporary, State#state.dynamics))), + {reply, Reply, State}; -handle_call(which_children, _From, State) when ?is_simple(State) -> - [#child{child_type = CT, modules = Mods}] = State#state.children, - Reply = lists:map(fun ({Pid, _}) -> {undefined, Pid, CT, Mods} end, - ?DICT:to_list(State#state.dynamics)), +handle_call(which_children, _From, #state{children = [#child{restart_type = RType, + child_type = CT, + modules = Mods}]} = + State) when ?is_simple(State) -> + Reply = lists:map(fun({?restarting(_),_}) -> {undefined,restarting,CT,Mods}; + ({Pid, _}) -> {undefined, Pid, CT, Mods} end, + ?DICT:to_list(dynamics_db(RType, State#state.dynamics))), {reply, Reply, State}; handle_call(which_children, _From, State) -> Resp = - lists:map(fun (#child{pid = Pid, name = Name, + lists:map(fun(#child{pid = ?restarting(_), name = Name, + child_type = ChildType, modules = Mods}) -> + {Name, restarting, ChildType, Mods}; + (#child{pid = Pid, name = Name, child_type = ChildType, modules = Mods}) -> - {Name, Pid, ChildType, Mods} + {Name, Pid, ChildType, Mods} end, State#state.children), - {reply, Resp, State}. + {reply, Resp, State}; -%%% Hopefully cause a function-clause as there is no API function -%%% that utilizes cast. -handle_cast(null, State) -> - error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n", - []), - {noreply, State}. +handle_call(count_children, _From, #state{children = [#child{restart_type = temporary, + child_type = CT}]} = State) + when ?is_simple(State) -> + {Active, Count} = + ?SETS:fold(fun(Pid, {Alive, Tot}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true ->{Alive+1, Tot +1}; + false -> + {Alive, Tot + 1} + end + end, {0, 0}, dynamics_db(temporary, State#state.dynamics)), + Reply = case CT of + supervisor -> [{specs, 1}, {active, Active}, + {supervisors, Count}, {workers, 0}]; + worker -> [{specs, 1}, {active, Active}, + {supervisors, 0}, {workers, Count}] + end, + {reply, Reply, State}; -handle_info({delayed_restart, {RestartType, Reason, Child}}, State) +handle_call(count_children, _From, #state{children = [#child{restart_type = RType, + child_type = CT}]} = State) when ?is_simple(State) -> - {ok, NState} = do_restart(RestartType, Reason, Child, State), - {noreply, NState}; -handle_info({delayed_restart, {RestartType, Reason, Child}}, State) -> - case get_child(Child#child.name, State) of - {value, Child1} -> - {ok, NState} = do_restart(RestartType, Reason, Child1, State), - {noreply, NState}; - _ -> + {Active, Count} = + ?DICT:fold(fun(Pid, _Val, {Alive, Tot}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true -> + {Alive+1, Tot +1}; + false -> + {Alive, Tot + 1} + end + end, {0, 0}, dynamics_db(RType, State#state.dynamics)), + Reply = case CT of + supervisor -> [{specs, 1}, {active, Active}, + {supervisors, Count}, {workers, 0}]; + worker -> [{specs, 1}, {active, Active}, + {supervisors, 0}, {workers, Count}] + end, + {reply, Reply, State}; + +handle_call(count_children, _From, State) -> + %% Specs and children are together on the children list... + {Specs, Active, Supers, Workers} = + lists:foldl(fun(Child, Counts) -> + count_child(Child, Counts) + end, {0,0,0,0}, State#state.children), + + %% Reformat counts to a property list. + Reply = [{specs, Specs}, {active, Active}, + {supervisors, Supers}, {workers, Workers}], + {reply, Reply, State}. + + +count_child(#child{pid = Pid, child_type = worker}, + {Specs, Active, Supers, Workers}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true -> {Specs+1, Active+1, Supers, Workers+1}; + false -> {Specs+1, Active, Supers, Workers+1} + end; +count_child(#child{pid = Pid, child_type = supervisor}, + {Specs, Active, Supers, Workers}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true -> {Specs+1, Active+1, Supers+1, Workers}; + false -> {Specs+1, Active, Supers+1, Workers} + end. + + +%%% If a restart attempt failed, this message is sent via +%%% timer:apply_after(0,...) in order to give gen_server the chance to +%%% check it's inbox before trying again. +-ifdef(use_specs). +-spec handle_cast({try_again_restart, child_id() | pid(), term()}, state()) -> + {'noreply', state()} | {stop, shutdown, state()}. +-endif. +handle_cast({try_again_restart,Pid,Reason}, #state{children=[Child]}=State) + when ?is_simple(State) -> + RT = Child#child.restart_type, + RPid = restarting(Pid), + case dynamic_child_args(RPid, dynamics_db(RT, State#state.dynamics)) of + {ok, Args} -> + {M, F, _} = Child#child.mfargs, + NChild = Child#child{pid = RPid, mfargs = {M, F, Args}}, + try_restart(Child#child.restart_type, Reason, NChild, State); + error -> {noreply, State} end; +handle_cast({try_again_restart,Name,Reason}, State) -> + %% we still support >= R12-B3 in which lists:keyfind/3 doesn't exist + case lists:keysearch(Name,#child.name,State#state.children) of + {value, Child = #child{pid=?restarting(_), restart_type=RestartType}} -> + try_restart(RestartType, Reason, Child, State); + _ -> + {noreply,State} + end. + %% %% Take care of terminated children. %% +-ifdef(use_specs). +-spec handle_info(term(), state()) -> + {'noreply', state()} | {'stop', 'shutdown', state()}. +-endif. handle_info({'EXIT', Pid, Reason}, State) -> case restart_child(Pid, Reason, State) of {ok, State1} -> @@ -526,20 +679,41 @@ handle_info({'EXIT', Pid, Reason}, State) -> {stop, shutdown, State1} end; +handle_info({delayed_restart, {RestartType, Reason, Child}}, State) + when ?is_simple(State) -> + try_restart(RestartType, Reason, Child, State#state{restarts = []}); %% [1] +handle_info({delayed_restart, {RestartType, Reason, Child}}, State) -> + case get_child(Child#child.name, State) of + {value, Child1} -> + try_restart(RestartType, Reason, Child1, + State#state{restarts = []}); %% [1] + _What -> + {noreply, State} + end; +%% [1] When we receive a delayed_restart message we want to reset the +%% restarts field since otherwise the MaxT might not have elapsed and +%% we would just delay again and again. Since a common use of the +%% delayed restart feature is for MaxR = 1, MaxT = some huge number +%% (so that we don't end up bouncing around in non-delayed restarts) +%% this is important. + handle_info(Msg, State) -> error_logger:error_msg("Supervisor received unexpected message: ~p~n", [Msg]), {noreply, State}. + %% %% Terminate this server. %% -terminate(_Reason, State) when ?is_terminate_simple(State) -> - terminate_simple_children( - hd(State#state.children), State#state.dynamics, State#state.name), - ok; +-ifdef(use_specs). +-spec terminate(term(), state()) -> 'ok'. +-endif. +terminate(_Reason, #state{children=[Child]} = State) when ?is_simple(State) -> + terminate_dynamic_children(Child, dynamics_db(Child#child.restart_type, + State#state.dynamics), + State#state.name); terminate(_Reason, State) -> - terminate_children(State#state.children, State#state.name), - ok. + terminate_children(State#state.children, State#state.name). %% %% Change code for the supervisor. @@ -550,6 +724,10 @@ terminate(_Reason, State) -> %% NOTE: This requires that the init function of the call-back module %% does not have any side effects. %% +-ifdef(use_specs). +-spec code_change(term(), state(), term()) -> + {'ok', state()} | {'error', term()}. +-endif. code_change(_, State, _) -> case (State#state.module):init(State#state.args) of {ok, {SupFlags, StartSpec}} -> @@ -577,14 +755,13 @@ check_flags({Strategy, MaxIntensity, Period}) -> check_flags(What) -> {bad_flags, What}. -update_childspec(State, StartSpec) when ?is_simple(State) -> - case check_startspec(StartSpec) of - {ok, [Child]} -> - {ok, State#state{children = [Child]}}; - Error -> - {error, Error} - end; - +update_childspec(State, StartSpec) when ?is_simple(State) -> + case check_startspec(StartSpec) of + {ok, [Child]} -> + {ok, State#state{children = [Child]}}; + Error -> + {error, Error} + end; update_childspec(State, StartSpec) -> case check_startspec(StartSpec) of {ok, Children} -> @@ -603,11 +780,11 @@ update_childspec1([Child|OldC], Children, KeepOld) -> update_childspec1(OldC, Children, [Child|KeepOld]) end; update_childspec1([], Children, KeepOld) -> - % Return them in (keeped) reverse start order. - lists:reverse(Children ++ KeepOld). + %% Return them in (kept) reverse start order. + lists:reverse(Children ++ KeepOld). update_chsp(OldCh, Children) -> - case lists:map(fun (Ch) when OldCh#child.name =:= Ch#child.name -> + case lists:map(fun(Ch) when OldCh#child.name =:= Ch#child.name -> Ch#child{pid = OldCh#child.pid}; (Ch) -> Ch @@ -627,20 +804,16 @@ handle_start_child(Child, State) -> case get_child(Child#child.name, State) of false -> case do_start_child(State#state.name, Child) of + {ok, undefined} when Child#child.restart_type =:= temporary -> + {{ok, undefined}, State}; {ok, Pid} -> - Children = State#state.children, - {{ok, Pid}, - State#state{children = - [Child#child{pid = Pid}|Children]}}; + {{ok, Pid}, save_child(Child#child{pid = Pid}, State)}; {ok, Pid, Extra} -> - Children = State#state.children, - {{ok, Pid, Extra}, - State#state{children = - [Child#child{pid = Pid}|Children]}}; + {{ok, Pid, Extra}, save_child(Child#child{pid = Pid}, State)}; {error, What} -> {{error, {What, Child}}, State} end; - {value, OldChild} when OldChild#child.pid =/= undefined -> + {value, OldChild} when is_pid(OldChild#child.pid) -> {{error, {already_started, OldChild#child.pid}}, State}; {value, _OldChild} -> {{error, already_present}, State} @@ -648,127 +821,145 @@ handle_start_child(Child, State) -> %%% --------------------------------------------------- %%% Restart. A process has terminated. -%%% Returns: {ok, #state} | {shutdown, #state} +%%% Returns: {ok, state()} | {shutdown, state()} %%% --------------------------------------------------- -restart_child(Pid, Reason, State) when ?is_simple(State) -> - case ?DICT:find(Pid, State#state.dynamics) of +restart_child(Pid, Reason, #state{children = [Child]} = State) when ?is_simple(State) -> + RestartType = Child#child.restart_type, + case dynamic_child_args(Pid, dynamics_db(RestartType, State#state.dynamics)) of {ok, Args} -> - [Child] = State#state.children, - RestartType = Child#child.restart_type, - {M, F, _} = Child#child.mfa, - NChild = Child#child{pid = Pid, mfa = {M, F, Args}}, + {M, F, _} = Child#child.mfargs, + NChild = Child#child{pid = Pid, mfargs = {M, F, Args}}, do_restart(RestartType, Reason, NChild, State); error -> - {ok, State} + {ok, State} end; + restart_child(Pid, Reason, State) -> Children = State#state.children, + %% we still support >= R12-B3 in which lists:keyfind/3 doesn't exist case lists:keysearch(Pid, #child.pid, Children) of - {value, Child} -> - RestartType = Child#child.restart_type, + {value, #child{restart_type = RestartType} = Child} -> do_restart(RestartType, Reason, Child, State); - _ -> + false -> {ok, State} end. -do_restart({permanent = RestartType, Delay}, Reason, Child, State) -> - do_restart_delay({RestartType, Delay}, Reason, Child, State); -do_restart(permanent, Reason, Child, State) -> - report_error(child_terminated, Reason, Child, State#state.name), - restart(Child, State); -do_restart(Type, normal, Child, State) -> - del_child_and_maybe_shutdown(Type, Child, State); -do_restart({RestartType, Delay}, {shutdown, restart} = Reason, Child, State) - when RestartType =:= transient orelse RestartType =:= intrinsic -> - do_restart_delay({RestartType, Delay}, Reason, Child, State); -do_restart(Type, {shutdown, _}, Child, State) -> - del_child_and_maybe_shutdown(Type, Child, State); -do_restart(Type, shutdown, Child = #child{child_type = supervisor}, State) -> - del_child_and_maybe_shutdown(Type, Child, State); -do_restart({RestartType, Delay}, Reason, Child, State) - when RestartType =:= transient orelse RestartType =:= intrinsic -> - do_restart_delay({RestartType, Delay}, Reason, Child, State); -do_restart(Type, Reason, Child, State) when Type =:= transient orelse - Type =:= intrinsic -> - report_error(child_terminated, Reason, Child, State#state.name), +try_restart(RestartType, Reason, Child, State) -> + case handle_restart(RestartType, Reason, Child, State) of + {ok, NState} -> {noreply, NState}; + {shutdown, State2} -> {stop, shutdown, State2} + end. + +do_restart(RestartType, Reason, Child, State) -> + maybe_report_error(RestartType, Reason, Child, State), + handle_restart(RestartType, Reason, Child, State). + +maybe_report_error(permanent, Reason, Child, State) -> + report_child_termination(Reason, Child, State); +maybe_report_error({permanent, _}, Reason, Child, State) -> + report_child_termination(Reason, Child, State); +maybe_report_error(_Type, Reason, Child, State) -> + case is_abnormal_termination(Reason) of + true -> report_child_termination(Reason, Child, State); + false -> ok + end. + +report_child_termination(Reason, Child, State) -> + report_error(child_terminated, Reason, Child, State#state.name). + +handle_restart(permanent, _Reason, Child, State) -> restart(Child, State); -do_restart(temporary, Reason, Child, State) -> - report_error(child_terminated, Reason, Child, State#state.name), - NState = state_del_child(Child, State), - {ok, NState}. +handle_restart(transient, Reason, Child, State) -> + restart_if_explicit_or_abnormal(fun restart/2, + fun delete_child_and_continue/2, + Reason, Child, State); +handle_restart(intrinsic, Reason, Child, State) -> + restart_if_explicit_or_abnormal(fun restart/2, + fun delete_child_and_stop/2, + Reason, Child, State); +handle_restart(temporary, _Reason, Child, State) -> + delete_child_and_continue(Child, State); +handle_restart({permanent, _Delay}=Restart, Reason, Child, State) -> + do_restart_delay(Restart, Reason, Child, State); +handle_restart({transient, _Delay}=Restart, Reason, Child, State) -> + restart_if_explicit_or_abnormal(defer_to_restart_delay(Restart, Reason), + fun delete_child_and_continue/2, + Reason, Child, State); +handle_restart({intrinsic, _Delay}=Restart, Reason, Child, State) -> + restart_if_explicit_or_abnormal(defer_to_restart_delay(Restart, Reason), + fun delete_child_and_stop/2, + Reason, Child, State). + +restart_if_explicit_or_abnormal(RestartHow, Otherwise, Reason, Child, State) -> + case ?is_explicit_restart(Reason) orelse is_abnormal_termination(Reason) of + true -> RestartHow(Child, State); + false -> Otherwise(Child, State) + end. + +defer_to_restart_delay(Restart, Reason) -> + fun(Child, State) -> do_restart_delay(Restart, Reason, Child, State) end. + +delete_child_and_continue(Child, State) -> + {ok, state_del_child(Child, State)}. + +delete_child_and_stop(Child, State) -> + {shutdown, state_del_child(Child, State)}. + +is_abnormal_termination(normal) -> false; +is_abnormal_termination(shutdown) -> false; +is_abnormal_termination({shutdown, _}) -> false; +is_abnormal_termination(_Other) -> true. do_restart_delay({RestartType, Delay}, Reason, Child, State) -> - case restart1(Child, State) of + case add_restart(State) of {ok, NState} -> - {ok, NState}; - {terminate, NState} -> - Wait = case Delay of - 0 -> - %% Delay of 0 indicates to wait as little - %% as possible. That's until next restart - %% "expires" - [EarliestTS | _] = lists:reverse(State#state.restarts), - {EarliestMegaS, EarliestS, _} = EarliestTS, - %% NOTE: difference() in restart1 ignores - %% micros, so earliest time when oldest - %% restart expires can be constructed as - %% follows - ExpireTS = {EarliestMegaS, EarliestS + State#state.period + 1, 0}, - %% and in case it's already "expired", we - %% shouldn't allow wait to be negative - erlang:max(0, (timer:now_diff(ExpireTS, erlang:now()) + 999) div 1000); - _ -> - trunc(Delay * 1000) - end, - %% when we've already waited enough, there's no need to - %% wait another full Delay. Just enough to be able to - %% "fit" extra restart into our window. So if our next - %% restart attempt fails we'll use delay of 0 - _TRef = erlang:send_after(Wait, self(), + maybe_restart(NState#state.strategy, Child, NState); + {terminate, _NState} -> + %% we've reached the max restart intensity, but the + %% add_restart will have added to the restarts + %% field. Given we don't want to die here, we need to go + %% back to the old restarts field otherwise we'll never + %% attempt to restart later, which is why we ignore + %% NState for this clause. + _TRef = erlang:send_after(trunc(Delay*1000), self(), {delayed_restart, - {{RestartType, 0}, Reason, Child}}), - {ok, state_del_child(Child, NState)} + {{RestartType, Delay}, Reason, Child}}), + {ok, state_del_child(Child, State)} end. -del_child_and_maybe_shutdown(intrinsic, Child, State) -> - {shutdown, state_del_child(Child, State)}; -del_child_and_maybe_shutdown({intrinsic, _Delay}, Child, State) -> - {shutdown, state_del_child(Child, State)}; -del_child_and_maybe_shutdown(_, Child, State) -> - {ok, state_del_child(Child, State)}. - restart(Child, State) -> case add_restart(State) of {ok, NState} -> - restart(NState#state.strategy, Child, NState, fun restart/2); + maybe_restart(NState#state.strategy, Child, NState); {terminate, NState} -> report_error(shutdown, reached_max_restart_intensity, Child, State#state.name), - {shutdown, state_del_child(Child, NState)} + {shutdown, remove_child(Child, NState)} end. -restart1(Child, State) -> - case add_restart(State) of - {ok, NState} -> - restart(NState#state.strategy, Child, NState, fun restart1/2); - {terminate, _NState} -> - %% we've reached the max restart intensity, but the - %% add_restart will have added to the restarts - %% field. Given we don't want to die here, we need to go - %% back to the old restarts field otherwise we'll never - %% attempt to restart later. - {terminate, State} +maybe_restart(Strategy, Child, State) -> + case restart(Strategy, Child, State) of + {try_again, Reason, NState2} -> + %% Leaving control back to gen_server before + %% trying again. This way other incoming requsts + %% for the supervisor can be handled - e.g. a + %% shutdown request for the supervisor or the + %% child. + Id = if ?is_simple(State) -> Child#child.pid; + true -> Child#child.name + end, + timer:apply_after(0,?MODULE,try_again_restart,[self(),Id,Reason]), + {ok,NState2}; + Other -> + Other end. -restart(Strategy, Child, State, Restart) - when Strategy =:= simple_one_for_one orelse - Strategy =:= simple_one_for_one_terminate -> - #child{mfa = {M, F, A}} = Child, - Dynamics = ?DICT:erase(Child#child.pid, State#state.dynamics), +restart(simple_one_for_one, Child, State) -> + #child{pid = OldPid, mfargs = {M, F, A}} = Child, + Dynamics = ?DICT:erase(OldPid, dynamics_db(Child#child.restart_type, + State#state.dynamics)), case do_start_child_i(M, F, A) of - {ok, undefined} -> - {ok, State}; {ok, Pid} -> NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)}, {ok, NState}; @@ -776,10 +967,13 @@ restart(Strategy, Child, State, Restart) NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)}, {ok, NState}; {error, Error} -> + NState = State#state{dynamics = ?DICT:store(restarting(OldPid), A, + Dynamics)}, report_error(start_error, Error, Child, State#state.name), - Restart(Child, State) + {try_again, Error, NState} end; -restart(one_for_one, Child, State, Restart) -> +restart(one_for_one, Child, State) -> + OldPid = Child#child.pid, case do_start_child(State#state.name, Child) of {ok, Pid} -> NState = replace_child(Child#child{pid = Pid}, State), @@ -788,123 +982,71 @@ restart(one_for_one, Child, State, Restart) -> NState = replace_child(Child#child{pid = Pid}, State), {ok, NState}; {error, Reason} -> + NState = replace_child(Child#child{pid = restarting(OldPid)}, State), report_error(start_error, Reason, Child, State#state.name), - Restart(Child, State) + {try_again, Reason, NState} end; -restart(rest_for_one, Child, State, Restart) -> +restart(rest_for_one, Child, State) -> {ChAfter, ChBefore} = split_child(Child#child.pid, State#state.children), ChAfter2 = terminate_children(ChAfter, State#state.name), case start_children(ChAfter2, State#state.name) of {ok, ChAfter3} -> {ok, State#state{children = ChAfter3 ++ ChBefore}}; - {error, ChAfter3} -> - Restart(Child, State#state{children = ChAfter3 ++ ChBefore}) + {error, ChAfter3, Reason} -> + NChild = Child#child{pid=restarting(Child#child.pid)}, + NState = State#state{children = ChAfter3 ++ ChBefore}, + {try_again, Reason, replace_child(NChild,NState)} end; -restart(one_for_all, Child, State, Restart) -> +restart(one_for_all, Child, State) -> Children1 = del_child(Child#child.pid, State#state.children), Children2 = terminate_children(Children1, State#state.name), case start_children(Children2, State#state.name) of {ok, NChs} -> {ok, State#state{children = NChs}}; - {error, NChs} -> - Restart(Child, State#state{children = NChs}) + {error, NChs, Reason} -> + NChild = Child#child{pid=restarting(Child#child.pid)}, + NState = State#state{children = NChs}, + {try_again, Reason, replace_child(NChild,NState)} end. +restarting(Pid) when is_pid(Pid) -> ?restarting(Pid); +restarting(RPid) -> RPid. + %%----------------------------------------------------------------- %% Func: terminate_children/2 -%% Args: Children = [#child] in termination order +%% Args: Children = [child_rec()] in termination order %% SupName = {local, atom()} | {global, atom()} | {pid(),Mod} -%% Returns: NChildren = [#child] in +%% Returns: NChildren = [child_rec()] in %% startup order (reversed termination order) %%----------------------------------------------------------------- terminate_children(Children, SupName) -> terminate_children(Children, SupName, []). +%% Temporary children should not be restarted and thus should +%% be skipped when building the list of terminated children, although +%% we do want them to be shut down as many functions from this module +%% use this function to just clear everything. +terminate_children([Child = #child{restart_type=temporary} | Children], SupName, Res) -> + do_terminate(Child, SupName), + terminate_children(Children, SupName, Res); terminate_children([Child | Children], SupName, Res) -> NChild = do_terminate(Child, SupName), terminate_children(Children, SupName, [NChild | Res]); terminate_children([], _SupName, Res) -> Res. -terminate_simple_children(Child, Dynamics, SupName) -> - Pids = dict:fold(fun (Pid, _Args, Pids) -> - erlang:monitor(process, Pid), - unlink(Pid), - exit(Pid, child_exit_reason(Child)), - [Pid | Pids] - end, [], Dynamics), - TimeoutMsg = {timeout, make_ref()}, - TRef = timeout_start(Child, TimeoutMsg), - {Replies, Timedout} = - lists:foldl( - fun (_Pid, {Replies, Timedout}) -> - {Reply, Timedout1} = - receive - TimeoutMsg -> - Remaining = Pids -- [P || {P, _} <- Replies], - [exit(P, kill) || P <- Remaining], - receive {'DOWN', _MRef, process, Pid, Reason} -> - {{error, Reason}, true} - end; - {'DOWN', _MRef, process, Pid, Reason} -> - {child_res(Child, Reason, Timedout), Timedout}; - {'EXIT', Pid, Reason} -> - receive {'DOWN', _MRef, process, Pid, _} -> - {{error, Reason}, Timedout} - end - end, - {[{Pid, Reply} | Replies], Timedout1} - end, {[], false}, Pids), - timeout_stop(Child, TRef, TimeoutMsg, Timedout), - ReportError = shutdown_error_reporter(SupName), - [case Reply of - {_Pid, ok} -> ok; - {Pid, {error, R}} -> ReportError(R, Child#child{pid = Pid}) - end || Reply <- Replies], - ok. - -child_exit_reason(#child{shutdown = brutal_kill}) -> kill; -child_exit_reason(#child{}) -> shutdown. - -child_res(#child{shutdown=brutal_kill}, killed, false) -> ok; -child_res(#child{}, shutdown, false) -> ok; -child_res(#child{restart_type=permanent}, normal, false) -> {error, normal}; -child_res(#child{restart_type={permanent,_}},normal, false) -> {error, normal}; -child_res(#child{}, normal, false) -> ok; -child_res(#child{}, R, _) -> {error, R}. - -timeout_start(#child{shutdown = Time}, Msg) when is_integer(Time) -> - erlang:send_after(Time, self(), Msg); -timeout_start(#child{}, _Msg) -> - ok. - -timeout_stop(#child{shutdown = Time}, TRef, Msg, false) when is_integer(Time) -> - erlang:cancel_timer(TRef), - receive - Msg -> ok - after - 0 -> ok - end; -timeout_stop(#child{}, ok, _Msg, _Timedout) -> - ok. - -do_terminate(Child, SupName) when Child#child.pid =/= undefined -> - ReportError = shutdown_error_reporter(SupName), +do_terminate(Child, SupName) when is_pid(Child#child.pid) -> case shutdown(Child#child.pid, Child#child.shutdown) of ok -> ok; - {error, normal} -> - case Child#child.restart_type of - permanent -> ReportError(normal, Child); - {permanent, _Delay} -> ReportError(normal, Child); - _ -> ok - end; + {error, normal} when not ?is_permanent(Child#child.restart_type) -> + ok; {error, OtherReason} -> - ReportError(OtherReason, Child) + report_error(shutdown_error, OtherReason, Child, SupName) end, Child#child{pid = undefined}; do_terminate(Child, _SupName) -> - Child. + Child#child{pid = undefined}. %%----------------------------------------------------------------- %% Shutdowns a child. We must check the EXIT value @@ -917,7 +1059,6 @@ do_terminate(Child, _SupName) -> %% Returns: ok | {error, OtherReason} (this should be reported) %%----------------------------------------------------------------- shutdown(Pid, brutal_kill) -> - case monitor_child(Pid) of ok -> exit(Pid, kill), @@ -930,9 +1071,7 @@ shutdown(Pid, brutal_kill) -> {error, Reason} -> {error, Reason} end; - shutdown(Pid, Time) -> - case monitor_child(Pid) of ok -> exit(Pid, shutdown), %% Try to shutdown gracefully @@ -979,20 +1118,163 @@ monitor_child(Pid) -> %% that will be handled in shutdown/2. ok end. - - + + +%%----------------------------------------------------------------- +%% Func: terminate_dynamic_children/3 +%% Args: Child = child_rec() +%% Dynamics = ?DICT() | ?SET() +%% SupName = {local, atom()} | {global, atom()} | {pid(),Mod} +%% Returns: ok +%% +%% +%% Shutdown all dynamic children. This happens when the supervisor is +%% stopped. Because the supervisor can have millions of dynamic children, we +%% can have an significative overhead here. +%%----------------------------------------------------------------- +terminate_dynamic_children(Child, Dynamics, SupName) -> + {Pids, EStack0} = monitor_dynamic_children(Child, Dynamics), + Sz = ?SETS:size(Pids), + EStack = case Child#child.shutdown of + brutal_kill -> + ?SETS:fold(fun(P, _) -> exit(P, kill) end, ok, Pids), + wait_dynamic_children(Child, Pids, Sz, undefined, EStack0); + infinity -> + ?SETS:fold(fun(P, _) -> exit(P, shutdown) end, ok, Pids), + wait_dynamic_children(Child, Pids, Sz, undefined, EStack0); + Time -> + ?SETS:fold(fun(P, _) -> exit(P, shutdown) end, ok, Pids), + TRef = erlang:start_timer(Time, self(), kill), + wait_dynamic_children(Child, Pids, Sz, TRef, EStack0) + end, + %% Unroll stacked errors and report them + ?DICT:fold(fun(Reason, Ls, _) -> + report_error(shutdown_error, Reason, + Child#child{pid=Ls}, SupName) + end, ok, EStack). + + +monitor_dynamic_children(#child{restart_type=temporary}, Dynamics) -> + ?SETS:fold(fun(P, {Pids, EStack}) -> + case monitor_child(P) of + ok -> + {?SETS:add_element(P, Pids), EStack}; + {error, normal} -> + {Pids, EStack}; + {error, Reason} -> + {Pids, ?DICT:append(Reason, P, EStack)} + end + end, {?SETS:new(), ?DICT:new()}, Dynamics); +monitor_dynamic_children(#child{restart_type=RType}, Dynamics) -> + ?DICT:fold(fun(P, _, {Pids, EStack}) when is_pid(P) -> + case monitor_child(P) of + ok -> + {?SETS:add_element(P, Pids), EStack}; + {error, normal} when not ?is_permanent(RType) -> + {Pids, EStack}; + {error, Reason} -> + {Pids, ?DICT:append(Reason, P, EStack)} + end; + (?restarting(_), _, {Pids, EStack}) -> + {Pids, EStack} + end, {?SETS:new(), ?DICT:new()}, Dynamics). + +wait_dynamic_children(_Child, _Pids, 0, undefined, EStack) -> + EStack; +wait_dynamic_children(_Child, _Pids, 0, TRef, EStack) -> + %% If the timer has expired before its cancellation, we must empty the + %% mail-box of the 'timeout'-message. + erlang:cancel_timer(TRef), + receive + {timeout, TRef, kill} -> + EStack + after 0 -> + EStack + end; +wait_dynamic_children(#child{shutdown=brutal_kill} = Child, Pids, Sz, + TRef, EStack) -> + receive + {'DOWN', _MRef, process, Pid, killed} -> + wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1, + TRef, EStack); + + {'DOWN', _MRef, process, Pid, Reason} -> + wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1, + TRef, ?DICT:append(Reason, Pid, EStack)) + end; +wait_dynamic_children(#child{restart_type=RType} = Child, Pids, Sz, + TRef, EStack) -> + receive + {'DOWN', _MRef, process, Pid, shutdown} -> + wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1, + TRef, EStack); + + {'DOWN', _MRef, process, Pid, normal} when not ?is_permanent(RType) -> + wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1, + TRef, EStack); + + {'DOWN', _MRef, process, Pid, Reason} -> + wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1, + TRef, ?DICT:append(Reason, Pid, EStack)); + + {timeout, TRef, kill} -> + ?SETS:fold(fun(P, _) -> exit(P, kill) end, ok, Pids), + wait_dynamic_children(Child, Pids, Sz-1, undefined, EStack) + end. + %%----------------------------------------------------------------- %% Child/State manipulating functions. %%----------------------------------------------------------------- -state_del_child(#child{pid = Pid}, State) when ?is_simple(State) -> - NDynamics = ?DICT:erase(Pid, State#state.dynamics), + +%% Note we do not want to save the parameter list for temporary processes as +%% they will not be restarted, and hence we do not need this information. +%% Especially for dynamic children to simple_one_for_one supervisors +%% it could become very costly as it is not uncommon to spawn +%% very many such processes. +save_child(#child{restart_type = temporary, + mfargs = {M, F, _}} = Child, #state{children = Children} = State) -> + State#state{children = [Child#child{mfargs = {M, F, undefined}} |Children]}; +save_child(Child, #state{children = Children} = State) -> + State#state{children = [Child |Children]}. + +save_dynamic_child(temporary, Pid, _, #state{dynamics = Dynamics} = State) -> + State#state{dynamics = ?SETS:add_element(Pid, dynamics_db(temporary, Dynamics))}; +save_dynamic_child(RestartType, Pid, Args, #state{dynamics = Dynamics} = State) -> + State#state{dynamics = ?DICT:store(Pid, Args, dynamics_db(RestartType, Dynamics))}. + +dynamics_db(temporary, undefined) -> + ?SETS:new(); +dynamics_db(_, undefined) -> + ?DICT:new(); +dynamics_db(_,Dynamics) -> + Dynamics. + +dynamic_child_args(Pid, Dynamics) -> + case ?SETS:is_set(Dynamics) of + true -> + {ok, undefined}; + false -> + ?DICT:find(Pid, Dynamics) + end. + +state_del_child(#child{pid = Pid, restart_type = temporary}, State) when ?is_simple(State) -> + NDynamics = ?SETS:del_element(Pid, dynamics_db(temporary, State#state.dynamics)), + State#state{dynamics = NDynamics}; +state_del_child(#child{pid = Pid, restart_type = RType}, State) when ?is_simple(State) -> + NDynamics = ?DICT:erase(Pid, dynamics_db(RType, State#state.dynamics)), State#state{dynamics = NDynamics}; state_del_child(Child, State) -> NChildren = del_child(Child#child.name, State#state.children), State#state{children = NChildren}. +del_child(Name, [Ch=#child{pid = ?restarting(_)}|_]=Chs) when Ch#child.name =:= Name -> + Chs; +del_child(Name, [Ch|Chs]) when Ch#child.name =:= Name, Ch#child.restart_type =:= temporary -> + Chs; del_child(Name, [Ch|Chs]) when Ch#child.name =:= Name -> [Ch#child{pid = undefined} | Chs]; +del_child(Pid, [Ch|Chs]) when Ch#child.pid =:= Pid, Ch#child.restart_type =:= temporary -> + Chs; del_child(Pid, [Ch|Chs]) when Ch#child.pid =:= Pid -> [Ch#child{pid = undefined} | Chs]; del_child(Name, [Ch|Chs]) -> @@ -1015,7 +1297,38 @@ split_child(_, [], After) -> {lists:reverse(After), []}. get_child(Name, State) -> + get_child(Name, State, false). +get_child(Pid, State, AllowPid) when AllowPid, is_pid(Pid) -> + get_dynamic_child(Pid, State); +get_child(Name, State, _) -> lists:keysearch(Name, #child.name, State#state.children). + +get_dynamic_child(Pid, #state{children=[Child], dynamics=Dynamics}) -> + DynamicsDb = dynamics_db(Child#child.restart_type, Dynamics), + case is_dynamic_pid(Pid, DynamicsDb) of + true -> + {value, Child#child{pid=Pid}}; + false -> + RPid = restarting(Pid), + case is_dynamic_pid(RPid, DynamicsDb) of + true -> + {value, Child#child{pid=RPid}}; + false -> + case erlang:is_process_alive(Pid) of + true -> false; + false -> {value, Child} + end + end + end. + +is_dynamic_pid(Pid, Dynamics) -> + case ?SETS:is_set(Dynamics) of + true -> + ?SETS:is_element(Pid, Dynamics); + false -> + ?DICT:is_key(Pid, Dynamics) + end. + replace_child(Child, State) -> Chs = do_replace_child(Child, State#state.children), State#state{children = Chs}. @@ -1034,13 +1347,13 @@ remove_child(Child, State) -> %% Args: SupName = {local, atom()} | {global, atom()} | self %% Type = {Strategy, MaxIntensity, Period} %% Strategy = one_for_one | one_for_all | simple_one_for_one | -%% rest_for_one -%% MaxIntensity = integer() -%% Period = integer() +%% rest_for_one +%% MaxIntensity = integer() >= 0 +%% Period = integer() > 0 %% Mod :== atom() -%% Arsg :== term() +%% Args :== term() %% Purpose: Check that Type is of correct type (!) -%% Returns: {ok, #state} | Error +%% Returns: {ok, state()} | Error %%----------------------------------------------------------------- init_state(SupName, Type, Mod, Args) -> case catch init_state1(SupName, Type, Mod, Args) of @@ -1055,46 +1368,45 @@ init_state1(SupName, {Strategy, MaxIntensity, Period}, Mod, Args) -> validIntensity(MaxIntensity), validPeriod(Period), {ok, #state{name = supname(SupName,Mod), - strategy = Strategy, - intensity = MaxIntensity, - period = Period, - module = Mod, - args = Args}}; + strategy = Strategy, + intensity = MaxIntensity, + period = Period, + module = Mod, + args = Args}}; init_state1(_SupName, Type, _, _) -> {invalid_type, Type}. -validStrategy(simple_one_for_one_terminate) -> true; -validStrategy(simple_one_for_one) -> true; -validStrategy(one_for_one) -> true; -validStrategy(one_for_all) -> true; -validStrategy(rest_for_one) -> true; -validStrategy(What) -> throw({invalid_strategy, What}). +validStrategy(simple_one_for_one) -> true; +validStrategy(one_for_one) -> true; +validStrategy(one_for_all) -> true; +validStrategy(rest_for_one) -> true; +validStrategy(What) -> throw({invalid_strategy, What}). validIntensity(Max) when is_integer(Max), Max >= 0 -> true; -validIntensity(What) -> throw({invalid_intensity, What}). +validIntensity(What) -> throw({invalid_intensity, What}). validPeriod(Period) when is_integer(Period), Period > 0 -> true; validPeriod(What) -> throw({invalid_period, What}). -supname(self,Mod) -> {self(),Mod}; -supname(N,_) -> N. +supname(self, Mod) -> {self(), Mod}; +supname(N, _) -> N. %%% ------------------------------------------------------ %%% Check that the children start specification is valid. %%% Shall be a six (6) tuple %%% {Name, Func, RestartType, Shutdown, ChildType, Modules} %%% where Name is an atom -%%% Func is {Mod, Fun, Args} == {atom, atom, list} +%%% Func is {Mod, Fun, Args} == {atom(), atom(), list()} %%% RestartType is permanent | temporary | transient | %%% intrinsic | {permanent, Delay} | %%% {transient, Delay} | {intrinsic, Delay} %% where Delay >= 0 -%%% Shutdown = integer() | infinity | brutal_kill +%%% Shutdown = integer() > 0 | infinity | brutal_kill %%% ChildType = supervisor | worker %%% Modules = [atom()] | dynamic -%%% Returns: {ok, [#child]} | Error +%%% Returns: {ok, [child_rec()]} | Error %%% ------------------------------------------------------ check_startspec(Children) -> check_startspec(Children, []). @@ -1122,14 +1434,14 @@ check_childspec(Name, Func, RestartType, Shutdown, ChildType, Mods) -> validChildType(ChildType), validShutdown(Shutdown, ChildType), validMods(Mods), - {ok, #child{name = Name, mfa = Func, restart_type = RestartType, + {ok, #child{name = Name, mfargs = Func, restart_type = RestartType, shutdown = Shutdown, child_type = ChildType, modules = Mods}}. validChildType(supervisor) -> true; validChildType(worker) -> true; validChildType(What) -> throw({invalid_child_type, What}). -validName(_Name) -> true. +validName(_Name) -> true. validFunc({M, F, A}) when is_atom(M), is_atom(F), @@ -1152,13 +1464,13 @@ validDelay(What) -> throw({invalid_delay, What}). validShutdown(Shutdown, _) when is_integer(Shutdown), Shutdown > 0 -> true; -validShutdown(infinity, supervisor) -> true; +validShutdown(infinity, _) -> true; validShutdown(brutal_kill, _) -> true; validShutdown(Shutdown, _) -> throw({invalid_shutdown, Shutdown}). validMods(dynamic) -> true; validMods(Mods) when is_list(Mods) -> - lists:foreach(fun (Mod) -> + lists:foreach(fun(Mod) -> if is_atom(Mod) -> ok; true -> throw({invalid_module, Mod}) @@ -1232,15 +1544,18 @@ report_error(Error, Reason, Child, SupName) -> {offender, extract_child(Child)}], error_logger:error_report(supervisor_report, ErrorMsg). -shutdown_error_reporter(SupName) -> - fun(Reason, Child) -> - report_error(shutdown_error, Reason, Child, SupName) - end. +extract_child(Child) when is_list(Child#child.pid) -> + [{nb_children, length(Child#child.pid)}, + {name, Child#child.name}, + {mfargs, Child#child.mfargs}, + {restart_type, Child#child.restart_type}, + {shutdown, Child#child.shutdown}, + {child_type, Child#child.child_type}]; extract_child(Child) -> [{pid, Child#child.pid}, {name, Child#child.name}, - {mfa, Child#child.mfa}, + {mfargs, Child#child.mfargs}, {restart_type, Child#child.restart_type}, {shutdown, Child#child.shutdown}, {child_type, Child#child.child_type}]. diff --git a/test/supervisor2_tests.erl b/test/supervisor2_tests.erl new file mode 100644 index 0000000000..199c66eca0 --- /dev/null +++ b/test/supervisor2_tests.erl @@ -0,0 +1,75 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is GoPivotal, Inc. +%% Copyright (c) 2011-2015 Pivotal Software, Inc. All rights reserved. +%% + +-module(supervisor2_tests). +-behaviour(supervisor2). + +-export([test_all/0, start_link/0]). +-export([init/1]). + +test_all() -> + ok = check_shutdown(stop, 200, 200, 2000), + ok = check_shutdown(ignored, 1, 2, 2000). + +check_shutdown(SigStop, Iterations, ChildCount, SupTimeout) -> + {ok, Sup} = supervisor2:start_link(?MODULE, [SupTimeout]), + Res = lists:foldl( + fun (I, ok) -> + TestSupPid = erlang:whereis(?MODULE), + ChildPids = + [begin + {ok, ChildPid} = + supervisor2:start_child(TestSupPid, []), + ChildPid + end || _ <- lists:seq(1, ChildCount)], + MRef = erlang:monitor(process, TestSupPid), + [P ! SigStop || P <- ChildPids], + ok = supervisor2:terminate_child(Sup, test_sup), + {ok, _} = supervisor2:restart_child(Sup, test_sup), + receive + {'DOWN', MRef, process, TestSupPid, shutdown} -> + ok; + {'DOWN', MRef, process, TestSupPid, Reason} -> + {error, {I, Reason}} + end; + (_, R) -> + R + end, ok, lists:seq(1, Iterations)), + unlink(Sup), + MSupRef = erlang:monitor(process, Sup), + exit(Sup, shutdown), + receive + {'DOWN', MSupRef, process, Sup, _Reason} -> + ok + end, + Res. + +start_link() -> + Pid = spawn_link(fun () -> + process_flag(trap_exit, true), + receive stop -> ok end + end), + {ok, Pid}. + +init([Timeout]) -> + {ok, {{one_for_one, 0, 1}, + [{test_sup, {supervisor2, start_link, + [{local, ?MODULE}, ?MODULE, []]}, + transient, Timeout, supervisor, [?MODULE]}]}}; +init([]) -> + {ok, {{simple_one_for_one, 0, 1}, + [{test_worker, {?MODULE, start_link, []}, + temporary, 1000, worker, [?MODULE]}]}}. diff --git a/test/the_supervisor2_tests.erl b/test/the_supervisor2_tests.erl index a8540e00f5..91720a8e0a 100644 --- a/test/the_supervisor2_tests.erl +++ b/test/the_supervisor2_tests.erl @@ -71,6 +71,11 @@ frequent_crashes_run() -> ?assertNotEqual(undefined, find_child(Sup, c1)), + %% delayed restarts reset the restart counters; this makes it two; the + %% next shutdown should delay the restart again + sync_shutdown_child(find_child(Sup, c2), die), + ?assertNotEqual(undefined, find_child(Sup, c2)), + sync_shutdown_child(find_child(Sup, c1), die1), sync_shutdown_child(find_child(Sup, c2), die), ?assertEqual(undefined, find_child(Sup, c2)),