Deadlock during long-haul stress test #79

cgillum · 2022-04-20T05:59:13Z

The long-haul stress test running on v1.0.0-rc shows various deadlocks in the warning logs. Looking at the local SQL Server, all the deadlocks seem to have the same cause. Here is an example:

<deadlock>
 <victim-list>
  <victimProcess id="process1f720001088" />
 </victim-list>
 <process-list>
  <process id="process1f720001088" taskpriority="0" logused="0" waitresource="PAGE: 5:1:250251 " waittime="104" ownerId="33084415" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.373" XDES="0x1f7200a8428" lockMode="U" schedulerid="5" kpid="34052" status="suspended" spid="83" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.373" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084415" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._LockNextOrchestration" line="25" stmtstart="1784" stmtend="3092" sqlhandle="0x03000500f01ab84a4ef3a30078ae000001000000000000000000000000000000000000000000000000000000">
UPDATE TOP (1) Instances WITH (READPAST)
    SET
        [LockedBy] = @LockedBy,
	    [LockExpiration] = @LockExpiration,
        @instanceID = I.[InstanceID],
        @parentInstanceID = I.[ParentInstanceID],
        @version = I.[Version]
    FROM 
        dt.Instances I WITH (READPAST) INNER JOIN NewEvents E WITH (READPAST) ON
            E.[TaskHub] = @TaskHub AND
            E.[InstanceID] = I.[InstanceID]
    WHERE
        I.TaskHub = @TaskHub AND
        I.[RuntimeStatus] IN ('Pending', 'Running') AND
	    (I.[LockExpiration] IS NULL OR I.[LockExpiration] &lt; @now) AND
        (E.[VisibleTime] IS NULL OR E.[VisibleTime] &lt; @now    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1253579504]   </inputbuf>
  </process>
  <process id="process1f713fb4108" taskpriority="0" logused="1392" waitresource="KEY: 5:72057594043826176 (aa52126f61e5)" waittime="104" ownerId="33084446" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.380" XDES="0x1f70df48428" lockMode="X" schedulerid="8" kpid="53204" status="suspended" spid="51" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.380" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084446" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._CheckpointOrchestration" line="219" stmtstart="16706" stmtend="17362" sqlhandle="0x03000500293fac4b51f3a30078ae000001000000000000000000000000000000000000000000000000000000">
DELETE E
    OUTPUT DELETED.InstanceID, DELETED.SequenceNumber
    FROM dt.NewEvents E WITH (FORCESEEK(PK_NewEvents(TaskHub, InstanceID, SequenceNumber)))
        INNER JOIN @DeletedEvents D ON 
            D.InstanceID = E.InstanceID AND
            D.SequenceNumber = E.SequenceNumber AND
            E.TaskHub = @TaskHu    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1269579561]   </inputbuf>
  </process>
 </process-list>
 <resource-list>
  <pagelock fileid="1" pageid="250251" dbid="5" subresource="FULL" objectname="DurableDB.dt.Instances" id="lock1f717da2b00" mode="IX" associatedObjectId="72057594043629568">
   <owner-list>
    <owner id="process1f713fb4108" mode="IX" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f720001088" mode="U" requestType="wait" />
   </waiter-list>
  </pagelock>
  <keylock hobtid="72057594043826176" dbid="5" objectname="DurableDB.dt.NewEvents" indexname="PK_NewEvents" id="lock1f70fd18f00" mode="U" associatedObjectId="72057594043826176">
   <owner-list>
    <owner id="process1f720001088" mode="S" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f713fb4108" mode="X" requestType="convert" />
   </waiter-list>
  </keylock>
 </resource-list>
</deadlock>

Here is the visual:

The deadlocked transactions are automatically retried and the orchestrations continue to run successfully, but this does create delays and also creates noise in the logs. Ideally there should be no deadlocks at all during the long-haul stress test.

bhugot · 2024-05-08T05:06:30Z

@cgillum you should try this again with last fix

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Deadlock during long-haul stress test #79

Deadlock during long-haul stress test #79

cgillum commented Apr 20, 2022

bhugot commented May 8, 2024

Deadlock during long-haul stress test #79

Deadlock during long-haul stress test #79

Comments

cgillum commented Apr 20, 2022

bhugot commented May 8, 2024