In [36]:
# Constructs a quantum circuit g with parameters θ, then differentiates the recursive algorithm given in Section 5.1 of https://arxiv.org/abs/1112.2184 to obtain the gradient of p_θ(x) wrt θ, where x is a measurement of g|0>. The differentiation takes polynomial time due to memoization.
# We then compare our results to the finite difference gradient

using Yao, FLOYao
using LinearAlgebra

# Problem size: `nq` qubits, `layers` repetitions of the rotation pattern below.
nq = 10
layers = 2

# Build the parametrized circuit. Every layer consists of
#   1. XX rotations on each nearest-neighbour pair (i, i+1),
#   2. XY rotations on each nearest-neighbour pair (i, i+1),
#   3. an Rz rotation on every single qubit.
# All angles start at zero and are overwritten with random values afterwards.
g = chain(nq)
for _ in 1:layers
    for (left, right) in ((X, X), (X, Y))
        for i in 1:nq-1
            push!(g, rot(kron(nq, i => left, i+1 => right), 0.0))
        end
    end
    for i in 1:nq
        push!(g, put(nq, i => Rz(0.0)))
    end
end

# Draw every circuit parameter uniformly from [0, 2π) and load it into g.
nparams = nparameters(g)
p = rand(nparams) .* 2π
dispatch!(g, p)

# Side length of the matrices used by the update routines (two rows/columns per qubit).
dim = 2 * nq
println("number of parameters: ", nparams)

# Infix alias for the Kronecker product.
⊗ = kron

"""
    covariance_matrix(reg::MajoranaReg)

Return `reg.state * G * reg.state'`, where `G` is the block-diagonal matrix made of
`nqubits(reg)` copies of the 2×2 block `[0 1; -1 0]`.
"""
function covariance_matrix(reg::MajoranaReg)
    n = nqubits(reg)
    block = [0 1; -1 0]
    symplectic = kron(I(n), block)
    R = reg.state
    return R * symplectic * R'
end

"""
    majoranaindices2kron(nq, i, j)

Build the signed Pauli string on `nq` qubits that represents the product of the
Majorana operators with indices `i` and `j` (`i ≠ j`).

Majorana index `m` lives on qubit `(m + 1) ÷ 2`. The string is

- a single `Z` when both indices sit on the same qubit,
- otherwise `X`/`Y` on the lower qubit (chosen by the parity of the smaller index),
  `Z` on every qubit strictly in between, and `Y`/`X` on the upper qubit (chosen by
  the parity of the larger index).

The result is antisymmetric: `majoranaindices2kron(nq, i, j)` equals
`-majoranaindices2kron(nq, j, i)`.

NOTE(review): the overall phase/sign convention (which multiple of γ_i γ_j this
Hermitian string equals) is inherited unchanged from the original code for `i < j`
and has not been re-derived here — confirm against the FLOYao Majorana convention.

# Arguments
- `nq`: total number of qubits passed on to `kron`.
- `i`, `j`: distinct Majorana indices.

# Throws
- `ArgumentError` if `i == j`.
"""
function majoranaindices2kron(nq, i, j)
    i == j && throw(ArgumentError("Majorana indices must differ, got i = j = $i"))

    lo, hi = minmax(i, j)
    qlo = (lo + 1) ÷ 2   # qubit carrying the smaller index
    qhi = (hi + 1) ÷ 2   # qubit carrying the larger index
    factors = Pair{Int}[]

    if qlo == qhi
        # Both indices on one qubit (lo odd, hi = lo + 1): a single Z, sign +1.
        sign = 1
        push!(factors, qlo => Z)
    else
        sign = (lo % 2 == hi % 2) ? 1 : -1
        # The Pauli on each end is decided by the parity of the index that actually
        # lives on that qubit, so the string is the same for (i, j) and (j, i).
        if lo % 2 == 0
            push!(factors, qlo => X)
        else
            push!(factors, qlo => Y)
            sign = -sign
        end
        # Every qubit strictly between the two ends contributes a Z and a sign flip.
        for k in (qlo + 1):(qhi - 1)
            push!(factors, k => Z)
            sign = -sign
        end
        push!(factors, qhi => (hi % 2 == 0 ? Y : X))
    end

    # Swapping the two operators flips the sign.
    if i > j
        sign = -sign
    end
    return sign * kron(nq, factors...)
end

"""
    majorana_commutator(nq, i, j)

Wrap `majoranaindices2kron(nq, i, j)` in an `Add` block. According to the original
author's note, the `Add` wrapper is what lets the operator be passed to Yao's
`expect'`. No additional numeric factor is applied here: the returned operator is
exactly the Pauli string produced by `majoranaindices2kron`.
"""
majorana_commutator(nq, i, j) = Add(majoranaindices2kron(nq, i, j))

# Reference (slow) implementation: for i = 1, ..., nq fill `probabilities[i]` and
# `grad_probabilities[i, :]`. The notebook's printouts describe entry i as
# p_θ(x_i|x_1,...,x_{i-1}) and its gradient with respect to θ. The original author
# used this routine to cross-check `update_opt!`.
#
# Arguments
#   reg                 initial MajoranaReg; the global circuit `g` is applied to it via `apply`
#   theta               parameter vector loaded into the global circuit `g` with `dispatch!`
#   b                   measurement outcome; b[i] is used as the exponent of (-1)
#   temp_m              NOTE: rebound to a fresh matrix at i == 1, so the array passed in by
#                       the caller is neither read nor written
#   temp_grad_m         dim×dim matrix whose entries are gradient vectors; filled at i == 1
#                       and modified in every later step
#   cur_m, cur_grad_m   NOTE: rebound to deep copies before any read, so the arrays passed in
#                       by the caller are never used
#   probabilities       written at index i
#   grad_probabilities  written at row i (one column per circuit parameter)
#
# Besides its arguments the function reads the globals `nq`, `dim`, `nparams` and `g`, and it
# mutates `g` through `dispatch!`. It prints the wall-clock time of every iteration and the
# total; these are differences of `time()` values, i.e. seconds.
function update!(reg::MajoranaReg, theta, b, temp_m, temp_grad_m, cur_m, cur_grad_m, probabilities, grad_probabilities)
    t_tot = 0
    for i in 1:nq
        t = time()
        if i > 1
            # Snapshot the state left by step i-1 so that every update below reads the old values.
            cur_m = deepcopy(temp_m)
            cur_grad_m = deepcopy(temp_grad_m)
            cur_prob = deepcopy(probabilities[i-1])
            cur_grad_prob = deepcopy(grad_probabilities[i-1, :])
            ni = b[i-1] # outcome bit of the previous step
            # Gradient update (upper triangle, mirrored with a sign flip). This is the quotient-rule
            # derivative of the matrix update further down: d(A*B / (2*prob)) =
            # (-dprob*A*B + prob*(dA*B + A*dB)) / (2*prob^2), with A and B taken from rows
            # 2*(i-1)-1 and 2*(i-1) of the snapshot.
            for p in 1:dim
                for q in p+1:dim
                    temp_grad_m[p,q] .-= (-1)^ni * ((-cur_grad_prob * cur_m[2*(i-1)-1,p] * cur_m[2*(i-1),q]) .+ (cur_prob * (cur_grad_m[2*(i-1)-1,p]*cur_m[2*(i-1),q] .+ cur_m[2*(i-1)-1,p] * cur_grad_m[2*(i-1),q]))) / (2*cur_prob^2)
                    temp_grad_m[p,q] .+= (-1)^ni * ((-cur_grad_prob * cur_m[2*(i-1)-1,q] * cur_m[2*(i-1),p]) .+ (cur_prob * (cur_grad_m[2*(i-1)-1,q]*cur_m[2*(i-1),p] .+ cur_m[2*(i-1)-1,q] * cur_grad_m[2*(i-1),p]))) / (2*cur_prob^2)
                    temp_grad_m[q,p] = -temp_grad_m[p,q] 
                end
            end
            # Diagonal gradient entries are reset to zero vectors.
            for p in 1:dim
                temp_grad_m[p,p] = zeros(nparams)
            end
            # Matrix update: subtract (-1)^ni * (M[r1,p]*M[r2,q] - M[r1,q]*M[r2,p]) / (2*prob), where
            # r1 = 2*(i-1)-1 and r2 = 2*(i-1) are the two rows of the previous step, M is the
            # snapshot and prob is the previous step's probability. The lower triangle is the
            # negated mirror image.
            for p in 1:dim
                for q in p+1:dim
                    temp_m[p,q] -= (-1)^ni * (cur_m[2*(i-1)-1,p] * cur_m[2*(i-1),q]) / (2*cur_prob)
                    temp_m[p,q] += (-1)^ni * (cur_m[2*(i-1)-1,q] * cur_m[2*(i-1),p]) / (2*cur_prob)
                    temp_m[q,p] = -temp_m[p,q]
                end
            end
            for p in 1:dim
                temp_m[p,p] = 0.0
            end
            # Probability of the current step and its gradient, read from entry (2i-1, 2i).
            ni = b[i]
            probabilities[i] = (1+(-1)^ni * temp_m[2*i-1, 2*i]) / 2
            grad_probabilities[i, :] = (-1)^ni * temp_grad_m[2*i-1, 2*i] / 2
        else
            # First step: load theta into g and start from the covariance matrix of the evolved register.
            dispatch!(g, theta)
            temp_m = covariance_matrix(apply(reg, g))
            ni = b[i]
            probabilities[i] = (1+(-1)^ni * temp_m[2*i-1, 2*i]) / 2
            # Initial gradient of every off-diagonal entry: second component of expect'(ham, reg => g)
            # for the operator built by majorana_commutator; the lower triangle is the negated mirror.
            for p in 1:dim
                for q in p+1:dim
                    ham = majorana_commutator(nq, p, q)
                    temp_grad_m[p,q] = expect'(ham, reg => g)[2]
                    temp_grad_m[q,p] = -temp_grad_m[p,q]
                end
            end
            for p in 1:dim
                temp_grad_m[p,p] = zeros(nparams)
            end
            grad_probabilities[i, :] = (-1)^ni * temp_grad_m[2*i-1, 2*i] / 2
        end
        # Per-iteration timing report (seconds).
        diff = time() - t
        t_tot += diff
        println("iteration $i: $diff")
    end
    println("total time: $t_tot")
end 

# Optimized counterpart of `update!`: for i = 1, ..., nq fill `probabilities[i]` and
# `grad_probabilities[:, i]` (described by the notebook's printouts as p_θ(x_i|x_1,...,x_{i-1})
# and its gradient with respect to θ).
#
# Differences from `update!` that are visible in the code:
#   * temp_grad_m is a 3-D array indexed [parameter, p, q]; grad_probabilities is indexed
#     [parameter, i];
#   * only entries with p >= 2*(i-1)+1 and q > p are updated (upper triangle of the part
#     that has not been processed yet); the diagonal and the lower triangle are never touched;
#   * no snapshot copies are taken. Each step reads rows 2*(i-1)-1 and 2*(i-1) and writes only
#     rows >= 2*(i-1)+1, so the rows being read are not modified within the step.
#
# Arguments
#   reg                 initial MajoranaReg; the global circuit `g` is applied to it via `apply`
#   theta               parameter vector loaded into the global circuit `g` with `dispatch!`
#   b                   measurement outcome; b[i] is used as the exponent of (-1)
#   temp_m              NOTE: rebound to a fresh matrix at i == 1, so the array passed in by
#                       the caller is neither read nor written
#   temp_grad_m         gradient array, updated in place
#   probabilities       written at index i
#   grad_probabilities  written at column i
#
# Besides its arguments the function reads the globals `nq`, `dim` and `g`, and it mutates `g`
# through `dispatch!`. It prints the wall-clock time of every iteration and the total; these
# are differences of `time()` values, i.e. seconds.
function update_opt!(reg::MajoranaReg, theta, b, temp_m, temp_grad_m, probabilities, grad_probabilities)
    t_tot = 0
    for i in 1:nq
        t = time()
        if i > 1
            ni = b[i-1] # outcome bit of the previous step
            cur_prob = probabilities[i-1]
            cur_grad_prob = grad_probabilities[:, i-1] # slicing makes a copy
            # Loop-invariant prefactors of the matrix update and of its quotient-rule gradient.
            cur_prefactor = (-1)^ni / (2*cur_prob)
            cur_grad_prefactor = (-1)^ni / (2*cur_prob^2)
            # The gradient update must run before the matrix update below because it needs the
            # temp_m values from before this step.
            for p in 2*(i-1)+1:dim
                for q in p+1:dim
                    temp_grad_m[:,p,q] .-= cur_grad_prefactor * ((-cur_grad_prob * temp_m[2*(i-1)-1,p] * temp_m[2*(i-1),q]) .+ (cur_prob * (temp_grad_m[:,2*(i-1)-1,p]*temp_m[2*(i-1),q] .+ temp_m[2*(i-1)-1,p] * temp_grad_m[:,2*(i-1),q])))
                    temp_grad_m[:,p,q] .+= cur_grad_prefactor * ((-cur_grad_prob * temp_m[2*(i-1)-1,q] * temp_m[2*(i-1),p]) .+ (cur_prob * (temp_grad_m[:,2*(i-1)-1,q]*temp_m[2*(i-1),p] .+ temp_m[2*(i-1)-1,q] * temp_grad_m[:,2*(i-1),p])))
                end
            end
            # Matrix update: subtract prefactor * (M[r1,p]*M[r2,q] - M[r1,q]*M[r2,p]) with
            # r1 = 2*(i-1)-1 and r2 = 2*(i-1).
            for p in 2*(i-1)+1:dim
                for q in p+1:dim
                    temp_m[p,q] -= cur_prefactor * (temp_m[2*(i-1)-1,p] * temp_m[2*(i-1),q])
                    temp_m[p,q] += cur_prefactor * (temp_m[2*(i-1)-1,q] * temp_m[2*(i-1),p])
                end
            end
            # Probability of the current step and its gradient, read from entry (2i-1, 2i).
            ni = b[i]
            probabilities[i] = (1+(-1)^ni * temp_m[2*i-1, 2*i]) / 2
            grad_probabilities[:, i] = (-1)^ni * temp_grad_m[:,2*i-1, 2*i] / 2
        else
            # First step: load theta into g and start from the covariance matrix of the evolved register.
            dispatch!(g, theta)
            temp_m = covariance_matrix(apply(reg, g))
            ni = b[i]
            probabilities[i] = (1+(-1)^ni * temp_m[2*i-1, 2*i]) / 2
            # Initial gradients of the upper-triangular entries: second component of
            # expect'(ham, reg => g) for the operator built by majorana_commutator.
            for p in 1:dim
                for q in p+1:dim
                    ham = majorana_commutator(nq, p, q) 
                    temp_grad_m[:,p,q] = expect'(ham, reg => g)[2]
                end
            end
            grad_probabilities[:, i] = (-1)^ni * temp_grad_m[:,2*i-1, 2*i] / 2
        end
        # Per-iteration timing report (seconds).
        diff = time() - t
        t_tot += diff
        println("iteration $i: $diff")
    end
    println("total time: $t_tot")
end

"""
    log_grad(reg, theta, b, temp_m, temp_grad_m, cur_m, cur_grad_m, probabilities, grad_probabilities)

Run `update!` and return the tuple `(probabilities, s)`, where `s` is the sum over
`i = 1, ..., nqubits(reg)` of `grad_probabilities[i, :] / probabilities[i]`.

With `probabilities[i]` being the i-th conditional probability (as the notebook's
printouts describe it), `s` is the gradient of log p_θ(b) with respect to `theta`.

`reg` is the initial register (for example `FLOYao.zero_state(nq)`), `theta` the circuit
parameters and `b` the measurement outcome. The remaining arguments are work buffers
that are handed straight to `update!`; `probabilities` and `grad_probabilities` are
filled by that call and the returned vector is the very `probabilities` array passed in.
"""
function log_grad(reg::MajoranaReg, theta, b, temp_m, temp_grad_m, cur_m, cur_grad_m, probabilities, grad_probabilities)
    nsteps = nqubits(reg)
    update!(reg, theta, b, temp_m, temp_grad_m, cur_m, cur_grad_m, probabilities, grad_probabilities)
    # Accumulate the per-step terms from left to right, starting from a zero vector.
    score = foldl(1:nsteps; init = zeros(length(theta))) do acc, k
        acc + grad_probabilities[k, :] / probabilities[k]
    end
    return probabilities, score
end

"""
    log_grad_opt(reg, theta, b, temp_m, temp_grad_m, probabilities, grad_probabilities)

Run `update_opt!` and return the tuple `(probabilities, s)`, where `s` is the sum over
`i = 1, ..., nq` (the global qubit count) of `grad_probabilities[:, i] / probabilities[i]`.

With `probabilities[i]` being the i-th conditional probability (as the notebook's
printouts describe it), `s` is the gradient of log p_θ(b) with respect to `theta`.

`reg` is the initial register (for example `FLOYao.zero_state(nq)`), `theta` the circuit
parameters and `b` the measurement outcome. The remaining arguments are work buffers
that are handed straight to `update_opt!`; note that here `grad_probabilities` has one
column per step. The returned vector is the very `probabilities` array passed in.
"""
function log_grad_opt(reg::MajoranaReg, theta, b, temp_m, temp_grad_m, probabilities, grad_probabilities)
    update_opt!(reg, theta, b, temp_m, temp_grad_m, probabilities, grad_probabilities)
    # Accumulate the per-step terms from left to right, starting from a zero vector.
    score = foldl(1:nq; init = zeros(length(theta))) do acc, k
        acc + grad_probabilities[:, k] / probabilities[k]
    end
    return probabilities, score
end

# Evolve |0...0> with the circuit and draw a single measurement outcome from it.
reg = apply(FLOYao.zero_state(nq), g)
bitstr = measure(reg, nshots = 1)[1] #Random measurement of g|0>
println("measured outcome: $bitstr")
# Reference value from FLOYao.bitstring_probability (taken as known-correct); the product of
# the conditional probabilities computed below is compared against this number.
println("probability of measuring the above outcome: ", FLOYao.bitstring_probability(reg, bitstr))

T = Float64 #Can also be BigFloat, may experiment with other data types later
println("data type used in calculations: $T")
# The per-iteration timings are differences of time() values, i.e. seconds.
println("note: the time (s) taken for 'iteration i' refers to the time required for the algorithm to compute p_θ(x_i|x_1,...x_{i-1}) and ∇_θ(p_θ(x_i|x_1,...x_{i-1}))")

#Initializing temporary matrices and vectors used in the unoptimal version of the algorithm.
# Layout expected by update!: temp_grad_m / cur_grad_m hold one gradient vector per matrix
# entry, grad_probabilities has one row per step and one column per parameter.
temp_m = Matrix{T}(undef, dim, dim)
temp_grad_m = Matrix{Vector{T}}(undef, dim, dim)
cur_m = Matrix{T}(undef, dim, dim)
cur_grad_m = Matrix{Vector{T}}(undef, dim, dim)
probabilities = Vector{T}(undef, nq)
grad_probabilities = Matrix{T}(undef, nq, nparams)

# Run the reference algorithm: basic_prob[i] is p_θ(x_i|x_1,...x_{i-1}), basic is ∇_θ(log(p_θ(x))).
basic_prob, basic = log_grad(FLOYao.zero_state(nq), p, bitstr, temp_m, temp_grad_m, cur_m, cur_grad_m, probabilities, grad_probabilities)
println("The ith entry in the following vector is p_θ(x_i|x_1,...x_{i-1})")
println(basic_prob)
println("the product of all entries in the above vector, should match the earlier probability computed using FLOYao.bitstring_probability: ", prod(basic_prob))
println("The following vector is ∇_θ(log(p_θ(x))), evaluated at x = measured outcome")
println(basic)

number of parameters: 56
measured outcome: 1001001111 ₍₂₎
probability of measuring the above outcome: 0.01191862824838763386814286332458357258516754616498600101682417871875535006253499
data type used in calculations: Float64
note: the time (μs) taken for 'iteration i' refers to the time required for the algorithm to compute p_θ(x_i|x_1,...x_{i-1}) and ∇_θ(p_θ(x_i|x_1,...x_{i-1}))
iteration 1: 0.15009498596191406
iteration 2: 0.025625944137573242
iteration 3: 0.030073165893554688
iteration 4: 0.043564796447753906
iteration 5: 0.02081298828125
iteration 6: 0.02620410919189453
iteration 7: 0.08911585807800293
iteration 8: 0.021461009979248047
iteration 9: 0.023405075073242188
iteration 10: 0.027287960052490234
total time: 0.45764589309692383
The ith entry in the following vector is p_θ(x_i|x_1,...x_{i-1})
[0.43899921201363395, 0.7197308827565566, 0.8849153744764573, 0.9313605633823401, 0.6357015089123733, 0.4741299671024828, 0.5988245052535328, 0.5018248677965925, 0.5053245744598883, 1.0]

In [37]:
# Fresh work buffers for the optimized algorithm. Compared with the basic version the
# parameter index comes first: temp_grad_m is nparams × dim × dim and grad_probabilities
# is nparams × nq.
temp_m = Array{T,2}(undef, dim, dim)
temp_grad_m = Array{T,3}(undef, nparams, dim, dim)
probabilities = Array{T,1}(undef, nq)
grad_probabilities = Array{T,2}(undef, nparams, nq)

# Run the optimized algorithm: optimized_prob[i] is p_θ(x_i|x_1,...x_{i-1}) and
# optimized is ∇_θ(log(p_θ(x))).
optimized_prob, optimized = log_grad_opt(
    FLOYao.zero_state(nq), p, bitstr,
    temp_m, temp_grad_m, probabilities, grad_probabilities,
)

# Both implementations are expected to agree exactly, hence the `==` comparisons.
println("probabilities equal? ", basic_prob == optimized_prob)
println("grad(log p) equal? ", basic == optimized)
println("The following vector is ∇_θ(log(p_θ(x))), evaluated at x = measured outcome, as outputted by the optimized version of the algorithm.")
println(optimized)

iteration 1: 0.02676987648010254
iteration 2: 0.011457204818725586
iteration 3: 0.013464927673339844
iteration 4: 0.00638890266418457
iteration 5: 0.004697084426879883
iteration 6: 0.003204822540283203
iteration 7: 0.0020678043365478516
iteration 8: 0.0011260509490966797
iteration 9: 0.0004799365997314453
iteration 10: 0.00011515617370605469
total time: 0.06977176666259766
probabilities equal? true
grad(log p) equal? true
The following vector is ∇_θ(log(p_θ(x))), evaluated at x = measured outcome, as outputted by the optimized version of the algorithm.
[0.10092087087038887, 0.6798065966875954, -0.8790270448936839, -0.3738030143218094, -1.0064255329407354, 0.0897390759476247, 0.6428281329802226, -0.5102809159693512, 0.1666703496042421, 0.3425785939350287, 0.2998530874976961, 0.15699192715233562, -0.20129316787105253, 0.6841024613863167, 0.8715144194581189, -1.1026588984555359, -0.8140235221857535, -0.6178184346962323, -0.3650457820334253, 0.5422172300569149, -0.20094336103114047, 0.2506

In [40]:
# Show the probabilities left in the buffer by the previous run, then run the optimized
# algorithm a second time on the same (already used) buffers and check that the gradient
# is reproduced exactly.
println(probabilities)
a, b = log_grad_opt(
    FLOYao.zero_state(nq), p, bitstr,
    temp_m, temp_grad_m, probabilities, grad_probabilities,
)
println(b == optimized)
# Trailing expression: the notebook displays the gradient vector.
b

[0.43899921201363395, 0.7197308827565566, 0.8849153744764573, 0.9313605633823401, 0.6357015089123733, 0.4741299671024828, 0.5988245052535328, 0.5018248677965925, 0.5053245744598883, 1.0]
iteration 1: 0.1521899700164795
iteration 2: 0.01124882698059082
iteration 3: 0.010317087173461914
iteration 4: 0.009216070175170898
iteration 5: 0.00569605827331543
iteration 6: 0.012009143829345703
iteration 7: 0.0022809505462646484
iteration 8: 0.0022170543670654297
iteration 9: 0.0005359649658203125
iteration 10: 0.00016117095947265625
total time: 0.2058722972869873
true


56-element Vector{Float64}:
  0.10092087087038887
  0.6798065966875954
 -0.8790270448936839
 -0.3738030143218094
 -1.0064255329407354
  0.0897390759476247
  0.6428281329802226
 -0.5102809159693512
  0.1666703496042421
  0.3425785939350287
  ⋮
  2.33959419824486e-17
  7.836062459860908e-18
  7.748419334825296e-18
 -6.959245189227767e-17
 -1.122311017586188e-16
 -7.955072706288029e-17
  9.344145672980282e-17
 -3.793836463263715e-17
 -2.3036548444059646e-17

In [39]:
#Comparison with finite difference method
using LinearAlgebra

"""
    prob(theta, x)

Return p_θ(x): the probability, as computed by `FLOYao.bitstring_probability`, of the
outcome `x` when measuring the state obtained by applying the global circuit `g` with
parameters `theta` to `FLOYao.zero_state(nq)`. The global `g` is not modified, since
the non-mutating `dispatch` is used.
"""
function prob(theta, x)
    evolved = apply(FLOYao.zero_state(nq), dispatch(g, theta))
    return FLOYao.bitstring_probability(evolved, x)
end

# Default step size of the finite-difference approximation below.
eps_default = 1e-8

"""
    fe_grad_prob(theta, x, eps = eps_default)

Central finite-difference approximation of ∇_θ log(p_θ(x)) at `theta`, used as an
independent check of the analytic gradient.

For every parameter index `k` the result holds
`(log(prob(θ + eps*e_k, x)) - log(prob(θ - eps*e_k, x))) / (2*eps)`.

# Arguments
- `theta`: parameter vector at which the gradient is approximated; it is not modified.
- `x`: measurement outcome passed through to `prob`.
- `eps`: step size.

# Returns
A `Vector{Float64}` with one entry per element of `theta`.
"""
function fe_grad_prob(theta, x, eps = eps_default)
    shifted = copy(theta)
    fe_grad = Vector{Float64}(undef, length(theta))
    for k in eachindex(theta, fe_grad)
        original = theta[k]
        shifted[k] = original + eps
        logp_plus = log(prob(shifted, x))
        shifted[k] = original - eps
        logp_minus = log(prob(shifted, x))
        fe_grad[k] = (logp_plus - logp_minus) / (2 * eps)
        # Restore the exact original value. Adding and subtracting eps does not undo itself
        # in floating point and would leave a rounding residue in all later evaluations.
        shifted[k] = original
    end
    return fe_grad
end

# Print the gradient from the algorithm, then the finite-difference estimate, then the
# Euclidean distance between the two.
for item in ("algorithm output for ∇_θ(log(p_θ(x))), should be exact", basic)
    println(item)
end
fe = fe_grad_prob(p, bitstr)
for item in ("finite difference approximation to ∇_θ(log(p_θ(x)))", fe)
    println(item)
end
println("l2 distance between algorithm output and finite difference approximation: ")
println(norm(basic .- fe))

algorithm output for ∇_θ(log(p_θ(x))), should be exact
[0.10092087087038887, 0.6798065966875954, -0.8790270448936839, -0.3738030143218094, -1.0064255329407354, 0.0897390759476247, 0.6428281329802226, -0.5102809159693512, 0.1666703496042421, 0.3425785939350287, 0.2998530874976961, 0.15699192715233562, -0.20129316787105253, 0.6841024613863167, 0.8715144194581189, -1.1026588984555359, -0.8140235221857535, -0.6178184346962323, -0.3650457820334253, 0.5422172300569149, -0.20094336103114047, 0.2506014418193112, 0.49392408176587993, 0.24738072943675124, 0.2076654179644484, 0.88475735802792, -0.04133039591419144, -0.46146339623851057, 0.3726015648629357, -0.3458845857460146, 0.3999608840330077, -0.15708729754609427, 1.0737047027763378, -0.3226742601067036, -0.8996184113515739, -0.2119756032504698, 0.7716418528189506, 0.5275690019640422, 0.3728833377411375, 0.2764913560869765, 0.16449803034350383, -0.882895287105675, -0.07440799568162802, -0.14367267895169733, -0.9093893800475488, 0.024588078240