In [1]:
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# %                   SCF-TB - PROXY APPLICATION                      %
# %                   A.M.N. Niklasson, M. Kulichenko. T1, LANL       %
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# % Total Energy Function:                                            %
# % E = 2Tr[H0(D-D0)] + (1/2)*sum_i U_i q_i^2 +                       %
# %      + (1/2)sum_{i,j (i!=j)} q_i C_{ij} q_j - Efield*dipole       %
# % dipole = sum_i R_{i} q_i                                          %
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
from dftorch.DM_Fermi_x import DM_Fermi_x
import torch
import numpy as np
import sys
import os
### path to PYSEQM ###
sys.path.insert(1, "/home/maxim/Projects/git2/PYSEQM_dev/")
#from seqm.seqm_functions.read_xyz import read_xyz
import scipy.io as sio
import math
import pandas as pd
import importlib
#import matplotlib.pyplot as plt

import dftorch
#import importlib
import dftorch.CoulombMatrix
#importlib.reload(dftorch.CoulombMatrix)
from dftorch.CoulombMatrix import CoulombMatrix_vectorized, Ewald_Real_Space_vectorized
from dftorch.SCF import SCF, SCF_adaptive_mixing, SCFx

from dftorch.H0andS import H0_and_S_vectorized, H0_and_S_vectorized_OLD_FOR_POLY
from dftorch.Constants import Constants, ConstantsTest
from dftorch.nearestneighborlist import vectorized_nearestneighborlist
from dftorch.Energy import Energy, EnergyShadow
from dftorch.Tools import fractional_matrix_power_symm
from dftorch.Forces import Forces, ForcesShadow
from dftorch.BondIntegral import *
from dftorch.Tools import ordered_pairs_from_TYPE, calculate_dist_dips
from dftorch.Structure import Structure
from dftorch.DM_Fermi import DM_Fermi
from dftorch.Fermi_PRT import Canon_DM_PRT, Fermi_PRT
from dftorch.Kernel_Fermi import Kernel_Fermi
from dftorch.MD import initialize_velocities, kernel_update_lr

from dftorch.io import read_xyz_traj_data, read_xyz, write_XYZ_trajectory
import time
#import matplotlib.colors as mcolors

from dftorch.RepulsiveSpline import get_repulsion_energy
torch.set_default_dtype(torch.float32)
print(torch.cuda.memory_allocated() / 1e9, 'GB')
import sedacs

0.0 GB


In [2]:
from sedacs.ewald import calculate_PME_ewald, init_PME_data, calculate_alpha_and_num_grids, ewald_energy
from sedacs.neighbor_list import NeighborState, calculate_displacement

In [3]:
#torch._logging.set_logs(graph_code=True)
torch._dynamo.config.capture_dynamic_output_shape_ops = True

#H0_and_S_compiled = torch.compile(H0_and_S_vectorized, fullgraph=False, dynamic=True)


In [4]:
%%time
# Initial data, load atoms and coordinates, etc in COORD.dat
device = 'cuda'
#const = Constants('/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/mio-1-1/mio-1-1').to(device)
const = Constants('/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/ptbp/complete_set').to(device)
Efield = 0*0.3*torch.tensor([-.3,0.4,0.0], device=device).T # In arbitrary direction  Works ony in 0-field!!!


# species, coordinates = read_xyz(['3HDP_Ni.xyz'], sort=False) #Input coordinate file
# LBox = torch.tensor([60,61,62], device=device)

# species, coordinates = read_xyz(['protein.xyz'], sort=False) #Input coordinate file
# LBox = torch.tensor([100,100,100], device=device)

#species, coordinates = read_xyz(['COORD.xyz'], sort=False) #Input coordinate file
#LBox = torch.tensor([9,4,4], device=device)
#LBox = torch.tensor([20,20,20], device=device)
# LBox = torch.tensor([30,30,30], device=device)

species, coordinates = read_xyz(['water_30.xyz'], sort=False) #Input coordinate file
LBox = torch.tensor([30,30,30], device=device)



req_grad = False
TYPE = torch.tensor(species[0], dtype=torch.int64, device=device)
RX = torch.tensor(coordinates[0,:,0], device=device, dtype=torch.get_default_dtype(),
                    requires_grad=req_grad)
RY = torch.tensor(coordinates[0,:,1], device=device, dtype=torch.get_default_dtype(),
                    requires_grad=req_grad)
RZ = torch.tensor(coordinates[0,:,2], device=device, dtype=torch.get_default_dtype(),
                    requires_grad=req_grad)
R_tensor, R_orb, coeffs_tensor, R_rep_tensor, rep_splines_tensor = get_skf_tensors(TYPE, const)
const.set_params()

structure = Structure(TYPE, RX, RY, RZ, LBox, const, charge = 0, device=device)
structure.Te = 3000.0                       # Some electronic temperature in Kelvin, Possible bug at high tempertures!!!

structure.RX = (structure.RX) % structure.LBox[0]; 
structure.RY = (structure.RY) % structure.LBox[1];
structure.RZ = (structure.RZ) % structure.LBox[2];


# but first the neighborlist


nrnnlist, nndist, nnRx, nnRy, nnRz, nnType, nnStruct, nrnnStruct, \
        neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type  = \
        vectorized_nearestneighborlist(structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.LBox,
        8.0, structure.Nats, const, upper_tri_only=False, remove_self_neigh=False, min_image_only=False, verbose=True); # 8 for...
        
# Get Hamiltonian, Overlap, atomic DM = D0 (vector only), etc, 
D0, H0, dH0, S, dS = H0_and_S_vectorized(
                                structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.Nats,
                                structure.diagonal, structure.H_INDEX_START,structure.H_INDEX_END, structure.Znuc,
                                nnRx, nnRy, nnRz, nnType,
                                const, neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type,
                                R_orb, coeffs_tensor,
                                verbose=True)
#del nrnnlist, nndist, nnRx, nnRy, nnRz, nnType, nnStruct, nrnnStruct, neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type


Vr, dVr = get_repulsion_energy(
                        R_rep_tensor, rep_splines_tensor,
                        structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.LBox, 6.0, structure.Nats, 
                        const,verbose=True);



/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/ptbp/complete_set/H-H.skf
/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/ptbp/complete_set/H-O.skf
/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/ptbp/complete_set/O-H.skf
/home/maxim/Projects/DFTB/DFTorch/tests/sk_orig/ptbp/complete_set/O-O.skf




  t <neighbor list> 0.1 s

H0_and_S
  Do H off-diag
  t <dR and pair mask> 0.0 s

  t <SKF> 0.0 s

  Do H and S
  t <H and S> 0.1 s

  t <D0> 0.2 s

H0_and_S t 0.3 s

  t <neighbor list> 0.0 s

CPU times: user 3.12 s, sys: 275 ms, total: 3.4 s
Wall time: 1.24 s


In [5]:
%%time
dftorch_params = {
	'coul_method': 'PME',
	'Coulomb_acc': 1e-5, # coulomb accuracy for full coulomb calcs or t_err for PME
	'cutoff': 10.0, # coulomb cutoff
	'PME_order': 4,
				}

#with torch.no_grad():
H, Hcoul, Hdipole, KK, D, q, f, mu0, Ecoul, forces1, dq_p1 = SCFx(dftorch_params, structure, D0,
			H0, S, Efield, None,
			structure.RX, structure.RY, structure.RZ,
			structure.Nocc, structure.Hubbard_U, structure.Znuc, structure.Nats, structure.Te,
			alpha=0.5, MaxRank=30, start_Krylov=3,
			acc=1e-6, FelTol=1e-6, MAX_ITER=10, debug=False)

Etot,Eband0, Ecoul ,Edipole,S_ent = Energy(H0, structure.Hubbard_U, Efield, D0, dq_p1, D, q,
                                            structure.RX, structure.RY, structure.RZ, f, structure.Te) # Energy calculation - 2*Te*S_ent

del Hdipole
Etot+Vr ,Eband0,Ecoul,Edipole,S_ent, Vr

### Do SCF ###




torch.Size([3, 2652]) torch.Size([3])


W1116 18:30:34.251665 2141608 site-packages/torch/fx/experimental/symbolic_shapes.py:6679] [3/0] failed during evaluate_expr(Abs(zuf1 - 3536.0) > 1.0e-10, hint=None, size_oblivious=False, forcing_spec=False
E1116 18:30:34.253609 2141608 site-packages/torch/fx/experimental/recording.py:299] [3/0] failed while running evaluate_expr(*(Abs(zuf1 - 3536.0) > 1.0e-10, None, False, False), **{})


  Initial DM_Fermi

Starting cycle
Iter 1
  Hcoul 0.6 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
Res = 13.989272118, dEb = 70938.484375000, dEc = 1093.848144531, t = 1.5 s

Iter 2
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
Res = 3.468567133, dEb = 253.984375000, dEc = 455.570312500, t = 0.9 s

Iter 3
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
Res = 0.941493034, dEb = 38.296875000, dEc = 87.200195312, t = 0.9 s

Iter 4
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s


W1116 18:30:40.148687 2141608 site-packages/torch/_inductor/utils.py:1250] [5/0] Not enough SMs to use max_autotune_gemm mode
  check(


rank: 0 0.00830891914665699
rank: 1 0.001861787517555058
rank: 2 0.0002225761563749984
rank: 3 3.560906043276191e-05
rank: 4 9.993384992412757e-06
rank: 5 1.6907029021240305e-06
rank: 6 3.1383288501274365e-07
Res = 0.265531301, dEb = 98255.765625000, dEc = 23.096191406, t = 5.8 s

Iter 5
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
rank: 0 1.9876420992659405e-05
rank: 1 4.290845481591532e-06
rank: 2 9.28567033042782e-07
Res = 0.000111901, dEb = 4.017578125, dEc = 8.602783203, t = 2.0 s

Iter 6
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
rank: 0 1.7983969883061945e-05
rank: 1 4.110352165298536e-06
rank: 2 9.22394349345268e-07
Res = 0.000098142, dEb = 0.000000000, dEc = 0.001464844, t = 2.0 s

Iter 7
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
rank: 0 1.7813388694776222e-05
rank: 1 4.244232059136266e-06
rank: 2 9.348163985123392e-07
Res = 0.000099772, dEb = 0.001953125, dEc = 0.000488281, t = 2.0 s

Iter 8
  Hcoul 0.0 s
  DM_Fermi 0.8 s
  Z@Dorth@Z.T 0.0 s
rank: 0 1.6

(tensor(-96879., device='cuda:0'),
 tensor(-98317.8594, device='cuda:0'),
 tensor(519.3777, device='cuda:0'),
 tensor(-0., device='cuda:0'),
 tensor(6.8267e-06, device='cuda:0'),
 tensor(919.5223, device='cuda:0'))

In [6]:
Z = fractional_matrix_power_symm(S, -0.5)
with torch.no_grad():
    #Ftot, _, _, _, _, _, _, _ = \
    Ftot, Fcoul, Fband0, Fdipole, FPulay, FScoul, FSdipole, Frep = \
        Forces( H, H0, S, Z, dq_p1,
            D, D0,
            dH0, dS, None, dVr,
            Efield, structure.Hubbard_U, q,
            structure.RX, structure.RY, structure.RZ,
            structure.Nats, structure.H_INDEX_START, structure.H_INDEX_END, const, structure.TYPE)
    
Ftot = Ftot + forces1
Ftot.abs().max()

tensor(8.9842, device='cuda:0')

In [None]:
# Initial BC for n
torch.manual_seed(0)
#Fermi_PRT_compiled = torch.compile(Fermi_PRT, fullgraph=False, dynamic=False)
#kernel_update_lr_compiled  = torch.compile(kernel_update_lr, fullgraph=False, dynamic=False)
kernel_update_lr_compiled  = kernel_update_lr
cuda_sync = True
MaxRank = 15
FelTol = 1e-4;
F2V = 0.01602176487/1.660548782;
MVV2KE = 166.0538782/1.602176487;
KE2T = 1/0.000086173435;
fric = 0.0;

CALPHA, grid_dimensions = calculate_alpha_and_num_grids(structure.lattice_vecs.cpu().numpy(), dftorch_params['cutoff'], dftorch_params['Coulomb_acc'])
PME_data = init_PME_data(grid_dimensions, structure.lattice_vecs, CALPHA, dftorch_params['PME_order'])


temperature_K = 200.0

n = q; n_0 = q; n_1 = q; n_2 = q; n_3 = q; n_4 = q; n_5 = q;
mu_0 = mu0; mu_1 = mu0; mu_2 = mu0; mu_3 = mu0; mu_4 = mu0; mu_5 = mu0;
C0 = -6; C1 = 14; C2 = -8; C3 = -3; C4 = 4; C5 = -1; # Coefficients for modified Verlet integration
kappa = 1.82; alpha = 0.018;                         # Coefficients for modified Verlet integration
dt = 0.25;                              # Time step in fs
#VX = 0*structure.RX; VY = 0*structure.RX; VZ = 0*structure.RX;        # Initialize velocities
VX, VY, VZ = initialize_velocities(structure, temperature_K=temperature_K, remove_com=True, rescale_to_T=True, remove_angmom=True)
KK0 = KK;
K0Res = KK@(q-n);

atom_ids = torch.repeat_interleave(torch.arange(len(structure.n_orbitals_per_atom), device=S.device), structure.n_orbitals_per_atom) # Generate atom index for each orbital
E_array = []
T_array = []
Ek_array = []
Ep_array = []
Res_array = []

EPOT = Etot+Vr
for MD_step in range(5):  # MAIN MD LOOP

	if cuda_sync: torch.cuda.synchronize()

	#del D0, H0, dH0, S, dS, Vr, dVr, H, D
	start_time = time.perf_counter()
	print("########## Step = {:} ##########".format(MD_step, ))
	

	# OUTPUTS FOR SHADOW MD SIMULATIONS
	EKIN = 0.5*MVV2KE*torch.sum(structure.Mnuc*(VX**2+VY**2+VZ**2))           # Kinetic energy in eV (MVV2KE: unit conversion)
	Temperature = (2/3)*KE2T*EKIN/structure.Nats              # Statistical temperature in Kelvin
	Energ = EKIN + EPOT;                                # Total Energy in eV, Total energy fluctuations Propto dt^2
	Time = MD_step*dt;
	ResErr = torch.norm(q-n)/(structure.Nats**0.5)                      # ResErr Propto dt^2
	
	E_array.append(Energ.item())
	T_array.append(Temperature.item())
	Ek_array.append(EKIN.item())
	Ep_array.append(EPOT.item())
	Res_array.append(ResErr.item())

	if MD_step%5 == 0:
		comm_string = f"Etot = {Energ:.6f} eV, Epot = {EPOT:.6f} eV, Ekin = {EKIN:.6f} eV, T = {Temperature:.2f} K, Res = {ResErr:.6f}, mu = {mu0:.4f} eV\n"
		write_XYZ_trajectory('solvated_trj.xyz', structure, comm_string, step=MD_step)

	VX = VX + 0.5*dt*(F2V*Ftot[0]/structure.Mnuc) - fric*VX;      # First 1/2 of Leapfrog step
	VY = VY + 0.5*dt*(F2V*Ftot[1]/structure.Mnuc) - fric*VY;      # F2V: Unit conversion
	VZ = VZ + 0.5*dt*(F2V*Ftot[2]/structure.Mnuc) - fric*VZ;      # -c*V c>0 => Fricition
	#del Ftot

	# update positions and translate coordinates if go beyond box
	# Apply periodic boundary conditions
	structure.RX = (structure.RX + dt*VX) % structure.LBox[0]; 
	structure.RY = (structure.RY + dt*VY) % structure.LBox[1];
	structure.RZ = (structure.RZ + dt*VZ) % structure.LBox[2];

	if cuda_sync: torch.cuda.synchronize()
	tic2 = time.perf_counter()
	tic2_1 = time.perf_counter()

	nrnnlist, nndist, nnRx, nnRy, nnRz, nnType, nnStruct, nrnnStruct, \
			neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type  = \
	vectorized_nearestneighborlist(structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.LBox,
			8.0, structure.Nats, const, upper_tri_only=False, remove_self_neigh=False, min_image_only=False, verbose=False); # 8 for...
	
	print('nnType', nnType.shape)
			
	# Get Hamiltonian, Overlap, atomic DM = D0 (vector only), etc, 
	D0, H0, dH0, S, dS = H0_and_S_vectorized(
									structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.Nats,
									structure.diagonal, structure.H_INDEX_START,structure.H_INDEX_END, structure.Znuc,
									nnRx, nnRy, nnRz, nnType,
									const, neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type,
									R_orb, coeffs_tensor,
									verbose=False)
	#del nrnnlist, nndist, nnRx, nnRy, nnRz, nnType, nnStruct, nrnnStruct, neighbor_I, neighbor_J, IJ_pair_type, JI_pair_type

	if cuda_sync: torch.cuda.synchronize()
	print("2_1: {:.3f} s".format(time.perf_counter()-tic2_1))
	tic2_1 = time.perf_counter()

		
	Z = fractional_matrix_power_symm(S, -0.5)
	n = 2*n_0 - n_1 - kappa*K0Res + alpha*(C0*n_0+C1*n_1+C2*n_2+C3*n_3+C4*n_4+C5*n_5)
	n_5 = n_4; n_4 = n_3; n_3 = n_2; n_2 = n_1; n_1 = n_0; n_0 = n

	# Get full Coulomb matrix. In principle we do not need an explicit representation of the Coulomb matrix C!

	nbr_state = NeighborState(torch.stack((structure.RX, structure.RY, structure.RZ)), structure.lattice_vecs, None, dftorch_params['cutoff'], is_dense=True, buffer=0.0, use_triton=True)
	disps, dists, nbr_inds = calculate_dist_dips(torch.stack((structure.RX, structure.RY, structure.RZ)), nbr_state, dftorch_params['cutoff'])

	_, forces1, CoulPot =  calculate_PME_ewald(torch.stack((structure.RX, structure.RY, structure.RZ)),
	                    n,
	                    structure.lattice_vecs,
	                    nbr_inds,
	                    disps,
	                    dists,
	                    CALPHA,
	                    dftorch_params['cutoff'],
	                    PME_data,
	                	hubbard_u = structure.Hubbard_U,
	                    atomtypes = structure.TYPE,
	                    screening = 1,
	                    calculate_forces=1,
	                    calculate_dq=1,
	                )
	#Ecoul = ewald_e1 + 0.5 * torch.sum(n**2 * structure.Hubbard_U)
	

	
	if cuda_sync: torch.cuda.synchronize()
	print("2_2: {:.3f} s".format(time.perf_counter()-tic2_1))
	tic2_1 = time.perf_counter()
	
	#CoulPot = C @ n
	Hcoul_diag = structure.Hubbard_U[atom_ids] * n[atom_ids] + CoulPot[atom_ids]        
	Hcoul = 0.5 * (Hcoul_diag.unsqueeze(1) * S + S * Hcoul_diag.unsqueeze(0))
	H = H0 + Hcoul
	#Dorth,Q,e,f,mu0 = DM_Fermi_x((Z.T @ H @ Z).to(torch.float64), structure.Te, structure.Nocc, mu_0=None, m=18, eps=1e-9, MaxIt=50)
	Dorth,Q,e,f,mu0 = DM_Fermi_x((Z.T @ H @ Z), structure.Te, structure.Nocc, mu_0=None, m=18, eps=1e-9, MaxIt=50, debug=False)
	Dorth = Dorth.to(torch.get_default_dtype())
	D = Z @ Dorth @ Z.T
	DS = 2 * (D * S.T).sum(dim=1)  # same as DS = 2 * torch.diag(D @ S)
	q = -1.0 * structure.Znuc
	q.scatter_add_(0, atom_ids, DS) # sums elements from DS into q based on number of AOs, e.g. x4 p orbs for carbon or x1 for hydrogen
	#del Hcoul_diag, Hcoul, Dorth, DS

	q, H, Q, D, e, f, mu0 = calc_q(H0, structure.Hubbard_U[atom_ids], n[atom_ids], CoulPot[atom_ids],
								S, Z, structure.Te, structure.Nocc, structure.Znuc, atom_ids)

	if cuda_sync: torch.cuda.synchronize()
	print("2_3: {:.3f} s".format(time.perf_counter()-tic2_1))

	
	# Update Kernel %%

	if cuda_sync: torch.cuda.synchronize()
	print("2 HAM: {:.3f} s".format(time.perf_counter()-tic2))
	tic3 = time.perf_counter()

	NoRank = False;
	do_full_kernel = False
	Res = q - n


	if MD_step%1000 == 0 and do_full_kernel:
		###
		KK,_ = Kernel_Fermi(structure, mu0,structure.Te,structure.Nats,H,C,S,Z,Q,e)
		KK0 = KK;
		K0Res = KK0@Res;
		###
		1
	elif NoRank:
		#K0Res = KK0*Res
		KK0 = -0.2
		K0Res = KK0*Res
	else: # Preconditioned Low-Rank Krylov SCF acceleration
		K0Res = kernel_update_lr_compiled(structure, MaxRank, KK0, Res, q, FelTol, S,Z,
					 nbr_inds,disps,dists,CALPHA,dftorch_params,PME_data,
					 atom_ids,
					 Q, e, mu0)
	
	
	if cuda_sync: torch.cuda.synchronize()
	print("3 KER: {:.3f} s".format(time.perf_counter()-tic3))
	tic4 = time.perf_counter()

	
	Etot,Eband0,Ecoul,Edipole,S_ent = EnergyShadow(H0, structure.Hubbard_U, Efield, D0, CoulPot, D, q, n,	
                                            structure.RX, structure.RY, structure.RZ, f, structure.Te) # Energy calculation - 2*Te*S_ent
	
	

	Vr, dVr = get_repulsion_energy(
                        R_rep_tensor, rep_splines_tensor,
                        structure.TYPE, structure.RX, structure.RY, structure.RZ, structure.LBox, 6.0, structure.Nats,
                        const,verbose=False);
	
	if cuda_sync: torch.cuda.synchronize()
	tic4_3 = time.perf_counter()


	EPOT = Etot+Vr
	Ftot, _, Fband0, Fdipole, FPulay, FScoul, FSdipole, Frep = \
    	ForcesShadow( H, H0, S, Z, CoulPot,
            D, D0,
            dH0, dS, None, dVr,
            Efield, structure.Hubbard_U, q, n,
            structure.RX, structure.RY, structure.RZ,
            structure.Nats, structure.H_INDEX_START, structure.H_INDEX_END, const, structure.TYPE, verbose=False)
	
	Fcoul = forces1 * (2*q/n - 1.0)
	Ftot = Ftot + Fcoul
	if cuda_sync: torch.cuda.synchronize()
	print("4_3: {:.3f} s".format(time.perf_counter()-tic4_3))


	VX = VX + 0.5*dt*(F2V*Ftot[0]/structure.Mnuc) - fric*VX;      # Integrate second 1/2 of leapfrog step
	VY = VY + 0.5*dt*(F2V*Ftot[1]/structure.Mnuc) - fric*VY;      # - c*V  c > 0 => friction
	VZ = VZ + 0.5*dt*(F2V*Ftot[2]/structure.Mnuc) - fric*VZ;      
	print("ETOT = {:.8f}, EPOT = {:.8f}, EKIN = {:.8f}, T = {:.8f}, ResErr = {:.6f}, t = {:.1f} s\n".format(Energ, EPOT.item(), EKIN.item(),  Temperature.item(), ResErr.item(), time.perf_counter()-start_time ))
	print(torch.cuda.memory_allocated() / 1e9, 'GB\n')
	#del Q, Z


	if cuda_sync: torch.cuda.synchronize()
	print("4 F AND E: {:.3f} s".format(time.perf_counter()-tic4))
	#tic4 = time.perf_counter()


########## Step = 0 ##########
  t <neighbor list> 0.0 s

nnType torch.Size([2652, 233])
H0_and_S t 0.2 s

2_1: 0.277 s
torch.Size([3, 2652]) torch.Size([3])
2_2: 0.915 s
2_3: 0.929 s
2 HAM: 2.121 s


  check(


rank: 0, Fel = 0.027037
rank: 1, Fel = 0.004459
rank: 2, Fel = 0.000907
rank: 3, Fel = 0.000247
rank: 4, Fel = 0.000050
3 KER: 4.297 s
  t <neighbor list> 0.0 s

4_3: 1.265 s
ETOT = -96810.49218750, EPOT = -96878.55468750, EKIN = 68.50780487, T = 199.84896851, ResErr = 0.000000, t = 7.9 s

1.777565696 GB

4 F AND E: 1.335 s
########## Step = 1 ##########
  t <neighbor list> 0.0 s

nnType torch.Size([2652, 233])
H0_and_S t 0.2 s

2_1: 0.279 s
torch.Size([3, 2652]) torch.Size([3])
2_2: 0.825 s
2_3: 0.940 s
2 HAM: 2.044 s
rank: 0, Fel = 0.020684
rank: 1, Fel = 0.003041
rank: 2, Fel = 0.000502
rank: 3, Fel = 0.000124
rank: 4, Fel = 0.000026
3 KER: 1.575 s
  t <neighbor list> 0.0 s

4_3: 0.402 s
ETOT = -96810.29687500, EPOT = -96877.89062500, EKIN = 68.25820160, T = 199.12083435, ResErr = 0.004836, t = 4.1 s

1.777584128 GB

4 F AND E: 0.479 s
########## Step = 2 ##########
  t <neighbor list> 0.0 s

nnType torch.Size([2652, 233])
H0_and_S t 0.2 s

2_1: 0.281 s
torch.Size([3, 2652]) torch.S

In [None]:
ETOT = -96810.46474241, EPOT = -96878.67250706, EKIN = 68.45170116, T = 199.68528477, ResErr = 0.003606, t = 73.6 s
ETOT = -96810.44920009, EPOT = -96878.32649882, EKIN = 68.22330697, T = 199.01901998, ResErr = 0.002764, t = 85.8 s


In [8]:
-96899.94531250 + 68.61985779

-96831.32545471

In [9]:
-96899.95312500 + 68.61968994

-96831.33343506

In [None]:
lattice_vecs_np = np.array([[1.0, 0.0, 0.0],
                             [0.0, 1.0, 0.0],
                             [0.0, 0.0, 1.0]], dtype=np.float32) * 30
lattice_vecs = torch.from_numpy(lattice_vecs_np).to(device)

t_err = 1e-5

PME_order = 6

cutoff = 12.0

CALPHA, grid_dimensions = calculate_alpha_and_num_grids(lattice_vecs_np, cutoff, t_err)
PME_data = init_PME_data(grid_dimensions, lattice_vecs, CALPHA, PME_order)

nbr_state = NeighborState(torch.stack((structure.RX, structure.RY, structure.RZ)), lattice_vecs, None, cutoff, is_dense=True, buffer=0.0, use_triton=False)
disps, dists, nbr_inds = calculate_dist_dips(torch.stack((structure.RX, structure.RY, structure.RZ)), nbr_state, cutoff)

Ra = torch.stack((RX.unsqueeze(-1), RY.unsqueeze(-1), RZ.unsqueeze(-1)))
Rb = torch.stack((nnRx, nnRy, nnRz))
Rab = Rb - Ra
alpha, CALPHA, PME_data

torch.Size([3, 2652]) torch.Size([3])




(0.2741119282287687,
 0.28275585101729633,
 ([60, 60, 60],
  tensor([[[0.0000e+00, 4.6099e-03, 7.5618e-04,  ..., 1.6651e-04,
            7.5618e-04, 4.6099e-03],
           [4.6099e-03, 2.0029e-03, 5.2567e-04,  ..., 1.3022e-04,
            5.2567e-04, 2.0029e-03],
           [7.5618e-04, 5.2567e-04, 2.1557e-04,  ..., 6.5725e-05,
            2.1557e-04, 5.2567e-04],
           ...,
           [1.6651e-04, 1.3022e-04, 6.5725e-05,  ..., 2.3518e-05,
            6.5725e-05, 1.3022e-04],
           [7.5618e-04, 5.2567e-04, 2.1557e-04,  ..., 6.5725e-05,
            2.1557e-04, 5.2567e-04],
           [4.6099e-03, 2.0029e-03, 5.2567e-04,  ..., 1.3022e-04,
            5.2567e-04, 2.0029e-03]],
  
          [[4.6099e-03, 2.0029e-03, 5.2567e-04,  ..., 1.3022e-04,
            5.2567e-04, 2.0029e-03],
           [2.0029e-03, 1.1603e-03, 3.8065e-04,  ..., 1.0287e-04,
            3.8065e-04, 1.1603e-03],
           [5.2567e-04, 3.8065e-04, 1.6651e-04,  ..., 5.3032e-05,
            1.6651e-04, 3.8065e

In [None]:
ewald_e1, forces1, dq_p1 =  calculate_PME_ewald(torch.stack((structure.RX, structure.RY, structure.RZ)),
                    q,
                    lattice_vecs,
                    nbr_inds,
                    disps,
                    dists,
                    CALPHA,
                    cutoff,
                    PME_data,
                	hubbard_u = structure.Hubbard_U,
                    atomtypes = structure.TYPE,
                    screening = 1,
                    calculate_forces=1,
                    calculate_dq=1,
                )
ewald_e1 + 0.5 * torch.sum(q**2 * structure.Hubbard_U), Ecoul

(tensor(515.1045, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(515.1074, device='cuda:0'))

In [23]:
-0.1%4

3.9

In [15]:
with torch.no_grad():
	
    #Ftot, _, _, _, _, _, _, _ = \
    Ftot_, Fcoul_, Fband0_, Fdipole_, FPulay_, FScoul_, FSdipole_, Frep_ = \
        Forces( H, H0, S, dq_p1,
            D, D0,
            dH0, dS, None, dVr,
            Efield, structure.Hubbard_U, q,
            structure.RX, structure.RY, structure.RZ,
            structure.Nats, structure.H_INDEX_START, structure.H_INDEX_END, const, structure.TYPE, verbose=True)
Ftot.abs().max()

Skipping Fcoul, done in PME.
Doing FScoul
Doing Fband0
Doing Pulay
Doing Fdipole
Doing FSdipole
Doing Repulsion
Forces t 0.8 s



tensor(8.9876, device='cuda:0')

In [20]:
(FScoul - FScoul_).abs().max(), (Fcoul - forces1).abs().max()

(tensor(6.5207e-05, device='cuda:0'), tensor(1.2316e-05, device='cuda:0'))

In [None]:
# PME Coulomb energy diagnostic (unified parameters)
ewald_e_PME, forces_PME, dq_PME = calculate_PME_ewald(
    torch.stack((structure.RX, structure.RY, structure.RZ)),
    q,
    lattice_vecs,
    nnType,
    Rab,
    nndist,
    CALPHA,
    cutoff,
    PME_data,
    hubbard_u=structure.Hubbard_U,
    atomtypes=structure.TYPE,
    screening=1,
    calculate_forces=1
)

ewald_e_PME + 0.5 * torch.sum(q**2 * structure.Hubbard_U), Ecoul

W1114 13:20:25.461464 1932791 site-packages/torch/_inductor/utils.py:1250] [3/1] Not enough SMs to use max_autotune_gemm mode


(tensor(515.1045, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(515.1072, device='cuda:0'))

In [21]:
CoulPot = C @ q
FScoul = torch.zeros((3, Nats), dtype=dtype, device=device)
factor = (U * q + CoulPot)*2
dS_times_D = D*dS*factor[atom_ids].unsqueeze(-1)
dDS_XYZ_row_sum = torch.sum(dS_times_D, dim = 2) # sum of elements in each row
FScoul.scatter_add_(1, atom_ids.expand(3, -1), dDS_XYZ_row_sum) # sums elements from DS into q based on number of AOs, e.g. x4 p orbs for carbon or x1 for hydrogen
dDS_XYZ_col_sum = torch.sum(dS_times_D, dim=1)
FScoul.scatter_add_(1, atom_ids.expand(3, -1), -dDS_XYZ_col_sum)


NameError: name 'Nats' is not defined

In [29]:
C.dim(), CoulPot.dim()

(2, 1)

tensor(0.0001, device='cuda:0')

In [18]:
(Fcoul - forces1).abs().max()

tensor(1.2383e-05, device='cuda:0')