In [1]:
import pandas as pd
import os
import opendatasets as od
from tqdm import tqdm
from pathlib import Path
import shutil
import random
from coulumb_utils.xyz import read_xyz, parse_xyz_file, create_dataframe_from_xyz
from coulumb_utils.calculate_coulumb_matrix import calculate_coulomb_matrix
from coulumb_utils.sort_molecules import sort_by_atomic_number, sort_by_row_norm
from coulumb_utils.normalize import normalize_min_max, log_normalize
from coulumb_utils.eigen_padd import compute_eigenvalues, padd_eig, padd_matrix
from coulumb_utils.standardize import standardize_matrix
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.ticker import MaxNLocator,ScalarFormatter
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from sklearn.linear_model import Ridge
from xgboost import plot_importance

# Download dataset

The QM9 molecules dataset is available on Kaggle: https://www.kaggle.com/datasets/mariovozza5/qm9-molecules

In [10]:

# Kaggle URL of the QM9 molecules dataset.
dataset = "https://www.kaggle.com/datasets/mariovozza5/qm9-molecules"

# Only fetch the archive when the local "qm9-molecules" folder is missing,
# so re-running this cell does not trigger another download.
if not os.path.exists("qm9-molecules"):
    od.download(dataset)

Dataset URL: https://www.kaggle.com/datasets/mariovozza5/qm9-molecules
Downloading qm9-molecules.zip to ./qm9-molecules


100%|██████████| 130M/130M [00:17<00:00, 7.87MB/s] 





# Load xyz files

In [2]:
# Locations of the downloaded dataset and of the folder passed to the loader.
work_path = Path("./qm9-molecules/")
data_path = work_path / "data"

# Second argument handed to the loader.
# NOTE(review): presumably the number of files to read — confirm against
# create_dataframe_from_xyz.
subset_size = 1000

# Build the dataframe, then order its rows by the 'index' column
# (ascending is the pandas default).
df = create_dataframe_from_xyz(data_path, subset_size)
df_sorted = df.sort_values(by='index')

# Preview the first ten rows.
df_sorted.head(10)

100%|██████████| 1000/1000 [00:00<00:00, 32804.91it/s]


Unnamed: 0,filename,n_atoms,index,A,B,C,mu,alpha,homo,lumo,gap,R2,zpve,U0,U,H,G,Cv
579,qm9_272,18,272,6.8197,1.72205,1.47522,1.3839,60.25,-0.2613,0.0807,0.342,850.8528,0.164714,-272.837559,-272.829406,-272.828462,-272.869568,28.734
835,qm9_637,16,637,6.01713,2.59419,2.00512,1.5102,56.58,-0.2585,0.0791,0.3376,658.5353,0.142089,-271.619912,-271.612845,-271.611901,-271.650485,25.76
211,qm9_1014,9,1014,9.83231,2.03584,1.68662,2.7733,52.24,-0.2763,-0.0623,0.214,648.9718,0.057354,-338.273162,-338.268017,-338.267072,-338.302227,17.914
700,qm9_1017,12,1017,9.0876,1.91529,1.58189,2.834,65.16,-0.2054,0.0167,0.2222,727.2033,0.091471,-286.24688,-286.240878,-286.239934,-286.276357,23.043
429,qm9_1067,14,1067,7.43399,1.95952,1.57102,1.735,59.38,-0.1898,0.0312,0.221,760.656,0.114923,-324.589539,-324.583091,-324.582147,-324.619783,23.926
924,qm9_1100,17,1100,3.2778,2.35057,1.96982,2.5591,60.44,-0.2405,-0.0104,0.2301,764.3556,0.144105,-346.883049,-346.874108,-346.873164,-346.91654,31.978
581,qm9_1132,21,1132,4.06437,1.54353,1.36761,1.0249,71.72,-0.2419,0.0805,0.3224,1021.1314,0.191749,-312.131398,-312.121927,-312.120983,-312.165172,34.217
313,qm9_1303,16,1303,3.46915,2.57533,2.20806,2.4567,61.5,-0.2609,0.0323,0.2932,703.5227,0.136267,-325.785262,-325.777909,-325.776964,-325.816273,27.018
722,qm9_1314,17,1314,3.23345,2.60446,2.10715,1.0048,60.09,-0.2515,0.0751,0.3266,720.2405,0.146617,-346.844016,-346.836105,-346.835161,-346.875318,29.716
57,qm9_1341,19,1341,4.9025,1.71312,1.55301,0.965,68.75,-0.2406,0.0932,0.3339,897.6244,0.169391,-310.901659,-310.893126,-310.892182,-310.933912,31.118
