Skip to content

Commit

Permalink
Merge pull request #84 from nlesc-nano/dev
Browse files Browse the repository at this point in the history
miscellaneous scripts
  • Loading branch information
felipeZ committed May 19, 2021
2 parents 6ed5abd + 12855f3 commit 6cb80e6
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 21 deletions.
32 changes: 11 additions & 21 deletions scripts/extract_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
import pandas as pd

# Number of samples associated with each results directory name.
MAP_NAMES = {"11k": 11043, "1k": 1000, "2k": 2000, "500": 500, "5k": 5000, "7k": 7500}
# Model subdirectories that are searched inside every results directory.
MODELS = ("Fingerprints", "MPNN", "SE3Transformer")


def read_info(file_name: Path, column: str = "rvalue") -> np.ndarray:
"""Read the rvalues from the table."""
with open(file_name, 'r') as f:
data = f.readlines()

# Indices of the columns with the regression info
names = {"slope": 0, "intercept": 1, "rvalue": 2, "stderr": 3}
index = names[column]
Expand Down Expand Up @@ -53,11 +54,10 @@ def extract_subfolder(folder: Path, nfiles: int, nsamples: int) -> pd.DataFrame:
def extract_all_results(root: Path) -> None:
"""Extract all the rvalues from all the subfolders."""
folders = [p for p in root.iterdir() if p.is_dir()]
models = ("Fingerprints", "MPNN", "SE3Transformer")
results = {}
for directory in folders:
nsamples = MAP_NAMES[directory.name]
for model in models:
for model in MODELS:
workdir = directory / model / "Results"
nresults = count_results(workdir)
df = extract_subfolder(workdir, nresults, nsamples)
Expand All @@ -69,33 +69,23 @@ def extract_all_results(root: Path) -> None:

new_results = process_data(results)
plot_results_all_models(new_results)
for name, df in new_results.items():
plot_results_for_model(name, df)


def plot_results_all_models(results):
    """Draw one panel per column comparing the models and save ``scaling.png``.

    ``results`` is indexed by model name and each entry is indexed by column.

    NOTE(review): this span comes from a diff view and appears to interleave
    removed and added lines (two consecutive loops over the same columns,
    and the nesting of the trailing ``plt`` calls is not visible). Confirm
    against the repository before relying on the exact body.
    """
    # Create subplots to accommodate the results
    _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)

    # merge the models results
    columns = results["Fingerprints"].columns
    for k, c in enumerate(columns):
        data = pd.concat([df[c] for df in results.values()], axis=1)
        # Panel k goes to row k // 3, column k % 3 of the 5x3 grid
        sns.lineplot(data=data, ax=axis[k // 3][k % 3])
    names = results["Fingerprints"].columns
    for k, prop in enumerate(names):
        ax = axis[k // 3][k % 3]
        # One column per model so that each model gets its own line
        frames = pd.DataFrame({m: results[m][names[k]] for m in MODELS})
        obj = sns.lineplot(data=frames, markers=True, ax=ax)
        obj.set_title(prop)
    plt.xlabel("Number of samples")
    plt.ylabel("rvalue")
    plt.savefig("scaling.png")


def plot_results_for_model(name: str, df: pd.DataFrame):
    """Plot every column of ``df`` in its own panel and save the figure.

    The panels are laid out on a 5x3 grid, filled row by row, and the figure
    is written to ``{name}_scaling.png``.
    """
    # Grid of panels that hosts one line plot per column
    _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)

    for position, column in enumerate(df.columns):
        row, col = divmod(position, 3)
        sns.lineplot(data=df[column], ax=axis[row][col])

    plt.suptitle(f"Scaling for {name}", fontsize=14)
    plt.savefig(f"{name}_scaling.png")


def process_data(results):
new = {}
for k, df in results.items():
Expand Down
71 changes: 71 additions & 0 deletions scripts/plot_means.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.core.algorithms import mode
import seaborn as sns
import json
from pathlib import Path

# Model names: keys of the MSE results and part of the ``means_{model}_{n}.json`` file names.
MODELS = ("Fingerprints", "MPNN", "SE3Transformer")
# Sample-size labels: second part of the ``means_{model}_{n}.json`` file names.
NSAMPLES = ("11k", "1k", "2k", "500", "5k", "7k")
# Numeric value of each sample-size label; used as the row index of the plotted tables.
MAP_NAMES = {"11k": 11043, "1k": 1000, "2k": 2000, "500": 500, "5k": 5000, "7k": 7500}
# CSV file with the reference values that the predictions are compared against.
PATH_GROUND_TRUE = Path("data/Carboxylic_acids/CDFT/cdft_random_500.csv")
# JSON file where the mean squared errors are stored and later read for plotting.
MSE_FILE = "MSE.json"


def read_data():
    """Compute the mean squared error of every prediction file and store it.

    For each model in ``MODELS`` and each label in ``NSAMPLES`` the file
    ``means_{model}_{label}.json`` is compared, column by column, with the
    reference table (without its ``smiles`` column). The nested
    ``{model: {label: {column: mse}}}`` mapping is written to ``MSE_FILE``.
    """
    reference = pd.read_csv(PATH_GROUND_TRUE, index_col=0).drop("smiles", axis=1)

    def squared_error_means(model, label):
        # Sum of squared differences per column divided by the number of predictions
        predicted = pd.read_json(f"means_{model}_{label}.json")
        squared = (predicted - reference) ** 2
        return (squared.sum() / len(predicted)).to_dict()

    results = {
        model: {label: squared_error_means(model, label) for label in NSAMPLES}
        for model in MODELS
    }

    with open(MSE_FILE, 'w') as handle:
        json.dump(results, handle, indent=4)


def plot_data(model: str):
    """Plot the stored mean squared errors of all models and save ``MSE.png``.

    The values are read from ``MSE_FILE``; one panel is drawn per column, on a
    5x3 grid filled row by row, with one line per entry of ``MODELS``.
    The ``model`` argument is accepted but not read by this function.
    """
    with open(MSE_FILE, 'r') as handle:
        stored = json.load(handle)

    # One table per model, with rows ordered by their index
    tables = [pd.DataFrame(transpose_data(stored[m])).sort_index() for m in MODELS]

    properties = tables[0].columns
    _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)

    for position, prop in enumerate(properties):
        row, col = divmod(position, 3)
        # Gather the same column from every model so each one gets its own line
        per_model = pd.DataFrame({m: table[prop] for m, table in zip(MODELS, tables)})
        panel = sns.lineplot(data=per_model, markers=True, ax=axis[row][col])
        panel.set_title(prop)

    plt.savefig("MSE.png")


def transpose_data(data):
    """Swap the two levels of a ``{label: {name: value}}`` mapping.

    The result is ``{name: {MAP_NAMES[label]: value}}``. The set of names is
    taken from the entry stored under ``NSAMPLES[0]``.
    """
    transposed = {prop: {} for prop in data[NSAMPLES[0]].keys()}
    for label, values in data.items():
        size = MAP_NAMES[label]
        for prop, value in values.items():
            transposed[prop][size] = value

    return transposed


def main():
    """Plot the mean squared errors that are stored in ``MSE_FILE``."""
    # read_data()  # left disabled: it rewrites MSE_FILE before plotting
    first_model = MODELS[0]
    plot_data(first_model)


if __name__ == "__main__":
    main()
51 changes: 51 additions & 0 deletions scripts/remove_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import json

import pandas as pd
from pathlib import Path
from typing import List
from itertools import chain


def read_geometries(file_name: Path) -> List[str]:
    """Load the geometries stored as JSON in ``file_name`` and return them."""
    with open(file_name, 'r') as handle:
        return json.load(handle)


def _first_match(folder: Path, pattern: str) -> Path:
    """Return the first entry of ``folder`` matching ``pattern``.

    Raises ``FileNotFoundError`` instead of leaking a bare ``StopIteration``
    when nothing matches.
    """
    match = next(folder.glob(pattern), None)
    if match is None:
        raise FileNotFoundError(f"No file matching '{pattern}' found in '{folder}'")
    return match


def remove_duplicates(folders: List[str]):
    """Remove all duplicate smiles from the given folders.

    Each folder must contain a CSV file with a ``smiles`` column and a JSON
    file with one geometry per CSV row. The tables and the geometries are
    concatenated in the order of ``folders``, only the first occurrence of
    every smiles is kept, and the results are written to
    ``all_carboxylics.csv`` and ``all_geometries_carboxylics.json``.

    Raises
    ------
    ValueError
        If ``folders`` is empty or the number of geometries differs from the
        number of CSV rows.
    FileNotFoundError
        If a folder lacks a CSV or a JSON file.
    """
    if not folders:
        raise ValueError("At least one folder is required")

    paths = [Path(f) for f in folders]
    frames = [pd.read_csv(_first_match(p, "*csv"), index_col=0) for p in paths]
    smiles = pd.concat(frames).reset_index(drop=True)
    geometries = list(
        chain.from_iterable(read_geometries(_first_match(p, "*json")) for p in paths)
    )

    # Rows and geometries are paired by position, so a length mismatch would
    # silently attach geometries to the wrong smiles.
    if len(geometries) != len(smiles):
        raise ValueError(
            f"Found {len(geometries)} geometries but {len(smiles)} smiles; "
            "they must match one to one"
        )

    # remove duplicates, remembering which positions survive
    unique = smiles.drop_duplicates(subset=["smiles"])
    indices = unique.index.to_list()
    new_smiles = unique.reset_index(drop=True)
    new_geometries = [geometries[i] for i in indices]

    new_smiles.to_csv("all_carboxylics.csv")
    with open("all_geometries_carboxylics.json", 'w') as f:
        json.dump(new_geometries, f)

    # Show the last surviving pair as a quick visual check (skipped when empty)
    if new_geometries:
        print(new_geometries[-1])
        print(new_smiles.iloc[-1])


def main():
    """Parse the command line and merge the data of the requested folders.

    ``-f/--folders`` takes one or more folder names and is mandatory: without
    it argparse prints a usage message and exits, rather than letting the
    script fail later on a missing value.
    """
    parser = argparse.ArgumentParser()
    # configure the command line arguments
    parser.add_argument(
        "-f", "--folders", help="folders to look for the data", nargs="+", required=True
    )
    args = parser.parse_args()
    remove_duplicates(args.folders)


if __name__ == "__main__":
    main()

0 comments on commit 6cb80e6

Please sign in to comment.