Merge pull request #84 from nlesc-nano/dev

miscellaneous scripts
nlesc-nano · May 19, 2021 · 6cb80e6 · 6cb80e6
2 parents 6ed5abd + 12855f3
commit 6cb80e6
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 21 deletions.
diff --git a/scripts/extract_results.py b/scripts/extract_results.py
@@ -12,13 +12,14 @@
 import pandas as pd
 
 MAP_NAMES = {"11k": 11043, "1k": 1000, "2k": 2000, "500": 500, "5k": 5000, "7k": 7500}
+MODELS = ("Fingerprints", "MPNN", "SE3Transformer")
 
 
 def read_info(file_name: Path, column: str = "rvalue") -> np.ndarray:
     """Read the rvalues from the table."""
     with open(file_name, 'r') as f:
         data = f.readlines()
-    
+
     # Indices of the columns with the regression info
     names = {"slope": 0, "intercept": 1, "rvalue": 2, "stderr": 3}
     index = names[column]
@@ -53,11 +54,10 @@ def extract_subfolder(folder: Path, nfiles: int, nsamples: int) -> pd.DataFrame:
 def extract_all_results(root: Path) -> None:
     """Extract all the rvalues from all the subfolders."""
     folders = [p for p in root.iterdir() if p.is_dir()]
-    models = ("Fingerprints", "MPNN", "SE3Transformer")
     results = {}
     for directory in folders:
         nsamples = MAP_NAMES[directory.name]
-        for model in models:
+        for model in MODELS:
             workdir = directory / model / "Results"
             nresults = count_results(workdir)
             df = extract_subfolder(workdir, nresults, nsamples)
@@ -69,33 +69,23 @@ def extract_all_results(root: Path) -> None:
 
     new_results = process_data(results)
     plot_results_all_models(new_results)
-    for name, df in new_results.items():
-        plot_results_for_model(name, df)
 
 
 def plot_results_all_models(results):
     # Create subplots to accomodate the results
     _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)
 
-    # merge the models results
-    columns = results["Fingerprints"].columns
-    for k, c in enumerate(columns):
-        data = pd.concat([df[c] for df in results.values()], axis=1)
-        sns.lineplot(data=data, ax=axis[k // 3][k % 3])
+    names = results["Fingerprints"].columns
+    for k, prop in enumerate(names):
+        ax = axis[k // 3][k % 3]
+        frames = pd.DataFrame({m: results[m][names[k]] for m in MODELS})
+        obj = sns.lineplot(data=frames, markers=True, ax=ax)
+        obj.set_title(prop)
+    plt.xlabel("Number of samples")
+    plt.ylabel("rvalue")
     plt.savefig("scaling.png")
 
 
-def plot_results_for_model(name: str, df: pd.DataFrame):
-    # Create subplots to accomodate the results
-    _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)
-
-    # merge the models results
-    for k, c in enumerate(df.columns):
-        sns.lineplot(data=df[c], ax=axis[k // 3][k % 3])
-    plt.suptitle(f"Scaling for {name}", fontsize=14)        
-    plt.savefig(f"{name}_scaling.png")
-
-
 def process_data(results):
     new = {}
     for k, df in results.items():

diff --git a/scripts/plot_means.py b/scripts/plot_means.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from pandas.core.algorithms import mode
+import seaborn as sns
+import json
+from pathlib import Path
+
+MODELS = ("Fingerprints", "MPNN", "SE3Transformer")
+NSAMPLES = ("11k", "1k", "2k", "500", "5k", "7k")
+MAP_NAMES = {"11k": 11043, "1k": 1000, "2k": 2000, "500": 500, "5k": 5000, "7k": 7500}
+PATH_GROUND_TRUE = Path("data/Carboxylic_acids/CDFT/cdft_random_500.csv")
+MSE_FILE = "MSE.json"
+
+
+def read_data():
+    ground_true = pd.read_csv(PATH_GROUND_TRUE, index_col=0)
+    ground_true.drop("smiles", axis=1, inplace=True)
+    results = {}
+    for m in MODELS:
+        results[m] = {}
+        for n in NSAMPLES:
+            df = pd.read_json(f"means_{m}_{n}.json")
+            new = (df - ground_true) ** 2
+            mse = new.sum() / len(df)
+            results[m][n] = mse.to_dict()
+
+    with open(MSE_FILE, 'w') as f:
+        json.dump(results, f, indent=4)
+
+
+def plot_data(model: str):
+    with open(MSE_FILE, 'r') as f:
+        data = json.load(f)
+
+    data = [pd.DataFrame(transpose_data(data[m])) for m in MODELS]
+    for df in data:
+        df.sort_index(inplace=True)
+
+    names = data[0].columns
+    _, axis = plt.subplots(nrows=5, ncols=3, figsize=(20, 20), constrained_layout=True)
+
+    for k, prop in enumerate(names):
+        ax = axis[k // 3][k % 3]
+        frames = pd.DataFrame({m: df[names[k]] for m, df in zip(MODELS, data)})
+        obj = sns.lineplot(data=frames, markers=True, ax=ax)
+        obj.set_title(prop)
+
+    plt.savefig("MSE.png")
+
+
+def transpose_data(data):
+    names = data[NSAMPLES[0]].keys()
+    results = {n: {} for n in names}
+    for n, xs in data.items():
+        number = MAP_NAMES[n]
+        for name, val in xs.items():
+            results[name][number] = val
+
+    return results
+
+
+def main():
+    # read_data()
+    plot_data(MODELS[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/remove_duplicates.py b/scripts/remove_duplicates.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+
+import pandas as pd
+from pathlib import Path
+from typing import List
+from itertools import chain
+
+
+def read_geometries(file_name: Path) -> List[str]:
+    """Read the geometries from ``file_name``."""
+    with open(file_name, 'r') as f:
+        gs = json.load(f)
+
+    return gs
+
+
+def remove_duplicates(folders: List[str]):
+    """Remove all duplicate smiles from the given folders."""
+    paths = [Path(f) for f in folders]
+    smiles = pd.concat([pd.read_csv(next(p.glob("*csv")), index_col=0) for p in paths])
+    smiles.reset_index(drop=True, inplace=True)
+    gs = list(chain(*[read_geometries(next(p.glob("*json"))) for p in paths]))
+    geometries = {k: v for k, v in enumerate(gs)}
+
+    # remove duplicate
+    new_smiles = smiles.drop_duplicates(subset=["smiles"])
+    indices = new_smiles.index.to_list()
+    new_smiles.reset_index(drop=True, inplace=True)
+    new_geometries = [geometries[i] for i in indices]
+
+    new_smiles.to_csv("all_carboxylics.csv")
+    with open("all_geometries_carboxylics.json", 'w') as f:
+        json.dump(new_geometries, f)
+
+    print(new_geometries[-1])
+    print(new_smiles.iloc[-1])
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # configure logger
+    parser.add_argument("-f", "--folders", help="folders to look for the data", nargs="+")
+    args = parser.parse_args()
+    remove_duplicates(args.folders)
+
+
+if __name__ == "__main__":
+    main()