# 1. Import Rxn-INSIGHT

In [None]:
import time

import numpy as np
import pandas as pd
from IPython.display import SVG, display
from rxnmapper import RXNMapper
from tqdm import tqdm

from rxn_insight.reaction import Molecule, Reaction
from rxn_insight.utils import draw_chemical_reaction, curate_smirks, get_similarity, get_fp

# 2. Load sample dataset

In [None]:
# Load the example reaction table. The file is read as parquet even though its
# name ends in ".gzip". Later cells read the REACTION, SOLVENT, REAGENT,
# CATALYST and REF columns from this frame.
df_uspto = pd.read_parquet("../data/example.gzip")

# 3. Evaluate single reaction

In [None]:
# Build the RXNMapper instance once; it is handed to Reaction(...) below via
# the rxn_mapper keyword instead of being re-created per reaction.
rxn_mapper = RXNMapper()

In [None]:
# Read the reaction string and its annotated conditions from the first row of
# the example table (looked up by the row's index label, column by column).
r, solvent, reagent, catalyst, ref = (
    df_uspto[column][df_uspto.index[0]]
    for column in ("REACTION", "SOLVENT", "REAGENT", "CATALYST", "REF")
)

In [None]:
# Render the selected reaction: draw_chemical_reaction(r) is wrapped in SVG()
# and shown inline.
display(SVG(draw_chemical_reaction(r)))

In [None]:
# Wrap the reaction string together with its solvent / reagent / catalyst /
# reference annotations, reusing the mapper created earlier.
rxn = Reaction(r, solvent=solvent, reagent=reagent, catalyst=catalyst, ref=ref, rxn_mapper=rxn_mapper)

Get all information at once by running `get_reaction_info()`

In [None]:
# As the last expression of the cell, the returned value is displayed by the notebook.
rxn.get_reaction_info()

In [None]:
rxn = '[CH3:1][O:2][C:3](=[O:4])[CH:5]=[CH2:6].I[c:7]1[cH:8][cH:9][cH:10][cH:11][cH:12]1>>[CH3:1][O:2][C:3](=[O:4])/[CH:5]=[CH:6]/[c:7]1[cH:8][cH:9][cH:10][cH:11][cH:12]1'

In [None]:
rxn2 = Reaction(rxn, keep_mapping=True)
rxn2.get_reaction_info()

# 4. Evaluate multiple reactions

In [None]:
import time

In [None]:
# Shared resources for the batch run below: a mapper instance plus the
# functional-group and SMIRKS tables (both stored as line-delimited JSON
# records). The SMIRKS table is passed through curate_smirks before use.
rxn_mapper = RXNMapper()
fg = pd.read_json(
    "../src/rxn_insight/data/functional_groups.json", orient='records', lines=True
)
smirks = curate_smirks(
    pd.read_json("../src/rxn_insight/data/smirks.json", orient='records', lines=True)
)

In [None]:
# Draw up to 10 distinct row positions from (at most) the first 100 rows.
# Both bounds are clamped to the table length so a smaller dataset neither
# yields out-of-range positions nor makes np.random.choice fail when asked
# for more unique values than exist.
ids = np.random.choice(min(100, len(df_uspto)), size=min(10, len(df_uspto)), replace=False)

In [None]:
# Analyze each sampled row and report its class, name and wall-clock time.
for i in ids:
    start_t = time.time()
    # Row fields are looked up by the index label at position i.
    r, solvent, reagent, catalyst, ref = (
        df_uspto[column][df_uspto.index[i]]
        for column in ("REACTION", "SOLVENT", "REAGENT", "CATALYST", "REF")
    )
    rxn = Reaction(
        r,
        solvent=solvent,
        reagent=reagent,
        catalyst=catalyst,
        ref=ref,
        rxn_mapper=rxn_mapper,
        smirks=smirks,
        fg=fg,
    )
    ri = rxn.get_reaction_info()
    end_t = time.time()
    dt = end_t - start_t
    print(
        f"{r}\n"
        f"Class: {rxn.reaction_class}\n"
        f"Name: {rxn.name}\n"
        f"Time: {dt:.2f} seconds\n"
        "====\n"
    )

# 5. Find similar reactions

In [None]:
# Table of reactions that is searched for neighbors and conditions below.
# Alternative input kept for reference (not executed):
# df_analyzed = pd.read_parquet("data/uspto.gzip")
df_analyzed = pd.read_parquet("../data/1000rxns.gzip")

In [None]:
# List the column labels available in the analyzed table.
df_analyzed.keys()

In [None]:
# Take a random sample of up to 1000 rows. DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement), so the size is
# clamped to the table length.
df_uspto = df_analyzed.sample(min(1000, len(df_analyzed)))

In [None]:
# Query reaction used for the neighbor search and condition suggestion below.
# Alternative query kept for reference; it used to be assigned to `r` and then
# immediately overwritten, so it never took effect:
# r = "N#Cc1ccc(C=O)cc1.Nc1ccccc1S>>N#Cc1ccc(-c2nc3ccccc3s2)cc1"
r = "BrCCBr.COC(=O)c1cccc(O)c1>>COC(=O)c1cccc(OCCBr)c1"

In [None]:
# Draw the query reaction inline.
display(SVG(draw_chemical_reaction(r)))

In [None]:
# Only the reaction string is given here: no conditions, mapper, SMIRKS or
# functional-group tables are passed.
rxn = Reaction(r)

In [None]:
# Show the analyzed table that will be searched.
df_analyzed

In [None]:
# Analyze the query reaction; the returned value is displayed by the notebook.
rxn.get_reaction_info()

In [None]:
# Time the neighbor search of the query reaction against df_analyzed.
# The search is configured with MACCS fingerprints and threshold=0.0; see the
# find_neighbors documentation for the meaning of concatenate / broaden /
# full_search.
st = time.time()
df_nbs = rxn.find_neighbors(df_analyzed, fp="MACCS", concatenate=True, threshold=0.0, broaden=True, full_search=False)
print(f"{time.time() - st:.2f} seconds")

In [None]:
get_similarity(get_fp(r, "MACCS", True), get_fp(df_nbs["REACTION"][1043625], "MACCS", True))

In [None]:
get_similarity(get_fp(r, "MACCS", False), get_fp(df_nbs["REACTION"][1043625], "MACCS", False))

In [None]:
get_similarity(get_fp(r, "Morgan", True), get_fp(df_nbs["REACTION"][1043625], "Morgan", True))

In [None]:
get_similarity(get_fp(r, "Morgan", False), get_fp(df_nbs["REACTION"][1043625], "Morgan", False))

In [None]:
display(SVG(draw_chemical_reaction(df_nbs["REACTION"][1043625])))

In [None]:
# Draw the neighbor at position 2 of the result table (selected by position,
# then looked up by its label). Requires at least three rows in df_nbs.
display(SVG(draw_chemical_reaction(df_nbs["REACTION"][df_nbs.index[2]])))

# 6. Suggest conditions

In [None]:
# Ask for condition suggestions for the query reaction, using df_analyzed as
# the reference table. The suggested_* attributes are read in the cells below.
rxn.suggest_conditions(df_analyzed)

## 6.1 Show ranking of reagents

In [None]:
# Solvent suggestions stored on the reaction object.
rxn.suggested_solvent

In [None]:
# Catalyst suggestions stored on the reaction object.
rxn.suggested_catalyst

In [None]:
# Reagent suggestions stored on the reaction object.
rxn.suggested_reagent

# 7. Extract all scaffolds

In [None]:
# Reload the example table (df_uspto was replaced by a random sample above).
# The path matches the one used when the dataset was first loaded, so the
# cell works from the same working directory as the rest of the notebook.
df_uspto = pd.read_parquet("../data/example.gzip")
df_uspto.head()

## 7.1 Reading in the reaction (slow)

In [None]:
# Collect one scaffold per reaction by building a full Reaction object for
# every row. Requires `tqdm` to be imported (progress bar over the rows).
start_t = time.time()
rxn_mapper = RXNMapper()
scaffolds = []
for i in tqdm(df_uspto.index):
    try:
        r = df_uspto["REACTION"][i]
        rxn = Reaction(r, rxn_mapper=rxn_mapper)
        scaffold = rxn.get_scaffold()
        scaffolds.append(scaffold)
    except Exception as e:
        # Best effort: report the failing row and move on. KeyboardInterrupt
        # is not an Exception subclass, so interrupting the cell still works
        # without a dedicated re-raise clause.
        print(f"Skipping reaction {i}: {e}")
end_t = time.time()
dt = end_t - start_t
print(f"There are {len(set(scaffolds))} unique scaffolds on a total of {len(scaffolds)} scaffolds, which we found in {dt:.1f} seconds.")

## 7.2 Reading in the molecule (fast)

In [None]:
# Collect one scaffold per reaction from the product side only: the text after
# ">>" is wrapped in a Molecule and its `scaffold` attribute is read.
# Requires `tqdm` and `Molecule` to be imported.
start_t = time.time()
scaffolds = []
for i in tqdm(df_uspto.index):
    try:
        r = df_uspto["REACTION"][i]
        # A reaction string without ">>" raises IndexError here and is
        # reported by the handler below.
        smi = r.split(">>")[1]
        m = Molecule(smi)
        scaffold = m.scaffold
        scaffolds.append(scaffold)
    except Exception as e:
        # Best effort: report the failing row and move on. KeyboardInterrupt
        # is not an Exception subclass, so interrupting the cell still works
        # without a dedicated re-raise clause.
        print(f"Skipping reaction {i}: {e}")
end_t = time.time()
dt = end_t - start_t
print(f"There are {len(set(scaffolds))} unique scaffolds on a total of {len(scaffolds)} scaffolds, which we found in {dt:.1f} seconds.")

# 8. Search reactions by molecule

In [None]:
# SMILES string of the query molecule.
loperamide = "ClC1=CC=C(C2(CCN(CC2)CCC(C3=CC=CC=C3)(C(N(C)C)=O)C4=CC=CC=C4)O)C=C1"

In [None]:
# Wrap the SMILES in a Molecule object (Molecule must be imported).
m = Molecule(loperamide)

In [None]:
# Show the SMILES held by the Molecule object.
m.smiles

In [None]:
# Search for the exact compound
m.search_reactions(df_analyzed)

In [None]:
# Search for the exact scaffold
m.search_reactions_by_scaffold(df_analyzed)