# Recreating the best random forest (RF) model developed for the ARN categories

In [1]:
from pathlib import Path

In [2]:
# Project root as a POSIX string.
# When the notebook is run from a directory named exactly "notebooks", the root
# is that directory's parent; otherwise the current working directory is used.
# Only the final path component is inspected, so a "notebooks" substring that
# appears elsewhere in the path (e.g. ".../my_notebooks_v2/...") is never altered.
notebook_cwd = Path.cwd()
TOP = (notebook_cwd.parent if notebook_cwd.name == 'notebooks' else notebook_cwd).as_posix()

In [3]:
# Directories derived from the project root (TOP):
#   raw_dir   -> <TOP>/data/raw
#   model_dir -> <TOP>/arn_cats/data  (the fitted model is pickled here at the end of the notebook)
raw_dir = Path(TOP).joinpath('data', 'raw')
model_dir = Path(TOP).joinpath('arn_cats', 'data')

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import textwrap

In [5]:
import sys
import os

# Absolute path of the directory one level above the current working directory.
# NOTE(review): this assumes the notebook is launched from a sub-directory of the
# project (e.g. "notebooks/") — confirm before running from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Put the project root first on sys.path so the `arn_cats` imports below resolve.
# Any existing occurrence is removed first, so re-running this cell does not pile
# up duplicate entries while the root still ends up at index 0.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)


In [6]:
# Logger for this notebook, obtained from the project's logging helper module.
from arn_cats.utl import logger
log = logger.get_logger(__name__)

In [7]:
from arn_cats.chm.cheminfo_toolkit import Molecule,Fingerprint_engine

  .applymap('{:,.3f}'.format)


In [8]:
from arn_cats.model.build_model import group_predictor_rf, group_predictor_kn, build_random_forest_classifier, select_groups

In [9]:
from rdkit import Chem
import pickle
import numpy as np
import pandas as pd
from glob import glob
import textwrap

In [10]:
from arn_cats.chm import cheminfo_toolkit

Load the set of molecules from the ARN groups themselves

In [11]:
from arn_cats.data.data_load import arn_groupings, molecules

Resolved path: /home/grace/Documents/python/arn_cats/arn_cats/data/molecules_all.pickle


In [12]:
from arn_cats.model.build_model import select_groups, split_molecules_train_test
from arn_cats.model.build_model import build_random_forest_classifier, group_predictor_rf


In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [19]:
# Regroup the loaded molecules before modelling.
# NOTE(review): judging by the argument names, groups with fewer than 10 members
# are presumably pooled under "miscellaneous chemistry" rather than dropped —
# confirm against select_groups.
molecules_regrouped = select_groups(
    molecules,
    minimum_group_size=10,
    small_groups_as_negative=True,
    pulled_small_group_name="miscellaneous chemistry",
)



In [20]:
# Sanity check: size of the regrouped molecule collection.
len(molecules_regrouped)

1540

In [21]:
#split_molecules_train_test(molecules_regrouped, random_state = 0, stratify = True, test_size =0.2)

In [22]:
# Stratified train/test split with a fixed seed; 20% of the molecules are held out.
# This shuffles the data points by default.
molecules_train, molecules_test = split_molecules_train_test(
    molecules_regrouped,
    test_size=0.2,
    stratify=True,
    random_state=0,
)


the training set has 1232 molecules, the test set has 308 molecules


In [23]:
# Morgan fingerprint engine (radius 2, 2560 bits) handed to the model builder.
fingerprint_engine = Fingerprint_engine.Morgan(radius=2, nBits=2560)

# Single-point hyper-parameter grid, kept for reference only: it is NOT passed
# to the builder below.
# NOTE(review): presumably the best setting found by an earlier search — confirm.
reference_parameters = {'rf__n_estimators': [150],
                        'rf__max_features': [0.01],
                        'rf__min_samples_split': [3]}

# The builder receives parameters=None. Judging by the recorded output
# ("Fitting 5 folds for each of 270 candidates"), this makes it run its own
# built-in grid. Set `parameters = reference_parameters` to skip the search.
parameters = None

# NOTE(review): the model is fitted on molecules_regrouped (the full set), not on
# molecules_train from the split above — confirm this is intended for the final model.
model_details = build_random_forest_classifier(molecules_regrouped, fingerprint_engine, random_state=0, parameters=parameters)


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
[CV 3/5; 1/270] START rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=50
[CV 3/5; 1/270] END rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=50;, score=0.789 total time=   0.5s
[CV 2/5; 2/270] START rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=100
[CV 2/5; 2/270] END rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=100;, score=0.767 total time=   0.9s
[CV 3/5; 3/270] START rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=150
[CV 3/5; 3/270] END rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=150;, score=0.825 total time=   1.2s
[CV 3/5; 4/270] START rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=200
[CV 3/5; 4/270] END rf__max_features=0.001, rf__min_samples_split=2, rf__n_estimators=200;, score=0.837 total time=   1.5s
[CV 2/5; 5/270] START rf__max_features=0.001, rf__min_samples_split=2, rf__n_e

In [25]:
# Inspect which entries the model builder returned.
model_details.keys()

dict_keys(['best mean cross-validation score', 'best estimator', 'grid search results', 'fingerprint engine'])

In [29]:
# Persist the whole model_details dictionary to <model_dir>/best_model_rf.pickle
# using the highest pickle protocol available to this interpreter.
with (model_dir / 'best_model_rf.pickle').open('wb') as handle:
    pickle.dump(model_details, handle, pickle.HIGHEST_PROTOCOL)