# Goal 
- Establish baseline model

In [31]:
# Setup
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import utils

import dtreeviz

from matplotlib import pyplot as plt
from IPython import display

In [2]:
# load dataset
df = pd.read_csv("../data/complete-dataset-20231215.csv").drop(
    columns=["Unnamed: 0"], errors="ignore"
)

In [3]:
df.head()

Unnamed: 0,ID,TYPE,TABLE,NAME,FILEFORMAT,STUDY,ASSAY,DATATYPE,DATASUBTYPE,RESOURCETYPE
0,syn2426151,file,SAGE.PORTAL_RAW.AD,chr1.chop.dosage.gz,txt,ROSMAP,SNParray,genomicVariants,processed,analysis
1,syn2426152,file,SAGE.PORTAL_RAW.AD,chr2.chop.dosage.gz,txt,ROSMAP,SNParray,genomicVariants,processed,analysis
2,syn2426153,file,SAGE.PORTAL_RAW.AD,chr4.chop.dosage.gz,txt,ROSMAP,SNParray,genomicVariants,processed,analysis
3,syn2426154,file,SAGE.PORTAL_RAW.AD,chr8.chop.dosage.gz,txt,ROSMAP,SNParray,genomicVariants,processed,analysis
4,syn2426155,file,SAGE.PORTAL_RAW.AD,chr9.chop.dosage.gz,txt,ROSMAP,SNParray,genomicVariants,processed,analysis


In [4]:
# Look at relationship between dataType and datasubtype

In [5]:
df[['RESOURCETYPE', 'FILEFORMAT']].drop_duplicates().sort_values(by = 'RESOURCETYPE')

Unnamed: 0,RESOURCETYPE,FILEFORMAT
0,analysis,txt
73575,analysis,idx
58850,analysis,rdata
44063,analysis,raw
146262,analysis,gzip
35094,analysis,vcf
34954,analysis,ppt
34820,analysis,excel
34798,analysis,zip
34794,analysis,pdf


In [6]:
# Check the version of TensorFlow Decision Forests
print("Found TensorFlow Decision Forests v" + tfdf.__version__)

Found TensorFlow Decision Forests v1.8.1


In [7]:
def labeler(dataset, class_label):
    # Encode the categorical labels as integers.
    #
    # Details:
    # This stage is necessary if your classification label is represented as a
    # string since Keras expects integer classification labels.
    # When using `pd_dataframe_to_tf_dataset` (see below), this step can be skipped.

    # Name of the label column.
    classes = dataset[class_label].unique().tolist()
    print(f"Label classes: {classes}")

    dataset["label"] = dataset[class_label].map(classes.index)

    return dataset

In [47]:
class_label = "RESOURCETYPE"
df = labeler(df, class_label)

Label classes: ['analysis', 'experimentalData', 'metadata']


In [48]:
classes = df[class_label].unique().tolist()

In [9]:
train, val, test = utils.create_datasets(df, 0.6, 0.2)

In [13]:
train.shape

(89283, 11)

In [16]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train, label=label_col)
val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(val, label=label_col)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test, label=label_col)

In [43]:
# Specify the model.
model_1 = tfdf.keras.RandomForestModel(verbose=2)

# Train the model.
model_1.fit(train_ds)

Use 8 thread(s) for training
Use /var/folders/p0/5m4pdsm55jn_d5nzbjv6bjf40000gq/T/tmp6r5pw5qn as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'ID': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'TYPE': <tf.Tensor 'data_1:0' shape=(None,) dtype=string>, 'TABLE': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'NAME': <tf.Tensor 'data_3:0' shape=(None,) dtype=string>, 'FILEFORMAT': <tf.Tensor 'data_4:0' shape=(None,) dtype=string>, 'STUDY': <tf.Tensor 'data_5:0' shape=(None,) dtype=string>, 'ASSAY': <tf.Tensor 'data_6:0' shape=(None,) dtype=string>, 'DATATYPE': <tf.Tensor 'data_7:0' shape=(None,) dtype=string>, 'DATASUBTYPE': <tf.Tensor 'data_8:0' shape=(None,) dtype=string>, 'label': <tf.Tensor 'data_9:0' shape=(None,) dtype=int64>}
Label: Tensor("data_10:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'ID': SemanticTensor(semantic=<Semantic.CATEGORICAL: 2>, tensor=<tf.Tensor 'data:0' shape=(None,) dtype=

[INFO 23-12-15 15:59:13.3307 PST kernel.cc:771] Start Yggdrasil model training
[INFO 23-12-15 15:59:13.3308 PST kernel.cc:772] Collect training examples
[INFO 23-12-15 15:59:13.3309 PST kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-12-15 15:59:13.3310 PST kernel.cc:391] Number of batches: 90
[INFO 23-12-15 15:59:13.3310 PST kernel.cc:392] Number of examples: 89283
[INFO 23-12-15 15:59:13.3651 PST data_spec_inference.cc:305] 26 item(s) have been pruned (i.e. they are considered out of dictionary) for the column ASSAY (27 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO 23-12-15 15:59:13.3652 PST data_spec_infe

Model trained in 0:00:00.987342
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x2931f5b10>

In [44]:
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(test_ds, return_dict=True)
print()

for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")


loss: 0.0000
accuracy: 1.0000


In [65]:
# Tell dtreeviz about training data and model
features = [f.name for f in model_1.make_inspector().features()]
viz_model_1 = dtreeviz.model(
    model_1,
    tree_index=3,
    X_train=train[features],
    y_train=train[label_col],
    feature_names=features,
    target_name=label_col,
    class_names=classes,
)

In [None]:
# model_1.save("/tmp/my_saved_model")

In [74]:
from bs4 import BeautifulSoup

In [77]:
with open('./tree.html', 'w', encoding = 'utf-8') as f: 
    f.write(
        str(
            BeautifulSoup(
                tfdf.model_plotter.plot_model(model_1, tree_idx=0, max_depth=3),
                "html.parser").prettify()))