# Evaluation of Multimodal Encodings

In this notebook, we attempt to find the best-performing multimodal encoding.

In [1]:
import sys; sys.path.append('../')

from text_bit.encodings.umbrella import UmbrellaEncoding as UE
from text_bit.encodings.nat_number import NatNumberEncoding as NNE
from text_bit.multimodal_encoding import MultimodalEncoding
from text_bit.dataset import read_datasets


In [2]:
# Load the datasets stored under ../datasets/. Later cells call
# `datasets.values()` and read `.name` on each item, so this is presumably a
# name -> dataset mapping — TODO confirm against read_datasets.
datasets = read_datasets("../datasets/")

# Baseline: a multimodal encoding named "baseline" built from the umbrella
# encoding (UE) alone.
# NOTE(review): UE is passed bare here, while `me_all` below passes a list —
# confirm MultimodalEncoding accepts both forms.
me_umbrella = MultimodalEncoding("baseline", UE)

# Multimodal encoding named "all" built from two encodings: nats (NNE) and
# umbrella (UE).
me_all = MultimodalEncoding("all", [NNE, UE])

# The encodings that the evaluation code iterates over when comparing results.
ENCODINGS = [me_all, me_umbrella]

In [3]:
import plotly.express as px
import pandas as pd

def all_strings():
    """Lazily pair every item of every dataset with its dataset's name.

    Reads the notebook-level ``datasets`` object: each of its ``.values()``
    must expose a ``.name`` attribute and be iterable over its items.

    Returns:
        A generator of ``(dataset_name, item)`` tuples, in the order the
        datasets and their items are iterated.
    """
    collections = datasets.values()
    return (
        (collection.name, text)
        for collection in collections
        for text in collection
    )
def all_strings_shorter_than(length):
    """Lazily yield the ``all_strings()`` pairs whose string is short enough.

    Args:
        length: Exclusive upper bound; a pair is kept only when
            ``len(string) < length``.

    Returns:
        A generator of ``(dataset_name, string)`` tuples, in the same order
        ``all_strings()`` produces them.
    """
    labelled_strings = all_strings()
    return (
        (source, text)
        for source, text in labelled_strings
        if len(text) < length
    )
# One row per (encoding, string) pair: the encoding's name, the dataset name,
# the string's length, and the length of the encoding's `text_to_bit` output.
# NOTE(review): only strings shorter than 250 are measured, although the plot
# title below says "Entire Dataset" — confirm the cutoff is intended.
data_points = (
    (encoding.name, source, len(text), len(encoding.text_to_bit(text)))
    for encoding in ENCODINGS
    for source, text in all_strings_shorter_than(250)
)

# The generator is consumed here; the column names label the four tuple fields.
df = pd.DataFrame(
    data_points,
    columns=['Encoding', 'Dataset Name', 'String Length', 'Bits'],
)

# Scatter of input length against output length, one colour per encoding;
# partial opacity keeps overlapping points distinguishable.
fig = px.scatter(
    df,
    x="String Length",
    y="Bits",
    color="Encoding",
    title="String length vs bits, Entire Dataset",
    opacity=0.4,
)

fig.show()