# Language Model: flat data, without context

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mostly-ai/mostlyai-engine/blob/main/examples/language.ipynb)

In [None]:
import logging
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from mostlyai.engine import analyze, encode, generate, split, train

# Send INFO-level (and above) log records to stdout so that log output is
# visible directly in the notebook cell output.
# `force=True` removes any handlers already attached to the root logger before
# applying this configuration; without it `basicConfig` silently does nothing
# when the hosting environment has already configured logging.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format="[%(asctime)s] %(levelname)-7s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)

In [None]:
dataset_name = "sacred"
# Encoding type per column: `book` is used as context, `text` is the target.
ctx_encoding_types = {"book": "TABULAR_CATEGORICAL"}
tgt_encoding_types = {"text": "LANGUAGE_TEXT"}
fn = "https://github.com/mostly-ai/public-demo-data/raw/dev/sacred_verses/sacred.csv.gz"
# Keep only the columns that have an encoding type assigned (context first, then target).
df = pd.read_csv(fn)[[*ctx_encoding_types, *tgt_encoding_types]]
# Use item assignment to overwrite the column; attribute-style assignment
# (`df.text = ...`) only works by accident of the column name being a valid,
# non-clashing attribute.
df["text"] = df["text"].str[:30]  # trim to 30 chars max to speed up demo
print(df.shape)
df.iloc[0]

In [None]:
# Workspace directory that every engine step below is pointed at.
ws = Path("language-ws")

# Add a running integer key so that context and target rows can be linked 1:1.
pk = "pk"
df[pk] = range(len(df))
# Key column first, followed by the columns that have an encoding type.
# Unpacking an empty mapping contributes no columns, so an empty
# `ctx_encoding_types` naturally yields `[pk]` without special-casing.
ctx_columns = [pk, *ctx_encoding_types]
tgt_columns = [pk, *tgt_encoding_types]
ctx_df = df[ctx_columns]
tgt_df = df[tgt_columns]

# Hand both tables to the engine: the target references the context via `pk`.
split(
    tgt_data=tgt_df,
    tgt_context_key=pk,
    tgt_encoding_types=tgt_encoding_types,
    ctx_data=ctx_df,
    ctx_primary_key=pk,
    ctx_encoding_types=ctx_encoding_types,
    workspace_dir=ws,
)
analyze(workspace_dir=ws)
encode(workspace_dir=ws)

In [None]:
# Peek at the encoded training data stored inside the workspace.
encoded_dir = ws / "OriginalData" / "encoded-data"
encoded_data = pd.read_parquet(encoded_dir)
encoded_data.head()

In [None]:
# Alternative (larger) models that can be swapped in for the LSTM used below:
# train(workspace_dir=ws, max_training_time=2, model="Locutusque/TinyMistral-248M")
# train(workspace_dir=ws, max_training_time=2, model="EleutherAI/pythia-160m")
# train(workspace_dir=ws, max_training_time=2, model="EleutherAI/pythia-410m")
train(
    model="MOSTLY_AI/LSTMFromScratch-3m",
    max_training_time=2,  # keep the training budget small for the demo
    workspace_dir=ws,
)

In [None]:
# Draw 100 synthetic samples using the model stored in the workspace.
generate(
    workspace_dir=ws,
    sample_size=100,
)

In [None]:
from pathlib import Path
import pandas as pd
from mostlyai import engine

# workspace directory that is passed to every engine step below
ws = Path("ws-language-flat")

# fetch the original data and keep only the two columns used for training
url = "https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv"
trn_df = pd.read_parquet(f"{url}/synthetic-data-papers.parquet")
trn_df = trn_df[["category", "title"]]

# run the engine pipeline: SPLIT -> ANALYZE -> ENCODE -> TRAIN -> GENERATE

# SPLIT: write data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`
engine.split(tgt_data=trn_df, model_type="LANGUAGE", workspace_dir=ws)

# ANALYZE: generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`
engine.analyze(workspace_dir=ws)

# ENCODE: encode training data to `{ws}/OriginalData/encoded-data`
engine.encode(workspace_dir=ws)

# TRAIN: train model and store to `{ws}/ModelStore/model-data`
engine.train(
    workspace_dir=ws,
    max_training_time=10,  # limit TRAIN to 10 minutes for demo purposes
    # a light-weight LSTM model, trained from scratch (GPU recommended)
    model="MOSTLY_AI/LSTMFromScratch-3m",
    # model="microsoft/phi-1.5",  # or alternatively a HF-hosted LLM model (GPU required)
)

# GENERATE: use model to generate synthetic samples to `{ws}/SyntheticData`
engine.generate(sample_size=100, workspace_dir=ws)

In [None]:
# read the generated synthetic data back from the workspace and preview it
synthetic_dir = ws / "SyntheticData"
syn_tgt_df = pd.read_parquet(synthetic_dir)
syn_tgt_df.head(n=5)