# Language Model: flat data, without context

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mostly-ai/mostlyai-engine/blob/main/examples/language.ipynb)

In [11]:
from pathlib import Path
import pandas as pd
from mostlyai import engine

# Train a language model on flat data (no context table) and sample from it.
# This cell defines `ws` (workspace directory) and `trn_df` (original training
# data); both are reused by the inspection cells further down.

# tunables for the demo run
MAX_TRAINING_MINUTES = 1  # kept very short for demo purposes; raise for usable text quality
SAMPLE_SIZE = 10_000      # number of synthetic rows to generate

# init workspace and logging
ws = Path("ws-language-categorical-flat")
engine.init_logging()

# load original data, keeping only the two columns that are modelled
url = "https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv"
trn_df = pd.read_parquet(f"{url}/synthetic-data-papers.parquet")[["category", "title"]]

# execute the engine steps
engine.split(                         # split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`
    workspace_dir=ws,
    tgt_data=trn_df,
    # no explicit `model_type` is passed; the SPLIT log reports model_type='LANGUAGE' for these encoding types
    tgt_encoding_types={"category": "LANGUAGE_CATEGORICAL", "title": "LANGUAGE_TEXT"},
)
engine.analyze(workspace_dir=ws)      # generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`
engine.encode(workspace_dir=ws)       # encode training data to `{ws}/OriginalData/encoded-data`
engine.train(                         # train model and store to `{ws}/ModelStore/model-data`
    workspace_dir=ws,
    model="MOSTLY_AI/LSTMFromScratch-3m",    # use a light-weight LSTM model, trained from scratch (GPU recommended)
    # model="microsoft/phi-1.5",             # or alternatively use a HF-hosted LLM model (GPU required)
    max_training_time=MAX_TRAINING_MINUTES,  # limit TRAIN to 1 minute for demo purposes
)
engine.generate(                      # use model to generate synthetic samples to `{ws}/SyntheticData`
    workspace_dir=ws,
    sample_size=SAMPLE_SIZE,
)

[2025-02-05 14:52:45,343] INFO   : SPLIT started
[2025-02-05 14:52:45,344] INFO   : clean `ws-language-categorical-flat/OriginalData/tgt-data`
[2025-02-05 14:52:45,345] INFO   : clean `ws-language-categorical-flat/OriginalData/tgt-meta`
[2025-02-05 14:52:45,346] INFO   : model_type='LANGUAGE'
[2025-02-05 14:52:45,346] INFO   : tgt_encoding_types={'category': 'LANGUAGE_CATEGORICAL', 'title': 'LANGUAGE_TEXT'}
[2025-02-05 14:52:45,360] INFO   : SPLIT finished in 0.02s
[2025-02-05 14:52:45,361] INFO   : ANALYZE started
[2025-02-05 14:52:45,363] INFO   : clean `ws-language-categorical-flat/ModelStore/tgt-stats`
[2025-02-05 14:52:45,364] INFO   : analyzing 2 partitions in parallel
[2025-02-05 14:52:45,413] INFO   : analyzed target partition 000000-trn (20768, 2)
[2025-02-05 14:52:45,422] INFO   : analyzed target partition 000000-val (2308, 2)
[2025-02-05 14:52:45,422] INFO   : combine partition statistics
[2025-02-05 14:52:45,423] INFO   : analyzed column `category`: LANGUAGE_CATEGORICAL 
[2



[2025-02-05 14:56:20,197] INFO   : num_samples_max_length_limit=0
[2025-02-05 14:56:20,241] INFO   : percentage of invalid values: {'category': '0.00%', 'title': '0.00%'}
[2025-02-05 14:56:20,241] INFO   : decoded (10000, 2) from 79 batches in 1.30s
[2025-02-05 14:56:20,244] INFO   : persisted (10000, 2) to `part.000000.000000.parquet` in 0.00s
[2025-02-05 14:56:20,246] INFO   : total_tokenize_fn_time=0.21s
[2025-02-05 14:56:20,246] INFO   : total_logits_processor_build_time=1.37s
[2025-02-05 14:56:20,246] INFO   : total_generate_fn_time=139.43s
[2025-02-05 14:56:20,246] INFO   : GENERATE_LANGUAGE finished in 142.60s


In [12]:
# read the generated samples back from the workspace's `SyntheticData` folder
syn_tgt_df = pd.read_parquet(ws / "SyntheticData")
# categories that occur in the training data but never in the synthetic data
set(trn_df["category"]).difference(set(syn_tgt_df["category"]))

{'chao-dyn',
 'cmp-lg',
 'comp-gas',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.supr-con',
 'cs.CC',
 'cs.DL',
 'cs.FL',
 'cs.OS',
 'cs.PL',
 'cs.SC',
 'econ.TH',
 'math.CA',
 'math.CT',
 'math.DG',
 'math.FA',
 'math.GM',
 'math.GN',
 'math.GR',
 'math.MG',
 'math.SP',
 'nlin.AO',
 'nucl-ex',
 'nucl-th',
 'q-bio.CB',
 'q-bio.OT',
 'q-bio.SC',
 'q-fin.EC',
 'q-fin.MF',
 'q-fin.PR'}

In [13]:
# categories that occur in the synthetic data but not in the training data
set(syn_tgt_df["category"]).difference(set(trn_df["category"]))

{'_RARE_'}

In [14]:
# first ten generated titles
syn_tgt_df["title"].iloc[:10]

0                                             , 
1                                       category
2                                      : A- for 
3                                             : 
4                                          ,  to
5                                          -..ML
6                                          D the
7    -Oed Learning with-to for-c- and Data ofe
 
8          S from: a Learning ofn- for Synthetic
9                                               
Name: title, dtype: string

In [15]:
# first ten original titles, for comparison with the generated ones
trn_df["title"].iloc[:10]

0          Conception d'un banc d'essais d\'ecisionnel
1         Monotonicity Analysis over Chains and Curves
2    An active curve approach for tomographic recon...
3    Application of the HLSVD technique to the filt...
4              Phase retrieval by iterated projections
5               DIRC for a Higher Luminosity B Factory
6    Analysis of approximate nearest neighbor searc...
7    Efficient Retrieval of Similar Time Sequences ...
8    Mining Generalized Graph Patterns based on Use...
9    ARACNE: An Algorithm for the Reconstruction of...
Name: title, dtype: object

In [16]:
# show the synthetic data frame (`category` and `title` columns) as the cell output
syn_tgt_df

Unnamed: 0,category,title
0,cs.CV,","
1,cs.CY,category
2,stat.ML,: A- for
3,cs.CV,:
4,cs.LG,", to"
...,...,...
9995,cs.LG,category
9996,stat.ME,:
9997,cs.LG,D
9998,cs.CL,:
