# MOSTLY AI

## Installation

Use `pip` (or preferably `uv pip`) to install the official `mostlyai` package from PyPI. Python 3.10 or higher is required.  

We use `mostlyai[local]` to install the LOCAL mode, which trains and generates synthetic data locally on our own compute resources.  

After installing, you might need to restart your kernel.

In [2]:
# %pip install 'mostlyai[local]'

## Load Original Data

Fetch some original data that will be used for training the generator.

In [1]:
import pandas as pd

In [2]:
# Load the downsampled positions and trips CSV files from the local
# input_data/ folder into DataFrames.
positions_df = pd.read_csv("input_data/denmark_positions_downsampled.csv")
trips_df = pd.read_csv("input_data/denmark_trips_downsampled.csv")

In [3]:
# Reduce the trips table to its identifier column; TRIP_ID is the only trips
# column listed in the generator configuration further down.
trips_df = trips_df.loc[:, ["TRIP_ID"]]
trips_df

Unnamed: 0,TRIP_ID
0,8
1,9
2,10
3,11
4,24
...,...
4399,4996
4400,4997
4401,4998
4402,4999


In [4]:
# Keep the three columns the generator configuration uses: the position key,
# the link back to the trip, and the combined "lat, long" text column.
positions_df = positions_df.loc[:, ["POSITION_ID", "POSITION_TRIP_ID", "LAT_LONG"]]
positions_df

Unnamed: 0,POSITION_ID,POSITION_TRIP_ID,LAT_LONG
0,0,8,"55.508922, 15.458938"
1,3,8,"55.500105, 15.452538"
2,9,8,"55.492128, 15.444578"
3,14,8,"55.482085, 15.43882"
4,23,8,"55.471972, 15.432787"
...,...,...,...
464300,8533070,5000,"54.534398, 13.936615"
464301,8533086,5000,"54.542833, 13.9285"
464302,8533096,5000,"54.550905, 13.92056"
464303,8533103,5000,"54.564587, 13.906743"


## Initialize the SDK



In [5]:
from mostlyai.sdk import MostlyAI

# Initialize the SDK in LOCAL mode (local=True): training and generation run
# on our own compute resources.
mostly = MostlyAI(local=True)

## Train a Generator

Train a synthetic data generator.

In [6]:
# Table definition for the trips: a single TRIP_ID column that doubles as the
# primary key.
trips_table = {
    "name": "trips",
    "data": trips_df,
    "tabular_model_configuration": {
        # "max_training_time": 1,  # optional: limit training time (in minutes)
        "model": "MOSTLY_AI/Large",
    },
    "primary_key": "TRIP_ID",
    "columns": [{"name": "TRIP_ID"}],
}

# Table definition for the positions: keyed by POSITION_ID and linked to the
# trips table through POSITION_TRIP_ID, which is flagged as the context
# relation (is_context=True).
positions_table = {
    "name": "positions",
    "data": positions_df,
    "tabular_model_configuration": {
        # "max_training_time": 1,  # optional: limit training time (in minutes)
        "model": "MOSTLY_AI/Large",
    },
    "columns": [
        {"name": "POSITION_ID"},
        {"name": "POSITION_TRIP_ID"},
        # The combined "lat, long" text column gets the dedicated
        # latitude/longitude encoding.
        {"name": "LAT_LONG", "model_encoding_type": "TABULAR_LAT_LONG"},
    ],
    "primary_key": "POSITION_ID",
    "foreign_keys": [
        {
            "column": "POSITION_TRIP_ID",
            "referenced_table": "trips",
            "is_context": True,
        }
    ],
}

# Train one generator covering both tables.
g = mostly.train(
    config={
        "name": "Vessel Trip Generator",
        "tables": [trips_table, positions_table],
    },
    start=True,
    wait=True,
)


Output()











### Export generator to file

You can export a generator to a ZIP archive with the `export_to_file()` method, so that it can be re-imported later with `import_from_file()`.

In [None]:
# g.export_to_file('generator/1903-denmark-generator.zip')

## Generate Synthetic Data

Probe the trained generator for 1,000 representative synthetic trips, together with their linked positions.

In [7]:
# Probe the generator with size=1000. As the output below shows, the result is
# a dict of DataFrames keyed by table name ('trips' and 'positions'), with
# 1000 rows in the trips table.
df_samples = mostly.probe(g, size=1000)
df_samples

{'trips':                                   TRIP_ID
 0    mostly5d-6d29-4902-ba95-9c9895378da9
 1    mostly89-cbef-4472-801a-22c5368e70a4
 2    mostlye9-270a-4acc-a69f-6cdad9136f7a
 3    mostly7c-84e9-4d56-9269-1ccb0fcaeb65
 4    mostlyff-2249-463b-b280-aba831543d58
 ..                                    ...
 995  mostly8d-1cbd-45e3-aa65-53b0436932c5
 996  mostly62-0f88-48c4-b2ec-de3652d22720
 997  mostlye2-1e31-4556-a0a7-a692adcf7c0d
 998  mostlye6-e67c-49b1-8fcd-b3598e4c0f74
 999  mostlyc7-ed2a-4fd2-81c2-1d9bfe972889
 
 [1000 rows x 1 columns],
 'positions':                                  POSITION_ID  \
 0       mostlyde-9d6b-4eb9-bbd0-544d54eb8c9a   
 1       mostly91-57da-4718-9535-12ded7968acd   
 2       mostlyb6-3a89-44e9-a68b-efcfcef2fdc7   
 3       mostly42-12f8-4281-83d2-f66eda5d6766   
 4       mostly7f-816c-4b7e-b934-daac03c3909d   
 ...                                      ...   
 105180  mostly56-b81c-4425-9a6d-0dac50b15267   
 105181  mostly5a-6db5-4edd-816a-5483f8ae5

In [8]:
# Pick the synthetic positions table out of the probe result.
syn_positions = df_samples["positions"]
syn_positions

Unnamed: 0,POSITION_ID,POSITION_TRIP_ID,LAT_LONG
0,mostlyde-9d6b-4eb9-bbd0-544d54eb8c9a,mostlyd8-7b96-469b-9321-2d1d5fbb2ba8,"55.35888, 13.15236"
1,mostly91-57da-4718-9535-12ded7968acd,mostlydc-cf7b-4d3c-bea5-a88623c92c67,"55.37030, 13.14911"
2,mostlyb6-3a89-44e9-a68b-efcfcef2fdc7,mostlye5-ebe3-4194-9b7a-9fc68b101a2a,"55.67092, 15.98927"
3,mostly42-12f8-4281-83d2-f66eda5d6766,mostlyf1-664e-4805-a078-33a00565c02b,"54.82278, 13.57664"
4,mostly7f-816c-4b7e-b934-daac03c3909d,mostlyb5-d233-4186-8542-aacfa801800e,"54.87729, 13.52614"
...,...,...,...
105180,mostly56-b81c-4425-9a6d-0dac50b15267,mostly9b-96cb-4cc3-a4fe-6a9bff241827,"54.85081, 13.01383"
105181,mostly5a-6db5-4edd-816a-5483f8ae58be,mostlybf-504e-48bc-977e-6e0ff757dda6,"54.51365, 14.17561"
105182,mostlye3-2fb5-43ce-9236-e2425581ccf0,mostlyee-661c-43db-95d3-a08bf36c8cb4,"54.78991, 13.10261"
105183,mostly5f-94ef-42a3-8d95-7ccacb7ba4d7,mostly18-1933-485d-8ec3-3f7be1a9ae32,"54.85968, 13.01165"


In [9]:
from pathlib import Path

# Work on an explicit copy so that columns added to `ps` afterwards can never
# leak back into the probe result held in `df_samples`.
ps = pd.DataFrame(syn_positions, copy=True)

# to_csv does not create missing folders, so make sure the target exists.
Path("synthetic_data").mkdir(parents=True, exist_ok=True)

# NOTE: with the pandas default (index=True) the row index is written as an
# unnamed first column of the CSV.
ps.to_csv("synthetic_data/denmark_syn_positions.csv")

In [None]:
# Break the combined "lat, long" text into its two parts and cast both to
# float in a single step, storing them as separate numeric columns.
ps[["LATITUDE", "LONGITUDE"]] = (
    ps["LAT_LONG"].str.split(", ", expand=True).astype(float)
)

# Rewrite the CSV so that it also contains the new numeric columns.
ps.to_csv("synthetic_data/denmark_syn_positions.csv")
ps

# Save and Load existing generator

In [1]:
from mostlyai.sdk import MostlyAI

# Initialize the SDK in LOCAL mode (local=True) before importing a generator.
mostly = MostlyAI(local=True)

## Import a generator from file

You can import a generator with the `import_from_file()` method.

In [None]:
# g = mostly.generators.import_from_file('generator/1903-denmark-generator.zip')