In [1]:
"""
This is a simple example of how to load an LLM from huggingface to generate text
responses for a simple list of questions and answers dataset.
This example uses the `facebook/opt-350m` model as the base LLM model.
"""

# Import required libraries
import logging
import shutil

import pandas as pd
import yaml

from ludwig.api import LudwigModel

# clean out prior results
shutil.rmtree("./results", ignore_errors=True)

qa_pairs = [
    {"Question": "What is the capital of Uzbekistan?", "Answer": "Tashkent"},
    {"Question": "Who is the founder of Microsoft?", "Answer": "Bill Gates"},
    {"Question": "What is the tallest building in the world?", "Answer": "Burj Khalifa"},
    {"Question": "What is the currency of Brazil?", "Answer": "Real"},
    {"Question": "What is the boiling point of mercury in Celsius?", "Answer": "-38.83"},
    {"Question": "What is the most commonly spoken language in the world?", "Answer": "Mandarin"},
    {"Question": "What is the diameter of the Earth?", "Answer": "12,742 km"},
    {"Question": 'Who wrote the novel "1984"?', "Answer": "George Orwell"},
    {"Question": "What is the name of the largest moon of Neptune?", "Answer": "Triton"},
    {"Question": "What is the speed of light in meters per second?", "Answer": "299,792,458 m/s"},
    {"Question": "What is the smallest country in Africa by land area?", "Answer": "Seychelles"},
    {"Question": "What is the largest organ in the human body?", "Answer": "Skin"},
    {"Question": 'Who directed the film "The Godfather"?', "Answer": "Francis Ford Coppola"},
    {"Question": "What is the name of the smallest planet in our solar system?", "Answer": "Mercury"},
    {"Question": "What is the largest lake in Africa?", "Answer": "Lake Victoria"},
    {"Question": "What is the smallest country in Asia by land area?", "Answer": "Maldives"},
    {"Question": "Who is the current president of Russia?", "Answer": "Vladimir Putin"},
    {"Question": "What is the chemical symbol for gold?", "Answer": "Au"},
    {"Question": "What is the name of the famous Swiss mountain known for skiing?", "Answer": "The Matterhorn"},
    {"Question": "What is the largest flower in the world?", "Answer": "Rafflesia arnoldii"},
]

df = pd.DataFrame(qa_pairs)

config = yaml.safe_load(
    """
        input_features:
            - name: Question
              type: text
        output_features:
            - name: Answer
              type: text
        model_type: llm
        generation:
            temperature: 0.1
            top_p: 0.75
            top_k: 40
            num_beams: 4
            max_new_tokens: 5
        base_model: facebook/opt-350m
    """
)

# Define Ludwig model object that drives model training
model = LudwigModel(config=config, logging_level=logging.INFO)

# initiate model training
(
    train_stats,  # dictionary containing training statistics
    preprocessed_data,  # tuple Ludwig Dataset objects of pre-processed training data
    output_directory,  # location of training results stored on disk
) = model.train(
    dataset=df, experiment_name="simple_experiment", model_name="simple_model", skip_save_processed_input=True
)

training_set, val_set, test_set, _ = preprocessed_data

# batch prediction
preds, _ = model.predict(test_set, skip_save_predictions=False)
print(preds)

ModuleNotFoundError: No module named 'ludwig'

In [3]:
pip install ludwig

Collecting ludwig
  Downloading ludwig-0.10.4.tar.gz (1.1 MB)
     ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
     --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
     ------------------- -------------------- 0.5/1.1 MB 1.7 MB/s eta 0:00:01
     ---------------------------------------- 1.1/1.1 MB 1.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting Cython>=0.25 (from ludwig)
  Downloading Cython-3.0.11-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting h5py!=3.0.0,>=2.6 (from ludwig)
  Downloading h5py-3.12.1-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting torch

  You can safely remove it manually.
ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\mayur\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\transformers\\models\\deprecated\\trajectory_transformer\\convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py'



   ---------------------- --------------- 524.3/897.5 kB 699.0 kB/s eta 0:00:01
   ---------------------------------------- 897.5/897.5 kB 1.1 MB/s eta 0:00:00
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/991.5 kB ? eta -:--:--
   --------------------- ------------------ 524.3/991.5 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 1.9 MB/s eta 0:00:00
Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   --- ------------------------------------ 0.5/5.5 MB 2.4 MB/s eta 0:00:03
   ------- -------------------------------- 1.0/5.5 MB 2.4 MB/s eta 0:00:02
   --------- ------------------------------ 1.3/5.5 MB 2.3 MB/s eta 0:00:02
   ----------- ---------------------------- 1.6/5.5 MB 2.1 MB/s eta 0:00:02
   ------------- ----------------------

In [1]:
import pandas as pd
qa_pairs = [
    {"Question": "What is the capital of Uzbekistan?", "Answer": "Tashkent"},
    {"Question": "Who is the founder of Microsoft?", "Answer": "Bill Gates"},
    {"Question": "What is the tallest building in the world?", "Answer": "Burj Khalifa"},
    {"Question": "What is the currency of Brazil?", "Answer": "Real"},
    {"Question": "What is the boiling point of mercury in Celsius?", "Answer": "-38.83"},
    {"Question": "What is the most commonly spoken language in the world?", "Answer": "Mandarin"},
    {"Question": "What is the diameter of the Earth?", "Answer": "12,742 km"},

]

df = pd.DataFrame(qa_pairs)

In [2]:
df

Unnamed: 0,Question,Answer
0,What is the capital of Uzbekistan?,Tashkent
1,Who is the founder of Microsoft?,Bill Gates
2,What is the tallest building in the world?,Burj Khalifa
3,What is the currency of Brazil?,Real
4,What is the boiling point of mercury in Celsius?,-38.83
5,What is the most commonly spoken language in t...,Mandarin
6,What is the diameter of the Earth?,"12,742 km"
