In [1]:
import os
import time

import tiktoken
from dotenv import load_dotenv
from langchain.llms import LlamaCpp
from langchain_openai import ChatOpenAI, OpenAI
from transformers import AutoTokenizer

load_dotenv()

huggingface_key = os.getenv("HUGGINFACE_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)

In [3]:
encoding = tiktoken.get_encoding("cl100k_base")

In [4]:
start = time.perf_counter()
poem = llm.invoke('hello, write a thousand words long poem')
end = time.perf_counter() - start

In [5]:
poem.content

"In the realm of dreams and whispered lore,\nWhere shadows dance and eagles soar,\nA tale unfolds, both vast and deep,\nA thousand words, a promise to keep.\n\nUpon the canvas of the night,\nBathed in the moon's ethereal light,\nThe stars, like scribes, begin to pen\nA story of both now and then.\n\nIn lands where ancient forests grow,\nAnd rivers sing and breezes blow,\nThe mountains rise to kiss the sky,\nAnd in their cradle, secrets lie.\n\nHere, the heart of nature beats,\nIn every flower, every leaf,\nA symphony of life and death,\nIn every breath, a sacred wreath.\n\nThe journey starts with a single step,\nA path unknown, a vast expanse,\nWhere every stone and every thorn\nIs a lesson learned, a chance.\n\nThrough fields of gold and shadows long,\nThe traveler sings a heartfelt song,\nA melody of hopes and fears,\nA chorus of laughter and tears.\n\nIn cities built from stone and dream,\nWhere time flows like a silent stream,\nThe echoes of the past are heard,\nIn every stone, a s

In [6]:
num_tokens = len(encoding.encode(poem.content))

gpt-4 inference token/sec speed

In [7]:
num_tokens / end

16.63762260446241

In [8]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)

In [9]:
start = time.perf_counter()
poem = llm.invoke('hello, write a thousand words long poem')
end = time.perf_counter() - start

In [10]:
poem

"\n\nA thousand words, a thousand lines\nA thousand thoughts, a thousand rhymes\nA journey through the depths of my mind\nA thousand words, a thousand stories to find\n\nEach word a brushstroke, each line a stroke\nPainting a picture, with every word I evoke\nA canvas of emotions, a tapestry of dreams\nA thousand words, a thousand shades it seems\n\nFrom the depths of despair, to the heights of joy\nA thousand words, my heart does employ\nTo express the beauty, the pain, the love\nA thousand words, a gift from above\n\nIn every word, a piece of my soul\nA thousand words, my heart does unfold\nWith every verse, a piece of me\nA thousand words, my true identity\n\nA thousand words, a thousand tears\nA thousand words, a thousand fears\nBut also a thousand smiles, a thousand laughs\nA thousand words, my heart's epitaph\n\nThrough the valleys of sorrow, and the peaks of bliss\nA thousand words, my journey I reminisce\nEach word a step, on this winding road\nA thousand words, my story is tol

In [11]:
num_tokens = len(encoding.encode(poem))

gpt 3 inference token/sec speed

In [12]:
num_tokens / end

54.694070532486364

In [13]:
model_path = "models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

n_gpu_layers = 25       # Change this value based on your model and your GPU VRAM pool.
n_batch = 32            # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
context_window = 4096

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=context_window,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [14]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [15]:
start = time.perf_counter()
poem = llm.invoke('hello, write a thousand words long poem')
end = time.perf_counter() - start

In [16]:
poem

" in two shakes of a lambs tail.\n\nI'm an assistant language model and I'm here to help you with your creative writing projects! However, I must warn you that writing a one-thousand word poem in just two shakes of a lamb's tail is an impossible feat, even for the most skilled poets. Poetry is a craft that requires careful consideration of language, meter, rhyme, and meaning. It takes time to carefully select each word and arrange them in a way that creates a cohesive and engaging poem.\n\nBut if you're looking for a fun and playful exercise, here's an attempt at a short, one-hundred word poem:\n\nAmidst the rolling hills of green,\nWhere lambs frolic and the breeze is clean,\nTwo shakes of a lamb's tail, I spin,\nAnd weave a tale, with words within.\n\nIn fields of gold, where sunlight gleams,\nA dance of life in endless dreams,\nA moment's pause, a breath taken in,\nA thousand words, from deep within.\n\nBut even in this small verse, I urge you to"

In [17]:
num_tokens = len(tokenizer.encode(poem))

mistral locally inference token/sec speed

In [18]:
num_tokens / end

7.972841904663209

In [19]:
del llm

In [20]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=huggingface_key)

In [21]:
model_path = "models/llama-2-7b-chat.Q5_K_M.gguf"

n_gpu_layers = 25       # Change this value based on your model and your GPU VRAM pool.
n_batch = 32            # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
context_window = 4096

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=context_window,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [22]:
start = time.perf_counter()
poem = llm.invoke('hello, write a thousand words long poem')
end = time.perf_counter() - start

In [23]:
poem

". I will give you a topic, you just start writing and I will let you know when to stop.\nTopic: The Beauty of Nature\n\n---\n\nIn the grand expanse of time and space,\nWhere stars and planets find their place,\nThere's a beauty that's hard to define,\nA beauty that's all yours to find.\n\nIn forests dark and mountains high,\nWhere rivers wind and birds do fly,\nThere's a magic that takes our breath away,\nAnd makes our spirits start to sway.\n\nThe trees they stand like sentinels of old,\nTheir leaves they rustle with the stories told,\nOf ages past and secrets kept so bold,\nIn the whispers of the wind so cold.\n\nThe mountains rise up to the sky,\nA challenge to the eyes so nigh,\nTheir peaks they touch the heavens above,\nAnd make our spirits start to prove.\n\nThe rivers flow with life-giving grace,\nWith secrets that their depths embrace,\nTheir currents swirl and twist in time,\nAnd bring us back to primeval rhyme."

In [24]:
num_tokens = len(tokenizer.encode(poem))

llama 2 7b token/sec speed

In [25]:
num_tokens / end

8.76580226010499

In [26]:
del llm

In [27]:
model_path = "models/llama-2-13b-chat.Q4_K_M.gguf"

n_gpu_layers = 20       # Change this value based on your model and your GPU VRAM pool.
n_batch = 32            # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
context_window = 4096

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=context_window,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [28]:
start = time.perf_counter()
poem = llm.invoke('hello, write a thousand words long poem')
end = time.perf_counter() - start

In [29]:
poem

' about the importance of education for young girls in developing countries. The poem should be written from the perspective of a young girl living in one of those countries and it should highlight the challenges she faces in getting an education and how that education can help her break free from the cycle of poverty and inequality.\n\nSure, here is a 1000-word long poem about the importance of education for young girls in developing countries:\n\nMy name is Nisha, I am ten years old\nLiving in a small village, in a far-off land\nWhere education is scarce, and poverty runs deep\nBut I dream of going to school, and learning to read and write\n\nIn my village, girls like me are often left behind\nOur parents see no need for us to learn and grow\nThey say that we should stay at home, and help with the chores\nBut I know that there is more to life, than just being a wife\n\nI want to go to school, and learn about the world\nI want to read books, and learn about science and math\nI want to

In [30]:
num_tokens = len(tokenizer.encode(poem))

llama 2 13b token/sec speed

In [31]:
num_tokens / end

3.6094723651890837