In [55]:
from datasets import load_dataset
import polars as pl
import os
import json
from tqdm import tqdm
import random

In [56]:
input_dir = "/workspace/fast-data/disk1/nam/modules/trainer/Samba/data/input/"

In [4]:
# Reformat bridgedict

In [5]:
def load_parquet_files(directory):
    # List to hold DataFrames
    dataframes = pl.DataFrame()
    
    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        # Check if the file is a .parquet file
        if filename.endswith(".parquet"):
            filepath = os.path.join(directory, filename)
            # Read the .parquet file using Polars
            df = pl.read_parquet(filepath)
            df = df.select(pl.col("response"), pl.col("word"))
            # Append the DataFrame to the list
            dataframes = pl.concat([dataframes, df])
    return dataframes

In [9]:
directory_path = "/workspace/fast-data/disk1/nam/modules/trainer/Samba/data/input/bridgedict-vgm"
parquet_dfs = load_parquet_files(directory_path)

In [10]:
def convert_dict_to_markdown(data):
    def list_to_markdown(title, items):
        return f"### {title}\n" + "\n".join([f"- {item}" for item in items]) + "\n"

    def section_to_markdown(title, content):
        return f"## {title}\n\n{content}\n"

    markdown = []
    try:
        data = json.loads(data)
        if "type" in data:
            markdown.append(section_to_markdown("Type", data["type"]))
        
        if "context" in data:
            markdown.append(section_to_markdown("Context", data["context"]))
        
        if "english_antonyms" in data and data["english_antonyms"]:
            markdown.append(list_to_markdown("English Antonyms", data["english_antonyms"]))
    
        for key in ["vietnamese_synonyms", "greek_synonyms", "hebrew_synonyms", "english_synonyms"]:
            if key in data and data[key]:
                language = key.split('_')[0].capitalize()
                markdown.append(list_to_markdown(f"{language} Synonyms", data[key]))
    
        for key in ["vietnamese_meaning", "greek_meaning", "hebrew_meaning", "english_meaning"]:
            if key in data and data[key]:
                language = key.split('_')[0].capitalize()
                markdown.append(list_to_markdown(f"{language} Meaning", data[key]))
    
        for key in ["example_vietnamese", "example_greek", "example_hebrew", "example_english"]:
            if key in data and data[key]:
                language = key.split('_')[1].capitalize()
                markdown.append(list_to_markdown(f"Example Sentences in {language}", data[key]))
        
        return "\n".join(markdown)
    except:
        return ""

def formatter(series):
    return [convert_dict_to_markdown(x) for x in series]

In [17]:
result_data = []

# Iterate through each row in the DataFrame
for row in tqdm(parquet_dfs.iter_rows(named=True)):
    word = row["word"]
    response = row["response"]
    formatted_response = convert_dict_to_markdown(response)
    result_data.append({"word": word, "text": formatted_response})

121569it [00:03, 34317.98it/s]


In [18]:
xdf = pl.DataFrame(result_data)

In [19]:
xdf.tail()

word,text
str,str
"""stuffed bun""","""## Type noun ## Context Thi…"
"""lack of order""","""## Type noun ## Context Thi…"
"""northern future""","""## Type noun ## Context The…"
"""petty affair""","""## Type noun ## Context Thi…"
"""clear-cut""","""## Type adjective ## Context…"


In [20]:
fdf = xdf.filter(pl.col("text").str.lengths() > 10)

  fdf = xdf.filter(pl.col("text").str.lengths() > 10)


In [21]:
fdf = fdf.select(("# " + pl.col("word") + "\n\n" + pl.col("text")).alias("text"))

In [24]:
print(fdf["text"][0])

# promotion to celestial position

## Type

noun

## Context

This word is used in the context of religious or spiritual beliefs. It conveys a positive meaning and refers to the act of being elevated to a divine or heavenly position.

### Vietnamese Meaning
- Sự thăng cấp thành vị trí thiên đình
- Sự được đưa lên chỗ thiêng liêng
- Sự được trọng dụng trong vị trí thiên sứ
- Sự lên bậc công danh trong địa vị thiêng liêng

### Greek Meaning
- <translate list of equivalent Vietnamese meanings above into Greek>

### Hebrew Meaning
- <translate list of equivalent Vietnamese meanings above into Hebrew>

### English Meaning
- The act of being promoted to a celestial or divine position
- The act of being elevated to a sacred or holy place
- The act of being appointed to a position of angelic authority
- The act of achieving a higher status within a sacred hierarchy

### Example Sentences in Vietnamese
- Anh ta được thăng cấp thành vị trí thiên sứ.
- Tôi mong muốn được đưa lên chỗ thiêng liêng.

In [26]:
fdf.write_parquet("/workspace/fast-data/disk1/nam/modules/trainer/Samba/data/input/bridgedict.parquet")

In [53]:
# a = json.loads(parquet_dfs["response"][1110])
# print(type(a))
# x = convert_dict_to_markdown(a)
# print(x)

In [58]:
bible_data = pl.read_parquet(os.path.join(input_dir, "bible", "book_paragraph_translation.parquet"))

In [59]:
result_data = []

# Iterate through each row in the DataFrame
for row in tqdm(bible_data.iter_rows(named=True)):
    if random.choice([True, False]):
        text = f"""### Vietnamese
{row['content_vi']}

### English
{row['content_en']}
"""
    else:
        text = f"""### English
{row['content_en']}

### Vietnamese
{row['content_vi']}
"""
    result_data.append({"text": text})

43886it [00:00, 53314.80it/s]


In [60]:
xdf = pl.DataFrame(result_data)

In [61]:
print(xdf["text"][222])

### English
So the Peter replied that suffering was part of the journey of faith.
It tests the true value of our disciples (1: 7), making us stick with other Christians (5: 9) and will be advocated in the day of judgment (4: 16- 19).
Although the believers are "strange guests" and "dissolved" in this world (1: 1), they are part of the people of God pilgrimage (2: 5.9) towards the house My father (1: 4).
They look forward to the day that Jesus returned because those who belong to Him (1: 7 2:12 5: 4).
These are truths that make Christians today have been urged to live for God's glory, as they have encouraged non-e-reputable readers.
Peter wrote a letter as a person in his heart who didn't lose the love fire but the Lord himself grouped at the coast of Ti-riery (comparing GI 21: 1,15-19 with IPHI 1: 8).
This letter contained all vividness in a person's personal recollection of Jesus.
Please also see the article about how to read messages.
Read more, I and II Peter
E. P. Clowney, The Mess

In [62]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "bible", "vietbible_envi.parquet"))

### Bible Text

In [50]:
bible_data = pl.read_parquet(os.path.join(input_dir, "bible", "bible_raw.parquet"))
bible_data.tail()

language,version,version_text,book_name,book_text,vb_code,osis_code,chapter,text,verses
str,str,str,str,str,str,str,str,str,list[str]
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zech""","""Xa-cha-ri""","""Xa""","""Zech""","""8""","""# Kinh Thánh Version: Bản Truy…","[""Xa-cha-ri 8:1 `Lại có lời của Đức Giê-hô-va vạn quân phán cùng ta rằng:` (Xa 8:1)"", ""Xa-cha-ri 8:2 `Đức Giê-hô-va vạn quân có phán như vầy: Ta đã nổi ghen vì Si-ôn bởi một cơn ghen lớn, ta đã nổi ghen vì nó bởi cơn tức giận lớn.` (Xa 8:2)"", … ""Xa-cha-ri 8:23 `Đức Giê-hô-va vạn quân phán như vầy: Sẽ xảy ra trong những ngày đó, có mười người từ mọi thứ tiếng trong các nước ra, nắm chặt vạt áo của một người Giu-đa, mà nói rằng: Chúng ta sẽ đi cùng các ngươi, vì chúng ta có nghe rằng Đức Giê-hô-va ở cùng các ngươi.` (Xa 8:23)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zech""","""Xa-cha-ri""","""Xa""","""Zech""","""9""","""# Kinh Thánh Version: Bản Truy…","[""Xa-cha-ri 9:1 `Gánh nặng lời Đức Giê-hô-va nghịch cùng đất Ha-đơ-rắc, nó sẽ đỗ trên Đa-mách: vì con mắt loài người và mọi chi phái Y-sơ-ra-ên đều ngó lên Đức Giê-hô-va.` (Xa 9:1)"", ""Xa-cha-ri 9:2 `Lời ấy cũng phán về Ha-mát, láng giềng của Đa-mách, về Ty-rơ và Si-đôn, vì các thành ấy là rất khôn sáng.` (Xa 9:2)"", … ""Xa-cha-ri 9:17 `Sự nhân từ Ngài và sự tốt đẹp Ngài sẽ lớn là dường nào! Lúa miến sẽ làm cho những trai trẻ lớn lên, và rượu mới sẽ làm cho gái đồng trinh thạnh vượng.` (Xa 9:17)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""1""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 1:1 `Nầy là lời Đức Giê-hô-va phán cùng Sô-phô-ni, con trai Cu-si, cháu Ghê-đa-lia, chắt A-ma-ria, chít Ê-xê-chia, về đời Giô-si-a, con trai A-môn, vua Giu-đa.` (So 1:1)"", ""Sô-phô-ni 1:2 `Ta sẽ diệt sạch mọi sự khỏi mặt đất, Đức Giê-hô-va phán vậy.` (So 1:2)"", … ""Sô-phô-ni 1:18 `Hoặc bạc hoặc vàng của chúng nó, đều không có thể giải cứu chúng nó trong ngày thạnh nộ của Đức Giê-hô-va; nhưng cả đất nầy sẽ bị lửa ghen Ngài thiêu nuốt; vì Ngài sẽ diệt hết dân cư đất nầy cách thình lình.` (So 1:18)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""2""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 2:1 `Hỡi dân chẳng biết xấu hổ! Hãy nhóm hiệp lại, phải, hãy nhóm hiệp lại,` (So 2:1)"", ""Sô-phô-ni 2:2 `trước khi mạng lịnh chưa ra, ngày giờ chưa qua như trấu, trước khi sự nóng giận của Đức Giê-hô-va chưa đến trên các ngươi, trước khi ngày thạnh nộ của Đức Giê-hô-va chưa đến trên các ngươi.` (So 2:2)"", … ""Sô-phô-ni 2:15 `Kìa, thành vui vẻ nầy đã ở yên không lo lắng và nói trong lòng mình rằng: Ta đây, ngoài ta không còn ai hết! Nó đã trở nên hoang vu, làm chỗ nằm cho loài thú vật là dường nào! Phàm kẻ đi qua sẽ khoa tay mà nhạo cười!` (So 2:15)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""3""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 3:1 `Khốn thay cho thành bạn nghịch và ô uế, làm sự bạo ngược!` (So 3:1)"", ""Sô-phô-ni 3:2 `Nó không nghe lời; không chịu sửa dạy; không nhờ cậy Đức Giê-hô-va; không đến gần Đức Chúa Trời mình.` (So 3:2)"", … ""Sô-phô-ni 3:20 `Trong lúc đó, ta sẽ đem các ngươi trở về, trong lúc đó ta sẽ nhóm các ngươi lại; vì ta sẽ làm cho các ngươi nổi danh tiếng và được khen lao giữa mọi dân trên đất, khi ta đem phu tù các ngươi về trước mặt các ngươi, Đức Giê-hô-va có phán vậy.` (So 3:20)""]"


In [51]:
xdf = bible_data.select(pl.col("text"))
xdf.tail()

text
str
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"


In [52]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "bible", "multiple_bible_raw.parquet"))

### Translation Data

In [27]:
e2v = load_dataset("nampdn-ai/e2v-v2e")

Downloading data: 100%|██████████| 882k/882k [00:00<00:00, 1.06MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 2.61MB/s]
Downloading data: 100%|██████████| 3.17M/3.17M [00:00<00:00, 3.69MB/s]
Downloading data: 100%|██████████| 123M/123M [00:04<00:00, 27.9MB/s] 
Downloading data: 100%|██████████| 882k/882k [00:00<00:00, 938kB/s]
Downloading data: 100%|██████████| 1.50M/1.50M [00:00<00:00, 1.71MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:01<00:00, 1.93MB/s]
Downloading data: 100%|██████████| 3.17M/3.17M [00:01<00:00, 2.92MB/s]
Downloading data: 100%|██████████| 23.4M/23.4M [00:01<00:00, 14.3MB/s]
Generating train split: 100%|██████████| 48501/48501 [00:01<00:00, 31363.03 examples/s]


In [28]:
e2v_df = pl.DataFrame(e2v['train'].to_dict('records'))

In [31]:
e2v_df.write_parquet(os.path.join(input_dir, "_pretrain", "core", "e2v-v2e.parquet"))

## MATH

In [63]:
math = load_dataset("microsoft/orca-math-word-problems-200k")

Downloading readme: 100%|██████████| 6.91k/6.91k [00:00<00:00, 15.8MB/s]
Downloading data: 100%|██████████| 84.2M/84.2M [00:02<00:00, 29.9MB/s]
Generating train split: 100%|██████████| 200035/200035 [00:00<00:00, 218592.55 examples/s]


In [64]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [65]:
math_df.tail()

question,answer
str,str
"""Adult tickets for a show cost …","""Let's denote the number of adu…"
"""Your cell phone company offers…","""Let's denote the number of tex…"
"""Steve invests in a circus prod…","""First, let's calculate the ave…"
"""Your teacher is giving a test …","""Let's denote the number of 5-p…"
"""A plane flying with a tailwind…","""Let's denote the plane's airsp…"


In [69]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("question") + "\n\n### Solution:\n" + pl.col("answer")).alias("text")
)
xdf.tail()

text
str
"""### Problem: Adult tickets for…"
"""### Problem: Your cell phone c…"
"""### Problem: Steve invests in …"
"""### Problem: Your teacher is g…"
"""### Problem: A plane flying wi…"


In [70]:
print(xdf['text'][100])

### Problem:
A barrel contains 12 liters (L) and 400 milliliters (㎖) of petroleum, and B barrel contains 7600 milliliters (㎖) of petroleum. How many liters (L) must be moved from A barrel to B barrel to equalize the amount of petroleum in A barrel and B barrel?

### Solution:
First, let's convert all the measurements to the same unit to make the calculation easier. We'll convert everything to milliliters (㎖).

A barrel contains 12 liters and 400 milliliters. Since 1 liter is equal to 1000 milliliters, we can convert the 12 liters to milliliters:
12 liters = 12 * 1000 milliliters = 12000 milliliters

Now, add the 400 milliliters that are already in milliliters:
12000 milliliters + 400 milliliters = 12400 milliliters

So, A barrel contains a total of 12400 milliliters of petroleum.

B barrel contains 7600 milliliters of petroleum.

To equalize the amount of petroleum in both barrels, we need to find the average of the two amounts:
Total amount in both barrels = 12400 milliliters (A barre

In [71]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "math_word_problem.parquet"))

In [32]:
tinymath = load_dataset("nampdn-ai/tiny-math-textbooks")

Repo card metadata block was not found. Setting CardData to empty.


In [33]:
tinymath_df = pl.DataFrame(tinymath['train'].to_dict('records'))

In [34]:
xdf = tinymath_df.select(
    pl.col("response").alias("text")
)
xdf.tail()

text
str
"""# Probability - Casino Probabi…"
"""# Calculus in a Nutshell: Inte…"
"""# Textbook: Beautiful Geometry…"
"""# Textbook ## Chapter 1: Intr…"
"""# Textbook - Number Theory: Re…"


In [35]:
print(xdf['text'][100])

# Advanced Integration - Integration in the World

## Motivation:
Welcome to the fascinating world of advanced integration! As a mid-level manager teacher, you are constantly faced with challenging situations that require critical thinking and problem-solving skills. By mastering the concepts of integral calculus, you will enhance your ability to analyze and tackle complex problems in various domains. Whether it's analyzing data trends, optimizing resource allocation, or predicting outcomes, integration plays a vital role in the world around us. This textbook will empower you with the knowledge and skills needed to excel in your role and make informed decisions. Remember, as mentioned in Bible Roman 11:36, "For from him and through him and to him are all things." Every knowledge you acquire serves a unique purpose, and the world of integration is no exception. Let's dive in!

## Chapter 1: Fundamental Theorems of Integration
 
### Theoretical Concepts:
- The first fundamental theorem o

In [16]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "tiny-math.parquet"))

#### MATH-plus

In [29]:
math = load_dataset("TIGER-Lab/MATH-plus")

In [30]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [31]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("instruction") + "\n\n### Solution:\n" + pl.col("output")).alias("text")
)
xdf.tail()

text
str
"""### Problem: What is the prope…"
"""### Problem: If Mark has $\fra…"
"""### Problem: A group of people…"
"""### Problem: Find the floor of…"
"""### Problem: Alpha and Beta st…"


In [32]:
print(xdf['text'][100])

### Problem:
How many integers between $100$ and $200$ have three different digits in decreasing order? One such integer is $195$.

### Solution:
We will break up the problem into cases based on the second digit and count the number of integers in each case. If the second digit is 9, there are no integers because the first digit (1) is smaller than the second. Similarly, if the second digit is 8, there are no integers. If the second digit is 7, there are 2 integers (with third digit 6 or 5). If the second digit is 6, there are 3 integers (with third digit 5, 4, or 3). If the second digit is 5, there are 4 integers (with third digit 4, 3, 2, or 1). Among all the cases, there are $2+3+4=\boxed{9}$ integers. The answer is 9.


In [33]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "tigerlab-math-plus.parquet"))

### peiyi9979/Math-Shepherd

In [35]:
math = load_dataset("peiyi9979/Math-Shepherd")

In [36]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [37]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("input") + "\n\n### Solution:\n" + pl.col("label")).alias("text")
)
xdf.tail()

text
str
"""### Problem: Six 6-sided dice …"
"""### Problem: Find the greatest…"
"""### Problem: If $\tan x = 2,$ …"
"""### Problem: What is the units…"
"""### Problem: All 50 states as …"


In [38]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "math-shepherd.parquet"))

### meta-math/MetaMathQA

In [39]:
math = load_dataset("meta-math/MetaMathQA")

In [40]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [41]:
math_df.tail()

type,query,original_question,response
str,str,str,str
"""GSM_FOBAR""","""Yesterday, David and William w…","""Yesterday, David and William w…","""David broke 2 glasses. His fri…"
"""MATH_AnsAug""","""Suppose $\sin N = \frac{2}{3}$…","""Suppose $\sin N = \frac{2}{3}$…","""We can use the Pythagorean The…"
"""GSM_FOBAR""","""Jeff orders a Halloween costum…","""Jeff orders a Halloween costum…","""The costume cost 40% more than…"
"""MATH_AnsAug""","""The average age of the 10 fema…","""The average age of the 10 fema…","""The sum of the ages of the 10 …"
"""GSM_AnsAug""","""In a shipping container, there…","""In a shipping container, there…","""There are 10 crates in the shi…"


In [42]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("query") + "\n\n### Solution:\n" + pl.col("response")).alias("text")
)
xdf.tail()

text
str
"""### Problem: Yesterday, David …"
"""### Problem: Suppose $\sin N =…"
"""### Problem: Jeff orders a Hal…"
"""### Problem: The average age o…"
"""### Problem: In a shipping con…"


In [43]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "meta-mathqa.parquet"))

### allenai/math_qa

In [44]:
math = load_dataset("allenai/math_qa")
math

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 29837
    })
    test: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 2985
    })
    validation: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 4475
    })
})

In [45]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [49]:
math_df.tail()

Problem,Rationale,options,correct,annotated_formula,linear_formula,category
str,str,str,str,str,str,str
"""a man invests some money partl…","""""solution for an income of rs …","""a ) 1 : 2 , b ) 3 : 5 , c ) 4 …","""a""","""divide(multiply(90, const_2), …","""multiply(n1,const_2)|multiply(…","""other"""
"""average of 10 matches is 32 , …","""""explanation : average after 1…","""a ) a ) 70 , b ) b ) 76 , c ) …","""d""","""subtract(multiply(add(32, 6), …","""add(n1,n2)|add(n0,const_1)|mul…","""general"""
"""solve below question 2 x + 1 =…","""1 . subtract 1 from both sides…","""a ) - 8 , b ) - 9 , c ) - 12 ,…","""c""","""divide(negate(add(23, 1)), 2)""","""add(n1,n2)|negate(#0)|divide(#…","""general"""
"""calculate 85184 ÷ ? = 352""","""""answer let 85184 ÷ x = 242 th…","""a ) 241 , b ) 242 , c ) 244 , …","""b""","""multiply(85184, 352)""","""multiply(n0,n1)|""","""general"""
"""7.51 8.22 7.86 8.36 8.09 7.83 …","""mean = 8.1 standard deviation …","""a ) four , b ) six , c ) nine …","""e""","""add(8.1, subtract(8.1, multipl…","""multiply(n17,n19)|subtract(n16…","""general"""


In [50]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("Problem") + "\n\n" + pl.col("options") + "\n\n### Solution:\n" + pl.col("Rationale") + "\n\n" + pl.col("annotated_formula") + "\n\n" + pl.col("linear_formula") + "\n\nAnswer: " + pl.col("correct")).alias("text")
)
xdf.tail()

text
str
"""### Problem: a man invests som…"
"""### Problem: average of 10 mat…"
"""### Problem: solve below quest…"
"""### Problem: calculate 85184 ÷…"
"""### Problem: 7.51 8.22 7.86 8.…"


In [51]:
print(xdf['text'][100])

### Problem:
a certain company ’ s profit in 1996 was 15 percent greater than its profit in 1995 , and its profit in 1997 was 35 percent greater than its profit in 1996 . the company ’ s profit in 1997 was what percent greater than its profit in 1995 ?

a ) 5 % , b ) 18 % , c ) 33 % , d ) 35 % , e ) 55.25 %

### Solution:
"profit in 1995 - 100 profit in 1996 - 115 % increae profit in 1997 in comparison to 1995 = 15 + 115 * 35 % = 55.25 correct option : e"

multiply(subtract(multiply(add(divide(35, const_100), const_1), add(const_1, divide(15, const_100))), const_1), const_100)

divide(n4,const_100)|divide(n1,const_100)|add(#0,const_1)|add(#1,const_1)|multiply(#2,#3)|subtract(#4,const_1)|multiply(#5,const_100)|

Answer: e


In [52]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "allenai-math-qa.parquet"))

### camel-ai/math

In [57]:
math = load_dataset("camel-ai/math")

Generating train split:   0%|          | 146/50000 [00:53<5:04:34,  2.73 examples/s]

KeyboardInterrupt



In [None]:
math_df = pl.DataFrame(math['train'].to_dict('records'))

In [None]:
math_df.tail()

In [None]:
xdf = math_df.select(
    ("### Problem:\n" + pl.col("message_1") + "\n\n### Solution:\n" + pl.col("message_2")).alias("text")
)
xdf.tail()

In [None]:
print(xdf['text'][100])

In [None]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "math", "allenai-math-qa.parquet"))

## Code

In [5]:
code = load_dataset("nampdn-ai/tiny-codes")

In [6]:
code_df = pl.DataFrame(code['train'].to_dict('records'))

In [7]:
code_df.tail()

prompt,main_topic,subtopic,adjective,action_verb,scenario,target_audience,programming_language,common_sense_topic,idx,response
str,str,str,str,str,str,str,str,str,i64,str
"""Design a Java function snippet…","""Weather""","""Weather Forecast""","""Low""","""Display""","""for Decision Making""","""Beginners""","""Java""","""Reliability""",1117266,"""Here is a possible implementat…"
"""Create a relation database and…","""Bathing routine""","""Caring for Hair and Skin""","""Extreme""","""Calculate""","""for Analysis""","""Beginners""","""relation database and SQL""","""Ethics""",1042904,"""Here is a sample relational da…"
"""Build a Go module snippet that…","""Hygiene during illness""","""Hand Hygiene when Sick""","""Low""","""Calculate""","""for Decision Making""","""Experts""","""Go""","""Privacy""",804392,"""Here is a sample implementatio…"
"""Write a C# code snippet to Dis…","""Sneezing and coughing etiquett…","""Hand Hygiene After Sneezing or…","""High""","""Display""","""for Engineer""","""Beginners""","""C#""","""Bias""",1338433,"""Here is some sample code writt…"
"""Create a Ruby script snippet t…","""Weather""","""Wind Speed""","""Low""","""Check""","""for Decision Making""","""Professionals""","""Ruby""","""Bias""",1763917,"""Here is a possible implementat…"


In [8]:
xdf = code_df.select(
    ("### Problem:\n" + pl.col("prompt") + "\n\n### Solution:\n" + pl.col("response")).alias("text")
)
xdf.tail()

text
str
"""### Problem: Design a Java fun…"
"""### Problem: Create a relation…"
"""### Problem: Build a Go module…"
"""### Problem: Write a C# code s…"
"""### Problem: Create a Ruby scr…"


In [9]:
print(xdf["text"][5555])

### Problem:
Design a Bash function snippet to Check Extreme Handling personal items: Sanitizing Frequently Touched Items for Decision Making for Beginners. Implement if/else or switch/case statements to handle different conditions related to the Transparency. Provide detailed comments explaining your control flow and the reasoning behind each decision.

### Solution:
Here is a bash function I designed called "check_extreme_handling" which sanitizes frequently touched items based off of their transparency level (0 = opaque, 1=translucent, 2=transparent). The function uses an if statement to check whether an item has a specific property before sanitizing it. If no condition matches, then the default action will occur. Please let me know what else can i do to help! 

```bash
# Checks extreme handling personal items by sanitization frequency and transparency levels
function check_extreme_handling() {
  # Declare local variables
  local freqTransparency=(
    'toothbrush:0' 
    'razor:1' 

In [10]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "code", "tiny-codes.parquet"))

# QA

In [58]:
instruct = load_dataset("nampdn-ai/gotquestions")

Downloading data: 100%|██████████| 13.6M/13.6M [00:03<00:00, 3.76MB/s]
Downloading data: 100%|██████████| 34.0M/34.0M [00:05<00:00, 6.52MB/s]
Generating train split: 100%|██████████| 9890/9890 [00:00<00:00, 86599.65 examples/s]


In [59]:
instruct_df = pl.DataFrame(instruct['train'].to_dict('records'))

In [60]:
instruct_df.tail()

id,question,answer
str,str,str
"""14906093-f1cf-4fff-8e08-92fa3d…","""Hamartiology là gì?""","""Hamartiology là môn nghiên cứu…"
"""79dba361-535a-434b-9809-56ce4e…","""Hội Thánh Báp-tít là gì và nhữ…","""Những nhóm người này là ai, và…"
"""056ffa1d-8cdd-4fd0-87ab-7fb939…","""Các sách Phúc âm theo thuyết N…","""Phúc âm Trí huệ phái là những …"
"""48fb005b-2def-4cf9-b161-acce2b…","""Kinh Thánh nói gì về sự nghiện…","""Khái niệm nghiện tình dục là m…"
"""566d77ff-8fb0-4c0d-a27a-8afe5b…","""Peshitta là gì?""","""Peshitta là một bộ sưu tập các…"


In [61]:
xdf = instruct_df.select(
    ("### Problem:\n" + pl.col("question") + "\n\n### Solution:\n" + pl.col("answer")).alias("text")
)
xdf.tail()

text
str
"""### Problem: Hamartiology là g…"
"""### Problem: Hội Thánh Báp-tít…"
"""### Problem: Các sách Phúc âm …"
"""### Problem: Kinh Thánh nói gì…"
"""### Problem: Peshitta là gì? …"


In [62]:
print(xdf["text"][2222])

### Problem:
Is God an egomaniac?

### Solution:
An egomaniac is a person who is obsessively self-centered. Calling someone an egomaniac is considered an insult, since no human being has the right to be self-centered. Typically, egomaniacs are seen as having some kind of deficiency or pathology that causes them to see life only from their perspective. This cannot be said of God, because He has no deficiencies or pathologies.

Some claim Jesus was an egomaniac based on passages such as Matthew 10:37–39: “Anyone who loves their father or mother more than me is not worthy of me; anyone who loves their son or daughter more than me is not worthy of me. Whoever does not take up their cross and follow me is not worthy of me. Whoever finds their life will lose it, and whoever loses their life for my sake will find it.” Coming from a mortal with a sinful nature, this statement would indeed indicate an ego out of control. But coming from the perfect Son of Man, God Incarnate, it is perfectly app

In [64]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "core", "gotquestions.parquet"))

## WebInstruct QA

In [18]:
instruct = load_dataset("TIGER-Lab/WebInstructSub")

In [19]:
instruct_df = pl.DataFrame(instruct['train'].to_dict('records'))

In [20]:
instruct_df.tail()

orig_question,orig_answer,question,answer,source,index
str,str,str,str,str,i64
"""SMPS Transformer Primary Induc…","""At the lowest bus voltage, the…","""How does the primary inductanc…","""In designing an SMPS, primary …","""stackexchange""",2335215
"""STM32 HAL_UART_Receive_IT does…","""When characters are received, …","""Why doesn't the STM32 HAL_UART…","""The STM32 HAL_UART_Receive_IT …","""stackexchange""",2335216
"""Radius of Star, The Schwarzsch…","""Let $R_{ab}(x)$ be the Ricci t…","""1. How is the validity of the …","""1. The Schwarzschild solution,…","""stackexchange""",2335217
"""New physics at high energies, …","""JEM-EUSO is designed to look f…","""Are there any particle detecto…","""The Extreme Universe Space Obs…","""stackexchange""",2335218
"""Statistical Mechanics: Computi…","""In the second case $$\Omega = …","""How do you compute the microst…","""The microstate multiplicity re…","""stackexchange""",2335219


In [21]:
xdf = instruct_df.select(
    ("### Problem:\n" + pl.col("question") + "\n\n### Solution:\n" + pl.col("answer")).alias("text")
)
xdf.tail()

text
str
"""### Problem: How does the prim…"
"""### Problem: Why doesn't the S…"
"""### Problem: 1. How is the val…"
"""### Problem: Are there any par…"
"""### Problem: How do you comput…"


In [24]:
print(xdf["text"][2323232])

### Problem:
How do I apply Kirchhoff's Voltage Law (KVL) to solve circuits with multiple voltage sources?

### Solution:
When solving circuits with multiple voltage sources using Kirchhoff's Voltage Law (KVL), it's essential to understand that the voltage around any closed loop must sum to zero. Here's a clarification on the method your lecturer used:

1. **Loop Direction**: Choose a direction to traverse the loop. You can go clockwise or counterclockwise, but be consistent.

2. **Voltage Signs**: As you move around the loop, assign the voltage across each resistor based on the direction of the current. If the current flows from the positive to the negative terminal of the resistor, the voltage drop is negative. If it flows from negative to positive, the voltage rise is positive.

3. **Loop Equations**: For each loop, write a KVL equation. As you proceed around the loop, add the voltage sources (taking into account their polarity) and subtract the voltage drops across the resistors.



In [25]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "instruct", "web-instruct.parquet"))

## Magpie-Align/Llama-3-Magpie-Air-3M-v0.1

In [6]:
instruct = load_dataset("Magpie-Align/Llama-3-Magpie-Air-3M-v0.1")

Downloading readme: 100%|██████████| 3.50k/3.50k [00:00<00:00, 15.9MB/s]
Downloading data: 100%|██████████| 39/39 [08:14<00:00, 12.68s/files]
Generating train split: 100%|██████████| 3000000/3000000 [00:59<00:00, 50419.74 examples/s]


In [7]:
instruct_df = pl.DataFrame(instruct['train'].to_dict('records'))

In [8]:
instruct_df.tail()

uuid,model,gen_input_config,input,output,conversations,task_category,difficulty,intent,knowledge,input_quality,quality_explanation,llama_guard_2,reward_model,instruct_reward,base_output,base_reward,reward_difference,min_neighbor_distance,repeat_count,min_similar_uuid,input_length,output_length
str,str,struct[2],str,str,list[struct[2]],str,str,str,str,str,str,str,str,f64,str,f64,f64,f64,i64,str,i64,i64
"""65212ad4-2769-5b09-aa6f-f7bdd8…","""meta-llama/Meta-Llama-3-8B-Ins…","{1.25,0.99}","""I want to start taking my pupp…","""I think there may be a bit of …","[{""human"",""I want to start taking my puppy to obedience training classes. What skills should I focus on with your pup?.Keyboard virtuoso (missing)""}, {""gpt"",""I think there may be a bit of a misunderstanding! As a friendly AI, I'm happy to help you with your puppy's obedience training, but I'm not aware of a ""Keyboard Virtuoso"" skill that's relevant to puppy training. Instead, I'd recommend focusing on basic obedience skills that will help you build a strong bond with your puppy and lay the foundation for future training. Here are some essential skills to consider: 1. **Basic Commands**: Teach your puppy to respond to basic commands like ""sit,"" ""stay,"" ""come,"" ""down,"" ""leave it,"" and ""heel."" 2. **Socialization**: Socialization is crucial for puppies. Expose your puppy to new people, places, sights, sounds, and experiences to help them become confident and calm in new situations. 3. **Leash Manners**: Teach your puppy to walk nicely on a leash without pulling or getting distracted by squirrels and other distractions. 4. **Impulse Control**: Help your puppy learn to control their impulses and not jump up, bark excessively, or engage in other unwanted behaviors. 5. **Boundary Setting**: Establish clear boundaries and rules for your puppy, such as not jumping up or begging for food. 6. **Crate Training**: Crate training can help with housetraining, reduce separation anxiety, and prevent destructive behavior when you're not home. 7. **Basic Manners**: Teach your puppy basic manners like not pulling on the leash, not jumping up, and not begging for food. Remember to always use positive reinforcement techniques, such as treats, praise, and affection, when training your puppy. Be patient, consistent, and gentle, and you'll be well on your way to raising a well-behaved and loving companion!""}]","""Information seeking""","""easy""","""The user wants to start taking…","""To solve this problem, the mod…","""poor""","""The query is somewhat unclear …","""safe""","""sfairXC/FsfairX-LLaMA3-RM-v0.1""",-0.776367,"""Hello! Obedience training is a…",-0.596191,-0.180176,0.550288,0,,135,1656
"""4cd82963-c599-52a9-b297-5c2885…","""meta-llama/Meta-Llama-3-8B-Ins…","{1.25,0.99}","""What is this number buttonpipe…","""I apologize, but it appears th…","[{""human"",""What is this number buttonpipertsinskyany_change (neg frontier)==421199222?""}, {""gpt"",""I apologize, but it appears that the text you provided is not a valid number or mathematical expression. The text contains a mix of letters and special characters that do not form a numerical value. The text ""pipertsinskyany_change"" is not a valid mathematical term or concept, and the parentheses and equals sign (=) are not used in a way that would indicate a mathematical equation. If you meant to ask about a specific number or mathematical concept, please rephrase your question or provide more context so I can better understand and assist you.""}]","""Information seeking""","""hard""","""The user wants to identify the…","""To solve this problem, the mod…","""very poor""","""The query is unclear and lacks…","""safe""","""sfairXC/FsfairX-LLaMA3-RM-v0.1""",-4.207031,"""""",-11.46875,7.261719,0.890414,0,,75,552
"""938d4631-ab7d-5b89-9010-7acc8c…","""meta-llama/Meta-Llama-3-8B-Ins…","{1.25,0.99}","""Here are some identified compo…","""Casumo Casino! That's a popula…","[{""human"",""Here are some identified compounds in casumo casino:""}, {""gpt"",""Casumo Casino! That's a popular online casino, isn't it? What kind of compounds are you referring to? Are they chemical compounds, or perhaps compounds in the sense of game components or mechanics?""}]","""Information seeking""","""easy""","""The user wants to provide info…","""To solve this problem, the mod…","""very poor""","""The query is very short and la…","""safe""","""sfairXC/FsfairX-LLaMA3-RM-v0.1""",-8.03125,"""""",-11.875,3.84375,0.780733,0,,52,197
"""3f0cd311-2288-5a8b-8f5d-761846…","""meta-llama/Meta-Llama-3-8B-Ins…","{1.25,0.99}","""What is hype surrounding cute …","""The hype surrounding cute conf…","[{""human"",""What is hype surrounding cute confession scenes in Asian dramas?""}, {""gpt"",""The hype surrounding cute confession scenes in Asian dramas, particularly in Korean and Japanese dramas, has become a significant aspect of the fandom culture. Here are some reasons why: 1. **Emotional Connection**: Confession scenes are often the most emotional and intimate moments in a drama. The buildup of feelings, the nervousness, and the vulnerability of the characters create a strong emotional connection with the audience. 2. **Romantic Tension**: The anticipation and uncertainty surrounding a confession can be thrilling. Fans love to speculate about the outcome, and the payoff can be incredibly satisfying. 3. **Character Development**: Confession scenes often reveal character traits, such as courage, sincerity, or vulnerability, which deepen the audience's understanding of the characters. 4. **Cute and Adorable**: Let's face it – confession scenes can be ridiculously cute and adorable! The awkwardness, the blushing, and the sweet words can be endearing and entertaining. 5. **Fandom Engagement**: Confession scenes often become a focal point for fan engagement. Fans discuss, analyze, and speculate about the scenes, creating a sense of community and shared experience. 6. **K-Drama and J-Drama Traditions**: In Korean and Japanese dramas, confession scenes are a staple of the romantic comedy and melodrama genres. Fans have come to expect and anticipate these scenes, which has contributed to their popularity. 7. **Social Media and Memes**: The internet and social media have amplified the hype surrounding confession scenes. Memes, GIFs, and fan art often circulate, making the scenes more accessible and shareable. 8. **Shipping Culture**: The concept of ""shipping"" (rooting for a specific couple) has become a significant aspect of fandom culture. Confession scenes can be a crucial moment in the shipping process, as they often mark a turning point in the relationship. 9. **Cultural Significance**: Confession scenes can be seen as a representation of cultural values, such as the importance of honesty, vulnerability, and communication in relationships. 10. **Nostalgia**: For some fans, confession scenes evoke a sense of nostalgia, reminding them of their own experiences or favorite dramas from the past. The combination of these factors has contributed to the widespread enthusiasm and anticipation surrounding cute confession scenes in Asian dramas.""}]","""Information seeking""","""easy""","""The user wants to understand t…","""To solve this problem, the mod…","""average""","""The query is clear and specifi…","""safe""","""sfairXC/FsfairX-LLaMA3-RM-v0.1""",5.128906,"""Hello! The ""hype"" around cute …",-3.015625,8.144531,0.783705,0,,64,2387
"""bf70e557-3f3f-585c-91a3-47674f…","""meta-llama/Meta-Llama-3-8B-Ins…","{1.25,0.99}","""A man""","""It seems like you're starting …","[{""human"",""A man""}, {""gpt"",""It seems like you're starting a story! Would you like to continue with the story of the man, or is there something specific you'd like to talk about?""}]","""Information seeking""","""very easy""","""The user wants to describe a p…","""To solve this problem, the mod…","""very poor""","""The query is very short and la…","""safe""","""sfairXC/FsfairX-LLaMA3-RM-v0.1""",-6.523438,"""""",-10.679688,4.15625,0.631839,0,,5,149


In [9]:
xdf = instruct_df.select(
    ("### Problem:\n" + pl.col("input") + "\n\n### Solution:\n" + pl.col("output")).alias("text")
)
xdf.tail()

text
str
"""### Problem: I want to start t…"
"""### Problem: What is this numb…"
"""### Problem: Here are some ide…"
"""### Problem: What is hype surr…"
"""### Problem: A man ### Soluti…"


In [10]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "instruct", "magpie-instruct.parquet"))

### BAAI/Infinity-Instruct

In [11]:
instruct = load_dataset("BAAI/Infinity-Instruct")

Downloading readme: 100%|██████████| 12.0k/12.0k [00:00<00:00, 19.7MB/s]
Downloading data: 100%|██████████| 9.14G/9.14G [06:37<00:00, 23.0MB/s]
Generating train split: 4372650 examples [00:19, 227419.23 examples/s]


In [12]:
instruct_df = pl.DataFrame(instruct['train'].to_dict('records'))
instruct_df.tail()

id,conversations
str,list[struct[2]]
"""4372645""","[{""human"",""Provide the best response to a given instruction. Follow the following steps to craft your response: 1. reason about the given instruction 2. provide a high-quality solution 3. offer a concise explanation 4. write tests to verify the correctness your solution ## Example 1 ### Instruction Implement a Python class with a method `find_nth_prime_memoized(n)` that returns the nth prime number, where n is a positive integer. Utilize memoization to store already found prime numbers in a list within the class to avoid recalculating primes for multiple method calls. Ensure the method efficiently updates and references this list to find new primes as needed. Additionally, enhance performance by avoiding unnecessary prime checks. ### Response [Reasoning] Implementing the required Python class with the method `find_nth_prime_memoized(n)` involves several key steps: 1. The class should have a constructor `__init__` that initializes the cache list to store the memoized prime numbers. Initially, the list can be seeded with the first two prime numbers, 2 and 3. 2. The method `find_nth_prime_memoized` will check if the requested `n`th prime is already in the memoized list. If it is, it will return that prime directly. If not, it will start from the last cached prime and continue to find and cache new primes until the `n`th prime is found. 3. A helper method `is_prime` can be defined to check if a number is prime. To efficiently check for primality:  - It checks if the number is divisible by each of the primes already in the cache.  - It only checks up to the square root of the number to be tested, implemented by comparing the square of the prime with the number.  - It skips even numbers by incrementing by 2 for each prime check. This is sound since the cache is initialized with the first **two** primes, 2 and 3. [Implementation] Here is how you could implement such a class: ```python class PrimeFinder:  def __init__(self):  self.primes = [2, 3]  def is_prime(self, num: int) -> bool:  if num < 2:  return False  for prime in self.primes:  if prime * prime > num:  break  if num % prime == 0:  return False  return True  def find_nth_prime_memoized(self, n: int) -> int:  # Return the n-th prime directly if already in cache  if n <= len(self.primes):  return self.primes[n - 1]  # Start checking from the number after the last prime in cache  current_num = self.primes[-1] + 2  while len(self.primes) < n:  if self.is_prime(current_num):  # Cache the new prime  self.primes.append(current_num)  # Skip even numbers  current_num += 2  return self.primes[n - 1] ``` [Explanation] This class `PrimeFinder` initializes with a cache of the first two primes. The `is_prime` method checks if a number is prime by dividing it with cached primes up to its square root. The `find_nth_prime_memoized` method uses this helper method to find and cache primes up to the requested `n`th one. Subsequent calls leverage cached primes for efficiency. [Tests] Here's how you can test this class: ```python # Test that the class correctly finds the first few primes first_few_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29] prime_finder = PrimeFinder() for i, prime in enumerate(first_few_primes, 1):  assert prime_finder.find_nth_prime_memoized(i) == prime # Test that the memoized results are returned directly for i, prime in enumerate(first_few_primes, 1):  assert prime_finder.find_nth_prime_memoized(i) == prime ``` These tests verify that the class correctly finds the first few primes and that subsequent calls return the memoized results directly. ## Example 2 ### Instruction Given a point `p0` with x and y values and another point `p1` with x and y values, calculate the coordinates of a point `p` on the line segment between `p0` and `p1` at a given distance `t` from `p0`. The distance `t` is calculated as a ratio of the total distance between the points `p0` and `p1` and is clamped between 0 and 1. For example, if `t` is 0.5, the point `p` will be halfway between `p0` and `p1`. Write a Python function to return the point `p` as a tuple of x and y values. ### Response""}, {""gpt"",""Here's an example implementation in Python: ```python def get_point_on_line_segment(p0, p1, t):  dx = p1[0] - p0[0]  dy = p1[1] - p0[1]  distance = (dx ** 2 + dy ** 2) ** 0.5  t = min(max(t, 0), 1)  dx_p = dx * t  dy_p = dy * t  x = p0[0] + dx_p  y = p0[1] + dy_p  return (x, y) ``` The function `get_point_on_line_segment` takes three arguments: `p0`, `p1`, and `t`. `p0` and `p1` are tuples of x and y values representing the two points on the line segment. `t` is the ratio of distance `p` is from `p0` as a decimal value between 0 and 1. The function first calculates the distance between `p0` and `p1` using the Pythagorean theorem. Then, it clamps `t` between 0 and 1 to ensure it is within the accepted range. Next, it calculates the distance between `p0` and `p` along the line segment using the `t` ratio. Finally, it uses the ratio of the components to calculate the coordinates of `p` and returns them as a tuple `(x, y)`.""}]"
"""4372646""","[{""human"",""Provide the best response to a given instruction. Follow the following steps to craft your response: 1. reason about the given instruction 2. provide a high-quality solution 3. offer a concise explanation 4. write tests to verify the correctness your solution ## Example 1 ### Instruction Create a Python function to encode an integer into a sequence of encoded characters, where each character represents a 5-bit chunk of the integer. The ASCII value of each chunk character is formed by adding 63 to the 5-bit chunk's value. For all chunks except the last one, set the 6th bit to 1 to indicate continuation. Employ bitwise operations for this encoding process. ### Response [Reasoning] To encode an integer into a sequence of encoded characters based on the specified criteria, let's follow these steps: 1. Break down the integer into 5-bit chunks: Extract 5-bit chunks from the integer using using `& 31` (`0x1F`), because the binary representation of `31` (`11111`) has the last five bits set, which will effectively extract the 5 bits we are interested in. We start from the least significant bits (LSB). 2. Set continuation bit: For all chunks except the last one, set the 6th bit to 1 using `| 32` (`0x20`) because the binary representation of `32` (`100000`) has the 6th bit set to 1. This indicates that there's another continuation chunk following. 3. Shift right: Right-shift the value by 5 to process the next 5 bits. 4. Convert to characters: Use `ord` and `chr` to convert each chunk (now effectively 6 bits due to the continuation bit) into a character by adding 63 to its value. This ensures that the character is within a printable ASCII range. 5. Assembly: Assemble these characters into the final encoded string while maintaining the order of the chunks through `reversed`. [Implementation] Here's how you can implement this function: ```python def encode_integer_to_chars(value: int) -> str:  # Base case for 0  if value == 0:  return chr(63)  encoded_chars = []  while value > 0:  # Extract the 5-bit chunk and add 63 to its value  chunk = (value & 31) + 63  value >>= 5  # If there's still more of the value left, set the continuation bit  if value > 0:  chunk |= 32  encoded_chars.append(chr(chunk))  # Reverse the list of characters to start with the least significant chunk  # and join them into a single string  return ''.join(reversed(encoded_chars)) ``` [Explanation] This function processes the integer bit by bit in 5-bit chunks, applying bitwise operations to extract and manipulate these chunks, then converts each chunk into a character using the specified encoding scheme. This approach efficiently handles the encoding in a loop until all bits of the integer have been processed. Note that this function is designed for non-negative integers and does not support negative integers by default because negative numbers are represented in two's complement form, where the most significant bit (MSB) indicates the sign. In Python, right-shifting (`>>`) a negative integer preserves the sign bit, effectively making the never executed due to the `value > 0` condition. If you change it to `value != 0`, it will then lead to an infinite loop if the same encoding logic is applied to negative integers without modifications. [Tests] You can validate this function with a few test cases to ensure correctness: ```python assert encode_integer_to_chars(0) == chr(63) assert encode_integer_to_chars(1) == chr(63 + 1) assert encode_integer_to_chars(31) == chr(63 + 31) assert encode_integer_to_chars(int(0b1)) == chr(0b1 + 63) assert encode_integer_to_chars(int(0b1000001)) == chr(0b10 + 63) + chr(63 + 0b100001) ``` These tests cover scenarios from zero to a number requiring more than one chunk, ensuring the function operates as expected. ## Example 2 ### Instruction Create a Python function that takes two arrays `nums1` and `nums2`, and returns an array containing their intersection. Implement the function as a composition of two functions, where the first function forwards the arguments to the second function and the second function performs the actual intersection operation. The second function should calculate the intersection of the two arrays by performing arithmetic calculations on their elements. Each element in the intersection should be the result of a specific operation between the corresponding elements of the two arrays. For example, the intersection of `[1, 2, 3]` and `[4, 5, 6]` could be `[1 + 4, 2 + 5, 3 + 6] = [5, 7, 9]`. ### Response""}, {""gpt"",""Here's how you can implement the two functions: ```python def forward_arguments(nums1, nums2):  return second_function(nums1, nums2) def second_function(nums1, nums2):  result = []  for num1, num2 in zip(nums1, nums2):  result.append(num1 + num2)  return result ``` This implementation demonstrates how to define two functions, where the first function forwards the arguments to the second function, and the second function performs the actual intersection operation by performing arithmetic calculations on the elements of the two arrays.""}]"
"""4372647""","[{""human"",""Provide the best response to a given instruction. Follow the following steps to craft your response: 1. reason about the given instruction 2. provide a high-quality solution 3. offer a concise explanation 4. write tests to verify the correctness your solution ## Example 1 ### Instruction Here are two special formulas: $$ f_1(a, b) = (a + 1) \cdot (b + 1) - 1 $$ $$ f_2(k) = \begin{cases}  \frac{(k + 1)^2}{2} + k + 1 & \text{if } k \text{ is odd} \\  \frac{k^2}{2} + 2k + 1 & \text{if } k \text{ is even} \end{cases} $$ Write a Python function to return $f_2(f_1(a, b))$ for given `a` and `b`. ### Response [Reasoning] Based on the formulas you provided, we can define two Python functions, `f1(a, b)` and `f2(k)`, respectively, and then combine them to calculate $f2(f1(a, b))$ for given `a` and `b`. [Implementation] Here is the implementation: ```python def f1(a, b):  return (a + 1) * (b + 1) - 1 def f2(k):  if k % 2 == 0: # Check if k is even  return (k**2) / 2 + 2 * k + 1  else: # k is odd  return ((k + 1)**2) / 2 + k + 1 def f1_f2_combined(a, b):  return f2(f1(a, b)) ``` [Explanation] In this implementation: - `f1(a, b)` calculates the first formula you've given. - `f2(k)` calculates the second formula, with a conditional branch for whether `k` is even or odd. - `f1_f2_combined(a, b)` uses the output of `f1(a, b)` as the input for `f2(k)` and returns the result. [Tests] To test this combined function, you can simply compare it with the direct application of `f2` to the result of `f1` for different values of `a` and `b`: ```python assert f1_f2_combined(2, 3) == f2(f1(2, 3)) assert f1_f2_combined(4, 5) == f2(f1(4, 5)) assert f1_f2_combined(6, 7) == f2(f1(6, 7)) ``` ## Example 2 ### Instruction Write a Python function that takes an object `obj`, a list of attributes `attrs`, and a delimiter `delim`, and returns a string containing the values of the attributes of `obj` in sequence, separated by `delim`. If an attribute is not present, set it to None. ### Response""}, {""gpt"",""To solve this problem, you can use Python's `getattr` function to access the attributes of `obj`. Additionally, you can use a list comprehension to iterate over the attributes and retrieve their values from `obj`. Finally, you can join the retrieved values with the provided `delim` to get the desired string. Here's the implementation: ```python def get_attrs_as_string(obj, attrs, delim):  attr_values = [getattr(obj, attr, None) for attr in attrs]  return delim.join(str(value) for value in attr_values) ```""}]"
"""4372648""","[{""human"",""Provide the best response to a given instruction. Follow the following steps to craft your response: 1. reason about the given instruction 2. provide a high-quality solution 3. offer a concise explanation 4. write tests to verify the correctness your solution ## Example 1 ### Instruction Construct a Python function `create_folds(data, count)` to create a list of folds from the given data, where each fold is a subsequence of the original data. The length of each fold should be approximately equal with at most a difference of 1. The function should satisfy the following assertion: ```python assert create_folds(range(12), 2) == [range(0, 6), range(6, 12)] assert create_folds(range(8), 2) == [range(0, 4), range(4, 8)] assert create_folds(range(25), -5) == [] assert create_folds(range(6), 2) == [range(0, 3), range(3, 6)] ``` ### Response [Reasoning] To create a Python function that generates a list of folds from given data with each fold having approximately equal length (with at most a difference of 1 between any two folds), you can follow these steps: 1. Return an empty list immediately if `count` is non-positive since it's not possible to create a valid number of folds. 2. Divide the total length of the data by `count`, rounding down, to get the minimum number of items each fold should have. 3. Calculate the remainder to see how many extra items need to be distributed among the folds. They are distributed one by one to the first `remainder` folds. 4. Use a loop to create each fold, adjusting the starting and ending indices based on the calculations above to ensure each fold has the correct number of items. [Implementation] Here's how you can implement this: ```python def create_folds(data, count):  # Return an empty list if count is non-positive  if count <= 0:  return []  data_length = len(data)  fold_size = data_length // count  remainder = data_length % count  folds = []  start_index = 0  for i in range(count):  # Add 1 element to each of the first `remainder` folds  end_index = start_index + fold_size + (1 if i < remainder else 0)  folds.append(range(start_index, end_index))  start_index = end_index  return folds ``` [Explanation] This implementation ensures that each fold has an equal length with at most a difference of 1 between any two folds. It handles edge cases, such as a non-positive `count`, by returning an empty list as specified. [Tests] You can test your code with the provided assertions to verify that the function works correctly: ```python assert create_folds(range(12), 2) == [range(0, 6), range(6, 12)] assert create_folds(range(8), 2) == [range(0, 4), range(4, 8)] assert create_folds(range(25), -5) == [] assert create_folds(range(6), 2) == [range(0, 3), range(3, 6)] ``` ## Example 2 ### Instruction Create a Python function `to_json_str(dc, indent=4)` that takes a dictionary of key-value pairs and returns its JSON representation with indentation. The function should handle nested dictionaries and lists, including empty objects. ### Response""}, {""gpt"",""To create a Python function that converts a dictionary of key-value pairs to a JSON string, you can follow these steps: 1. Write a recursive function `jsonify()` that converts a nested data structure recursively to a JSON representation. 2. Handle different types of values such as dictionaries, lists, integers, and strings by determining which representation to use for each type. 3. Use the built-in `json.dumps()` function from the `json` module to convert the dictionary to a JSON string, specifying the `indent` argument to control the indentation of the output. Here's an example implementation: ```python import json def jsonify(obj, indent=4):  if isinstance(obj, dict):  return ""{"" + "", "".join([f'""{key}"": {jsonify(value, indent)}' for key, value in obj.items()]) + ""}""  elif isinstance(obj, list):  return ""["" + "", "".join([jsonify(item, indent) for item in obj]) + ""]""  elif isinstance(obj, (int, float)):  return str(obj)  elif isinstance(obj, str):  return f'""{obj}""'  else:  return ""null"" def to_json_str(dc, indent=4):  return json.dumps(jsonify(dc), indent=indent) ```""}]"
"""4372649""","[{""human"",""Provide the best response to a given instruction. Follow the following steps to craft your response: 1. reason about the given instruction 2. provide a high-quality solution 3. offer a concise explanation 4. write tests to verify the correctness your solution ## Example 1 ### Instruction Develop a Python function `ceil_divide_without_div` that computes the ceiling of `n / 2**b` without using division, floating-point operations, built-in functions like `math.ceil`, or converting it into computing the floor. Instead, use bit manipulation to achieve the result. You write assertions to ensure both `n` and `b` are non-negative integers. ### Response [Reasoning] We can leverage the properties of bitwise operations to develop a Python function that computes the ceiling of `n / 2**b` without using division, floating-point operations, or any built-in functions like `math.ceil`, and strictly avoiding converting the problem into a floor computation. In the context, dividing by `2**b` is equivalent to right-shifting `n` by `b` bits in binary representation when `n` is divisible by `2**b`. Otherwise, this approach inherently computes the floor of the division. To adjust for the ceiling without directly calculating it, we can consider two cases: 1. If `n` is divisible by `2**b`, the result of the division is simply the value of right-shifting `n` by `b` bits. 2. Otherwise, we need to increment the right-shifted result by 1 to achieve the ceiling effect. To determine the divisibility of `n` by `2**b`, we can check if the first `b` bits of `n` are all 0. This can be done using a mask with the first `b` bits set to 1, and then performing a bitwise AND operation with `n`. If the result is 0, it indicates that `n` is divisible by `2**b`. The mask can be created using `mask = (1 << b) - 1`. We'll also include assertions to ensure that `n` and `b` are non-negative integers. [Implementation] Here is the implementation of `ceil_divide_without_div`: ```python def ceil_divide_without_div(n: int, b: int) -> int:  assert isinstance(n, int) and n >= 0, ""n must be a non-negative integer""  assert isinstance(b, int) and b >= 0, ""b must be a non-negative integer""  # Constructing the mask to check divisibility  mask = (1 << b) - 1  divisible = (n & mask) == 0  result = n >> b  # If n is not divisible by 2**b, increment the result to get the ceiling  if not divisible:  result += 1  return result ``` [Explanation] This function calculates the ceiling of `n / 2**b` purely through bitwise manipulation and integer arithmetic. The assertions ensure the input constraints are met, as specified. [Tests] To verify the correctness of the function, we can run the following tests: ```python import math assert ceil_divide_without_div(10, 3) == math.ceil(10 / 2**3) assert ceil_divide_without_div(17, 4) == math.ceil(17 / 2**4) assert ceil_divide_without_div(100, 5) == math.ceil(100 / 2**5) assert ceil_divide_without_div(0, 3) == math.ceil(0 / 2**3) assert ceil_divide_without_div(1, 0) == math.ceil(1 / 2**0) ``` These tests compare the results of our function with the expected results obtained using `math.ceil` to ensure the correctness of the ceiling division operation. ## Example 2 ### Instruction Create a Python function that takes two parameters: `dim` and `max_val`. The function should create a numpy array with the shape `(dim, dim)` and fill it with random integers from 0 to `max_val`. Then, the function should loop through each row of the array and set the first element of each row to the maximum of that row. Finally, the function should return the updated array. ### Response""}, {""gpt"",""Here's the implementation for the function: ```python import numpy as np def set_first_element_to_max(dim, max_val):  arr = np.random.randint(0, max_val, size=(dim, dim))  for i in range(dim):  arr[i, 0] = np.max(arr[i, :])  return arr ```""}]"


In [14]:
def parse_json_to_string(json_data):
    # Load the JSON data if it's a string
    if isinstance(json_data, str):
        data = json.loads(json_data)
    else:
        data = json_data
    
    # Initialize the result string
    result_string = ""
    
    # Template for appending each problem and solution
    template = """
### Problem:
{human_value}

### Solution:
{gpt_value}
"""
    # Temporary storage for human and gpt values
    values = {}

    # Process each entry in the JSON data
    for entry in data:
        key = f"{entry['from']}_value"
        values[key] = entry['value']
        
        # If both human and gpt entries have been processed, append to result string
        if 'human_value' in values and 'gpt_value' in values:
            result_string += template.format(**values)
            values.clear()  # Clear the temporary storage after processing a pair

    return result_string

In [18]:
result_data = []

# Iterate through each row in the DataFrame
for row in tqdm(instruct_df.iter_rows(named=True)):
    text = parse_json_to_string(row['conversations'])
    result_data.append(text)

4372650it [00:31, 139745.77it/s]


In [25]:
print(result_data[1250000])


### Problem:
Create a chart that outlines and compares the benefits of high-intensity interval training, jogging, cycling, swimming, and dancing with regards to improving cardiovascular health. The chart should include information on the duration of the exercise, the intensity required, and the specific cardiovascular benefits of each type of exercise. Use clear headings and labels, and include relevant statistics and research findings to support your claims.

### Solution:
| Exercise Type         | Duration | Intensity Level | Cardiovascular Benefits                                                                                                                                     |
|-----------------------|----------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
| High-Intensity Interval Training (HIIT) | 20-30 minutes | High | -Increases cardiovascular 

In [26]:
xdf = pl.DataFrame(result_data)

In [27]:
xdf.write_parquet(os.path.join(input_dir, "_pretrain", "instruct", "baai-infinity-instruct.parquet"))

In [None]:
### 