In [1]:
from datasets import load_dataset
import polars as pl
import os
import json
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder that holds every raw and processed dataset used in this notebook.
# Set the SAMBA_INPUT_DIR environment variable to run on another machine;
# the default is the original workspace location (trailing slash included).
input_dir = os.environ.get(
    "SAMBA_INPUT_DIR",
    "/workspace/fast-data/disk1/nam/modules/trainer/Samba/data/input/",
)

In [3]:
# Reformat bridgedict

In [4]:
def load_parquet_files(directory):
    """Load every ``.parquet`` file found directly in ``directory`` into one frame.

    Only the ``response`` and ``word`` columns of each file are kept.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A ``pl.DataFrame`` with the ``response`` and ``word`` columns of all
        files stacked vertically in file-name order, or an empty
        ``pl.DataFrame`` when the folder contains no ``.parquet`` file.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result stable between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".parquet"):
            continue
        filepath = os.path.join(directory, filename)
        df = pl.read_parquet(filepath)
        frames.append(df.select(pl.col("response"), pl.col("word")))

    # pl.concat rejects an empty list, so keep the "no files" result explicit.
    if not frames:
        return pl.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pl.concat(frames)

In [9]:
# Derive the bridgedict folder from the shared input_dir root rather than
# repeating the absolute path; the resulting location is unchanged.
directory_path = os.path.join(input_dir, "bridgedict-vgm")
parquet_dfs = load_parquet_files(directory_path)

In [10]:
def convert_dict_to_markdown(data):
    """Render one dictionary entry as markdown.

    Args:
        data: A JSON string encoding an object, or an already-parsed ``dict``.
            Recognised keys are ``type``, ``context``, ``english_antonyms``
            and, for each of vietnamese/greek/hebrew/english, the
            ``<lang>_synonyms``, ``<lang>_meaning`` and ``example_<lang>``
            fields.

    Returns:
        The markdown text, or ``""`` when ``data`` cannot be parsed as JSON,
        does not decode to an object, or holds a field that cannot be
        rendered as a bullet list.
    """
    languages = ("vietnamese", "greek", "hebrew", "english")

    def list_to_markdown(title, items):
        # A bare string would otherwise be split into one bullet per character.
        if isinstance(items, str):
            items = [items]
        return f"### {title}\n" + "\n".join([f"- {item}" for item in items]) + "\n"

    def section_to_markdown(title, content):
        return f"## {title}\n\n{content}\n"

    markdown = []
    try:
        if not isinstance(data, dict):
            data = json.loads(data)
        # Valid JSON that is not an object (list, number, string) has no fields.
        if not isinstance(data, dict):
            return ""

        if "type" in data:
            markdown.append(section_to_markdown("Type", data["type"]))

        if "context" in data:
            markdown.append(section_to_markdown("Context", data["context"]))

        if data.get("english_antonyms"):
            markdown.append(list_to_markdown("English Antonyms", data["english_antonyms"]))

        for language in languages:
            items = data.get(f"{language}_synonyms")
            if items:
                markdown.append(list_to_markdown(f"{language.capitalize()} Synonyms", items))

        for language in languages:
            items = data.get(f"{language}_meaning")
            if items:
                markdown.append(list_to_markdown(f"{language.capitalize()} Meaning", items))

        for language in languages:
            items = data.get(f"example_{language}")
            if items:
                markdown.append(
                    list_to_markdown(f"Example Sentences in {language.capitalize()}", items)
                )

        return "\n".join(markdown)
    except (TypeError, ValueError):
        # Malformed JSON (JSONDecodeError is a ValueError), a non-string input,
        # or a non-iterable list field: the entry is treated as empty.
        # Unlike a bare ``except``, KeyboardInterrupt still propagates.
        return ""

def formatter(series):
    """Return a list with the markdown rendering of every entry of ``series``, in order."""
    return list(map(convert_dict_to_markdown, series))

In [17]:
# Render the JSON response of every row as markdown, keeping the word next to it.
result_data = [
    {"word": entry["word"], "text": convert_dict_to_markdown(entry["response"])}
    for entry in tqdm(parquet_dfs.iter_rows(named=True))
]

121569it [00:03, 34317.98it/s]


In [18]:
# Build a frame from the per-row dicts collected above (columns: word, text).
xdf = pl.DataFrame(result_data)

In [19]:
xdf.tail()

word,text
str,str
"""stuffed bun""","""## Type noun ## Context Thi…"
"""lack of order""","""## Type noun ## Context Thi…"
"""northern future""","""## Type noun ## Context The…"
"""petty affair""","""## Type noun ## Context Thi…"
"""clear-cut""","""## Type adjective ## Context…"


In [20]:
# Drop rows whose markdown is empty or nearly so, e.g. entries whose JSON could
# not be parsed and were rendered as "".
# `str.lengths()` is deprecated in Polars; `str.len_bytes()` is its direct
# replacement (both count UTF-8 bytes, not characters).
fdf = xdf.filter(pl.col("text").str.len_bytes() > 10)

  fdf = xdf.filter(pl.col("text").str.lengths() > 10)


In [21]:
# Prefix every entry with its word as a level-1 markdown heading and keep only
# the combined "text" column.
word_heading = "# " + pl.col("word")
fdf = fdf.select((word_heading + "\n\n" + pl.col("text")).alias("text"))

In [24]:
print(fdf["text"][0])

# promotion to celestial position

## Type

noun

## Context

This word is used in the context of religious or spiritual beliefs. It conveys a positive meaning and refers to the act of being elevated to a divine or heavenly position.

### Vietnamese Meaning
- Sự thăng cấp thành vị trí thiên đình
- Sự được đưa lên chỗ thiêng liêng
- Sự được trọng dụng trong vị trí thiên sứ
- Sự lên bậc công danh trong địa vị thiêng liêng

### Greek Meaning
- <translate list of equivalent Vietnamese meanings above into Greek>

### Hebrew Meaning
- <translate list of equivalent Vietnamese meanings above into Hebrew>

### English Meaning
- The act of being promoted to a celestial or divine position
- The act of being elevated to a sacred or holy place
- The act of being appointed to a position of angelic authority
- The act of achieving a higher status within a sacred hierarchy

### Example Sentences in Vietnamese
- Anh ta được thăng cấp thành vị trí thiên sứ.
- Tôi mong muốn được đưa lên chỗ thiêng liêng.

In [26]:
# Write next to the other inputs; the path is built from input_dir like the
# other write cells instead of repeating the absolute location.
fdf.write_parquet(os.path.join(input_dir, "bridgedict.parquet"))

In [53]:
# a = json.loads(parquet_dfs["response"][1110])
# print(type(a))
# x = convert_dict_to_markdown(a)
# print(x)

In [58]:
# Paragraph-level bible translations; the next cell reads its content_vi and content_en columns.
bible_translation_path = os.path.join(input_dir, "bible", "book_paragraph_translation.parquet")
bible_data = pl.read_parquet(bible_translation_path)

In [59]:
# Dedicated, seeded generator: the language order is still random per paragraph
# but identical on every run, so the written dataset is reproducible.
order_rng = random.Random(42)

result_data = []

# Build one bilingual sample per paragraph.
for row in tqdm(bible_data.iter_rows(named=True)):
    sections = [
        f"### Vietnamese\n{row['content_vi']}\n",
        f"### English\n{row['content_en']}\n",
    ]
    # True keeps Vietnamese first, False puts English first.
    if not order_rng.choice([True, False]):
        sections.reverse()
    # Sections are separated by one blank line, as in the original templates.
    text = "\n".join(sections)
    result_data.append({"text": text})

43886it [00:00, 53314.80it/s]


In [60]:
# Single-column ("text") frame built from the bilingual samples collected above.
xdf = pl.DataFrame(result_data)

In [61]:
print(xdf["text"][222])

### English
So the Peter replied that suffering was part of the journey of faith.
It tests the true value of our disciples (1: 7), making us stick with other Christians (5: 9) and will be advocated in the day of judgment (4: 16- 19).
Although the believers are "strange guests" and "dissolved" in this world (1: 1), they are part of the people of God pilgrimage (2: 5.9) towards the house My father (1: 4).
They look forward to the day that Jesus returned because those who belong to Him (1: 7 2:12 5: 4).
These are truths that make Christians today have been urged to live for God's glory, as they have encouraged non-e-reputable readers.
Peter wrote a letter as a person in his heart who didn't lose the love fire but the Lord himself grouped at the coast of Ti-riery (comparing GI 21: 1,15-19 with IPHI 1: 8).
This letter contained all vividness in a person's personal recollection of Jesus.
Please also see the article about how to read messages.
Read more, I and II Peter
E. P. Clowney, The Mess

In [62]:
# Save the bilingual samples under the pretraining bible folder.
vietbible_path = os.path.join(input_dir, "_pretrain", "bible", "vietbible_envi.parquet")
xdf.write_parquet(vietbible_path)

### Bible Text

In [50]:
# Raw bible chapters; note this rebinds bible_data, replacing the paragraph translations.
bible_raw_path = os.path.join(input_dir, "bible", "bible_raw.parquet")
bible_data = pl.read_parquet(bible_raw_path)
bible_data.tail()

language,version,version_text,book_name,book_text,vb_code,osis_code,chapter,text,verses
str,str,str,str,str,str,str,str,str,list[str]
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zech""","""Xa-cha-ri""","""Xa""","""Zech""","""8""","""# Kinh Thánh Version: Bản Truy…","[""Xa-cha-ri 8:1 `Lại có lời của Đức Giê-hô-va vạn quân phán cùng ta rằng:` (Xa 8:1)"", ""Xa-cha-ri 8:2 `Đức Giê-hô-va vạn quân có phán như vầy: Ta đã nổi ghen vì Si-ôn bởi một cơn ghen lớn, ta đã nổi ghen vì nó bởi cơn tức giận lớn.` (Xa 8:2)"", … ""Xa-cha-ri 8:23 `Đức Giê-hô-va vạn quân phán như vầy: Sẽ xảy ra trong những ngày đó, có mười người từ mọi thứ tiếng trong các nước ra, nắm chặt vạt áo của một người Giu-đa, mà nói rằng: Chúng ta sẽ đi cùng các ngươi, vì chúng ta có nghe rằng Đức Giê-hô-va ở cùng các ngươi.` (Xa 8:23)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zech""","""Xa-cha-ri""","""Xa""","""Zech""","""9""","""# Kinh Thánh Version: Bản Truy…","[""Xa-cha-ri 9:1 `Gánh nặng lời Đức Giê-hô-va nghịch cùng đất Ha-đơ-rắc, nó sẽ đỗ trên Đa-mách: vì con mắt loài người và mọi chi phái Y-sơ-ra-ên đều ngó lên Đức Giê-hô-va.` (Xa 9:1)"", ""Xa-cha-ri 9:2 `Lời ấy cũng phán về Ha-mát, láng giềng của Đa-mách, về Ty-rơ và Si-đôn, vì các thành ấy là rất khôn sáng.` (Xa 9:2)"", … ""Xa-cha-ri 9:17 `Sự nhân từ Ngài và sự tốt đẹp Ngài sẽ lớn là dường nào! Lúa miến sẽ làm cho những trai trẻ lớn lên, và rượu mới sẽ làm cho gái đồng trinh thạnh vượng.` (Xa 9:17)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""1""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 1:1 `Nầy là lời Đức Giê-hô-va phán cùng Sô-phô-ni, con trai Cu-si, cháu Ghê-đa-lia, chắt A-ma-ria, chít Ê-xê-chia, về đời Giô-si-a, con trai A-môn, vua Giu-đa.` (So 1:1)"", ""Sô-phô-ni 1:2 `Ta sẽ diệt sạch mọi sự khỏi mặt đất, Đức Giê-hô-va phán vậy.` (So 1:2)"", … ""Sô-phô-ni 1:18 `Hoặc bạc hoặc vàng của chúng nó, đều không có thể giải cứu chúng nó trong ngày thạnh nộ của Đức Giê-hô-va; nhưng cả đất nầy sẽ bị lửa ghen Ngài thiêu nuốt; vì Ngài sẽ diệt hết dân cư đất nầy cách thình lình.` (So 1:18)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""2""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 2:1 `Hỡi dân chẳng biết xấu hổ! Hãy nhóm hiệp lại, phải, hãy nhóm hiệp lại,` (So 2:1)"", ""Sô-phô-ni 2:2 `trước khi mạng lịnh chưa ra, ngày giờ chưa qua như trấu, trước khi sự nóng giận của Đức Giê-hô-va chưa đến trên các ngươi, trước khi ngày thạnh nộ của Đức Giê-hô-va chưa đến trên các ngươi.` (So 2:2)"", … ""Sô-phô-ni 2:15 `Kìa, thành vui vẻ nầy đã ở yên không lo lắng và nói trong lòng mình rằng: Ta đây, ngoài ta không còn ai hết! Nó đã trở nên hoang vu, làm chỗ nằm cho loài thú vật là dường nào! Phàm kẻ đi qua sẽ khoa tay mà nhạo cười!` (So 2:15)""]"
"""vietnamese""","""VIEVTT""","""Bản Truyền Thống 1926""","""Zeph""","""Sô-phô-ni""","""So""","""Zeph""","""3""","""# Kinh Thánh Version: Bản Truy…","[""Sô-phô-ni 3:1 `Khốn thay cho thành bạn nghịch và ô uế, làm sự bạo ngược!` (So 3:1)"", ""Sô-phô-ni 3:2 `Nó không nghe lời; không chịu sửa dạy; không nhờ cậy Đức Giê-hô-va; không đến gần Đức Chúa Trời mình.` (So 3:2)"", … ""Sô-phô-ni 3:20 `Trong lúc đó, ta sẽ đem các ngươi trở về, trong lúc đó ta sẽ nhóm các ngươi lại; vì ta sẽ làm cho các ngươi nổi danh tiếng và được khen lao giữa mọi dân trên đất, khi ta đem phu tù các ngươi về trước mặt các ngươi, Đức Giê-hô-va có phán vậy.` (So 3:20)""]"


In [51]:
# Keep only the chapter text column.
xdf = bible_data.select("text")
xdf.tail()

text
str
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"
"""# Kinh Thánh Version: Bản Truy…"


In [52]:
# Save the chapter texts under the pretraining bible folder.
bible_text_path = os.path.join(input_dir, "_pretrain", "bible", "multiple_bible_raw.parquet")
xdf.write_parquet(bible_text_path)

### Translation Data

In [27]:
# Fetch the nampdn-ai/e2v-v2e dataset with `datasets.load_dataset`; its "train" split is used below.
e2v = load_dataset("nampdn-ai/e2v-v2e")

Downloading data: 100%|██████████| 882k/882k [00:00<00:00, 1.06MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 2.61MB/s]
Downloading data: 100%|██████████| 3.17M/3.17M [00:00<00:00, 3.69MB/s]
Downloading data: 100%|██████████| 123M/123M [00:04<00:00, 27.9MB/s] 
Downloading data: 100%|██████████| 882k/882k [00:00<00:00, 938kB/s]
Downloading data: 100%|██████████| 1.50M/1.50M [00:00<00:00, 1.71MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:01<00:00, 1.93MB/s]
Downloading data: 100%|██████████| 3.17M/3.17M [00:01<00:00, 2.92MB/s]
Downloading data: 100%|██████████| 23.4M/23.4M [00:01<00:00, 14.3MB/s]
Generating train split: 100%|██████████| 48501/48501 [00:01<00:00, 31363.03 examples/s]


In [28]:
# Dataset.to_dict() returns a column -> list-of-values mapping, which
# pl.DataFrame accepts directly. The former positional 'records' argument is a
# pandas `orient` value; here it was bound to `batch_size` and ignored.
e2v_df = pl.DataFrame(e2v['train'].to_dict())

In [31]:
# Save the translation frame under the pretraining core folder.
e2v_path = os.path.join(input_dir, "_pretrain", "core", "e2v-v2e.parquet")
e2v_df.write_parquet(e2v_path)

## Math

In [63]:
# Fetch the microsoft/orca-math-word-problems-200k dataset with `datasets.load_dataset`.
# NOTE(review): the name `math` would shadow the stdlib module if it were ever imported in this notebook.
math = load_dataset("microsoft/orca-math-word-problems-200k")

Downloading readme: 100%|██████████| 6.91k/6.91k [00:00<00:00, 15.8MB/s]
Downloading data: 100%|██████████| 84.2M/84.2M [00:02<00:00, 29.9MB/s]
Generating train split: 100%|██████████| 200035/200035 [00:00<00:00, 218592.55 examples/s]


In [64]:
# Dataset.to_dict() returns a column -> list-of-values mapping, which
# pl.DataFrame accepts directly. The former positional 'records' argument is a
# pandas `orient` value; here it was bound to `batch_size` and ignored.
math_df = pl.DataFrame(math['train'].to_dict())

In [65]:
math_df.tail()

question,answer
str,str
"""Adult tickets for a show cost …","""Let's denote the number of adu…"
"""Your cell phone company offers…","""Let's denote the number of tex…"
"""Steve invests in a circus prod…","""First, let's calculate the ave…"
"""Your teacher is giving a test …","""Let's denote the number of 5-p…"
"""A plane flying with a tailwind…","""Let's denote the plane's airsp…"


In [69]:
# Join each question with its answer under "Problem" / "Solution" headings.
problem_expr = "### Problem:\n" + pl.col("question")
text_expr = (problem_expr + "\n\n### Solution:\n" + pl.col("answer")).alias("text")
xdf = math_df.select(text_expr)
xdf.tail()

text
str
"""### Problem: Adult tickets for…"
"""### Problem: Your cell phone c…"
"""### Problem: Steve invests in …"
"""### Problem: Your teacher is g…"
"""### Problem: A plane flying wi…"


In [70]:
print(xdf['text'][100])

### Problem:
A barrel contains 12 liters (L) and 400 milliliters (㎖) of petroleum, and B barrel contains 7600 milliliters (㎖) of petroleum. How many liters (L) must be moved from A barrel to B barrel to equalize the amount of petroleum in A barrel and B barrel?

### Solution:
First, let's convert all the measurements to the same unit to make the calculation easier. We'll convert everything to milliliters (㎖).

A barrel contains 12 liters and 400 milliliters. Since 1 liter is equal to 1000 milliliters, we can convert the 12 liters to milliliters:
12 liters = 12 * 1000 milliliters = 12000 milliliters

Now, add the 400 milliliters that are already in milliliters:
12000 milliliters + 400 milliliters = 12400 milliliters

So, A barrel contains a total of 12400 milliliters of petroleum.

B barrel contains 7600 milliliters of petroleum.

To equalize the amount of petroleum in both barrels, we need to find the average of the two amounts:
Total amount in both barrels = 12400 milliliters (A barre

In [71]:
# Save the formatted word problems under the pretraining math folder.
math_word_path = os.path.join(input_dir, "_pretrain", "math", "math_word_problem.parquet")
xdf.write_parquet(math_word_path)

In [32]:
# Fetch the nampdn-ai/tiny-math-textbooks dataset with `datasets.load_dataset`.
tinymath = load_dataset("nampdn-ai/tiny-math-textbooks")

Repo card metadata block was not found. Setting CardData to empty.


In [33]:
# Dataset.to_dict() returns a column -> list-of-values mapping, which
# pl.DataFrame accepts directly. The former positional 'records' argument is a
# pandas `orient` value; here it was bound to `batch_size` and ignored.
tinymath_df = pl.DataFrame(tinymath['train'].to_dict())

In [34]:
# The textbook body is in "response"; expose it as the single "text" column.
textbook_text = pl.col("response").alias("text")
xdf = tinymath_df.select(textbook_text)
xdf.tail()

text
str
"""# Probability - Casino Probabi…"
"""# Calculus in a Nutshell: Inte…"
"""# Textbook: Beautiful Geometry…"
"""# Textbook ## Chapter 1: Intr…"
"""# Textbook - Number Theory: Re…"


In [35]:
print(xdf['text'][100])

# Advanced Integration - Integration in the World

## Motivation:
Welcome to the fascinating world of advanced integration! As a mid-level manager teacher, you are constantly faced with challenging situations that require critical thinking and problem-solving skills. By mastering the concepts of integral calculus, you will enhance your ability to analyze and tackle complex problems in various domains. Whether it's analyzing data trends, optimizing resource allocation, or predicting outcomes, integration plays a vital role in the world around us. This textbook will empower you with the knowledge and skills needed to excel in your role and make informed decisions. Remember, as mentioned in Bible Roman 11:36, "For from him and through him and to him are all things." Every knowledge you acquire serves a unique purpose, and the world of integration is no exception. Let's dive in!

## Chapter 1: Fundamental Theorems of Integration
 
### Theoretical Concepts:
- The first fundamental theorem o

In [16]:
# Save the textbook texts under the pretraining math folder.
tiny_math_path = os.path.join(input_dir, "_pretrain", "math", "tiny-math.parquet")
xdf.write_parquet(tiny_math_path)

## Code

In [5]:
# Fetch the nampdn-ai/tiny-codes dataset with `datasets.load_dataset`.
code = load_dataset("nampdn-ai/tiny-codes")

In [6]:
# Dataset.to_dict() returns a column -> list-of-values mapping, which
# pl.DataFrame accepts directly. The former positional 'records' argument is a
# pandas `orient` value; here it was bound to `batch_size` and ignored.
code_df = pl.DataFrame(code['train'].to_dict())

In [7]:
code_df.tail()

prompt,main_topic,subtopic,adjective,action_verb,scenario,target_audience,programming_language,common_sense_topic,idx,response
str,str,str,str,str,str,str,str,str,i64,str
"""Design a Java function snippet…","""Weather""","""Weather Forecast""","""Low""","""Display""","""for Decision Making""","""Beginners""","""Java""","""Reliability""",1117266,"""Here is a possible implementat…"
"""Create a relation database and…","""Bathing routine""","""Caring for Hair and Skin""","""Extreme""","""Calculate""","""for Analysis""","""Beginners""","""relation database and SQL""","""Ethics""",1042904,"""Here is a sample relational da…"
"""Build a Go module snippet that…","""Hygiene during illness""","""Hand Hygiene when Sick""","""Low""","""Calculate""","""for Decision Making""","""Experts""","""Go""","""Privacy""",804392,"""Here is a sample implementatio…"
"""Write a C# code snippet to Dis…","""Sneezing and coughing etiquett…","""Hand Hygiene After Sneezing or…","""High""","""Display""","""for Engineer""","""Beginners""","""C#""","""Bias""",1338433,"""Here is some sample code writt…"
"""Create a Ruby script snippet t…","""Weather""","""Wind Speed""","""Low""","""Check""","""for Decision Making""","""Professionals""","""Ruby""","""Bias""",1763917,"""Here is a possible implementat…"


In [8]:
# Join each prompt with its response under "Problem" / "Solution" headings.
problem_expr = "### Problem:\n" + pl.col("prompt")
text_expr = (problem_expr + "\n\n### Solution:\n" + pl.col("response")).alias("text")
xdf = code_df.select(text_expr)
xdf.tail()

text
str
"""### Problem: Design a Java fun…"
"""### Problem: Create a relation…"
"""### Problem: Build a Go module…"
"""### Problem: Write a C# code s…"
"""### Problem: Create a Ruby scr…"


In [9]:
print(xdf["text"][5555])

### Problem:
Design a Bash function snippet to Check Extreme Handling personal items: Sanitizing Frequently Touched Items for Decision Making for Beginners. Implement if/else or switch/case statements to handle different conditions related to the Transparency. Provide detailed comments explaining your control flow and the reasoning behind each decision.

### Solution:
Here is a bash function I designed called "check_extreme_handling" which sanitizes frequently touched items based off of their transparency level (0 = opaque, 1=translucent, 2=transparent). The function uses an if statement to check whether an item has a specific property before sanitizing it. If no condition matches, then the default action will occur. Please let me know what else can i do to help! 

```bash
# Checks extreme handling personal items by sanitization frequency and transparency levels
function check_extreme_handling() {
  # Declare local variables
  local freqTransparency=(
    'toothbrush:0' 
    'razor:1' 

In [10]:
# Save the formatted code samples under the pretraining code folder.
tiny_codes_path = os.path.join(input_dir, "_pretrain", "code", "tiny-codes.parquet")
xdf.write_parquet(tiny_codes_path)

## WebInstruct QA

In [18]:
# Fetch the TIGER-Lab/WebInstructSub dataset with `datasets.load_dataset`.
instruct = load_dataset("TIGER-Lab/WebInstructSub")

In [19]:
# Dataset.to_dict() returns a column -> list-of-values mapping, which
# pl.DataFrame accepts directly. The former positional 'records' argument is a
# pandas `orient` value; here it was bound to `batch_size` and ignored.
instruct_df = pl.DataFrame(instruct['train'].to_dict())

In [20]:
instruct_df.tail()

orig_question,orig_answer,question,answer,source,index
str,str,str,str,str,i64
"""SMPS Transformer Primary Induc…","""At the lowest bus voltage, the…","""How does the primary inductanc…","""In designing an SMPS, primary …","""stackexchange""",2335215
"""STM32 HAL_UART_Receive_IT does…","""When characters are received, …","""Why doesn't the STM32 HAL_UART…","""The STM32 HAL_UART_Receive_IT …","""stackexchange""",2335216
"""Radius of Star, The Schwarzsch…","""Let $R_{ab}(x)$ be the Ricci t…","""1. How is the validity of the …","""1. The Schwarzschild solution,…","""stackexchange""",2335217
"""New physics at high energies, …","""JEM-EUSO is designed to look f…","""Are there any particle detecto…","""The Extreme Universe Space Obs…","""stackexchange""",2335218
"""Statistical Mechanics: Computi…","""In the second case $$\Omega = …","""How do you compute the microst…","""The microstate multiplicity re…","""stackexchange""",2335219


In [21]:
# Join each rewritten question with its answer under "Problem" / "Solution" headings.
problem_expr = "### Problem:\n" + pl.col("question")
text_expr = (problem_expr + "\n\n### Solution:\n" + pl.col("answer")).alias("text")
xdf = instruct_df.select(text_expr)
xdf.tail()

text
str
"""### Problem: How does the prim…"
"""### Problem: Why doesn't the S…"
"""### Problem: 1. How is the val…"
"""### Problem: Are there any par…"
"""### Problem: How do you comput…"


In [24]:
print(xdf["text"][2323232])

### Problem:
How do I apply Kirchhoff's Voltage Law (KVL) to solve circuits with multiple voltage sources?

### Solution:
When solving circuits with multiple voltage sources using Kirchhoff's Voltage Law (KVL), it's essential to understand that the voltage around any closed loop must sum to zero. Here's a clarification on the method your lecturer used:

1. **Loop Direction**: Choose a direction to traverse the loop. You can go clockwise or counterclockwise, but be consistent.

2. **Voltage Signs**: As you move around the loop, assign the voltage across each resistor based on the direction of the current. If the current flows from the positive to the negative terminal of the resistor, the voltage drop is negative. If it flows from negative to positive, the voltage rise is positive.

3. **Loop Equations**: For each loop, write a KVL equation. As you proceed around the loop, add the voltage sources (taking into account their polarity) and subtract the voltage drops across the resistors.



In [25]:
# Save the formatted question/answer samples under the pretraining instruct folder.
web_instruct_path = os.path.join(input_dir, "_pretrain", "instruct", "web-instruct.parquet")
xdf.write_parquet(web_instruct_path)