In [1]:
from datasets import load_dataset
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

import model.utils as utils
import os
import glob

In [3]:
def create_context(dataset, qst_col, answ_col, max_chars=800) -> pd.DataFrame:
    dataset_df = pd.DataFrame(dataset)
    # Concatenate question and answer into one string with a space in between
    combined = dataset_df[qst_col] + " " + dataset_df[answ_col]

    # Create a mask where the length of the combined string is less than max_chars
    mask = combined.str.len() < max_chars

    # only keep the rows that have less than max_chars
    dataset_df = combined[mask]
    dataset_df = dataset_df.reset_index(drop=True)

    # return the list
    return dataset_df.tolist()


In [5]:
# Load the metamath dataset
metamath_dataset = load_dataset("meta-math/MetaMathQA")
metamath_database = create_context(metamath_dataset["train"], "original_question", "response")
metamath_database


0         Gracie and Joe are choosing numbers on the com...
1         The treasurer of a football team must buy equi...
2         Convert $10101_3$ to a base 10 integer. $10101...
3         Sue works in a factory and every 30 minutes, a...
4         Evan’s dog weighs 63 pounds; it weighs 7 times...
                                ...                        
274503    For a given positive integer $n > 2^3$, what i...
274504    Yesterday, David and William were invited to a...
274505    Suppose $\sin N = \frac{2}{3}$ in the diagram ...
274506    Jeff orders a Halloween costume.  He has to pu...
274507    The average age of the 10 females in a choir i...
Length: 274508, dtype: object

In [9]:
# what is the memory usage of the metamath database
metamath_database = pd.DataFrame(metamath_database)
metamath_database.memory_usage(deep=True).sum()
print(" Memory usage of the metamath database in Mo: ", metamath_database.memory_usage(deep=True).sum() / 10**6)

 Memory usage of the metamath database in Mo:  162.230185


In [12]:
# Load camel-ai/physics
physics_dataset = load_dataset("camel-ai/physics")
physics_database = create_context(physics_dataset["train"], "message_1", "message_2")
# print length
print("Length of the physics database: ", len(physics_database))
# print memory usage in Mo
physics_database = pd.DataFrame(physics_database)
print(" Memory usage of the physics database in Mo: ", physics_database.memory_usage(deep=True).sum() / 10**6)

Generating train split: 100%|██████████| 20000/20000 [2:50:21<00:00,  1.96 examples/s]     


Length of the physics database:  446
 Memory usage of the physics database in Mo:  0.470144


In [10]:
# Load Programming-Language/codeagent-python
python_dataset = load_dataset("Programming-Language/codeagent-python")
python_database = create_context(python_dataset["train"], "prompt", "response")
python_database
# print memory usage in Mo
python_database = pd.DataFrame(python_database)
print(" Memory usage of the python database in Mo: ", python_database.memory_usage(deep=True).sum() / 10**6)

 Memory usage of the python database in Mo:  12.433614


In [11]:
# Looad elfonthefly/STEM_DPO
stem_dataset = load_dataset("elfonthefly/STEM_DPO")
stem_database = create_context(stem_dataset["train"], "prompt", "chosen")
# print length
print("Length of the stem database: ", len(stem_database))
# print memory usage in Mo
stem_database = pd.DataFrame(stem_database)
print(" Memory usage of the stem database in Mo: ", stem_database.memory_usage(deep=True).sum() / 10**6)

Length of the stem database:  354319
 Memory usage of the stem database in Mo:  232.720332
