In [28]:
!pip install -q datasets
!huggingface-cli

usage: huggingface-cli <command> [<args>]

positional arguments:
  {env,login,whoami,logout,repo,upload,download,lfs-enable-largefiles,lfs-multipart-upload,scan-cache,delete-cache}
                        huggingface-cli command helpers
    env                 Print information about the environment.
    login               Log in using a token from
                        huggingface.co/settings/tokens
    whoami              Find out which huggingface.co account you are logged
                        in as.
    logout              Log out
    repo                {create} Commands to interact with your huggingface.co
                        repos.
    upload              Upload a file or a folder to a repo on the Hub
    download            Download files from the Hub
    lfs-enable-largefiles
                        Configure your repository to enable upload of files >
                        5GB.
    scan-cache          Scan cache directory.
    delete-cache        Delete revisions 

In [32]:
# loading the dataset from raw_dataset.csv file
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Load the raw conversations from the CSV file into a pandas DataFrame
dataset = pd.read_csv('raw_dataset.csv')

# Print the last few rows of the dataset (tail, not head) as a quick check
# that the file was read through to the end
print(dataset.tail())

                                                  text
225  ### Human: Do you agree with this approach? ##...
226  ### Human: Will you notify me of any updates? ...
227  ### Human: Do you have any concerns? ### Assis...
228  ### Human: Jarvis, are you up? ### Assistant: ...
229  ### Human: Jarvis, you there? ### Assistant: A...


In [33]:
# Convert the DataFrame loaded from the CSV into an Arrow table and
# write it to 'train.parquet' in the current working directory
table = pa.Table.from_pandas(dataset)
pq.write_table(table, 'train.parquet')

In [35]:
# Read the Parquet file back to check that the write round-trips
table2 = pq.read_table('train.parquet')
# Last expression of the cell: preview the first rows as a DataFrame
table2.to_pandas().head()

Unnamed: 0,text
0,### Human: Who are you? ### Assistant: I am Ja...
1,### Human: Introduce yourself. ### Assistant: ...
2,### Human: What is your purpose ### Assistant:...
3,### Human: What are you? ### Assistant: I am a...
4,### Human: Introduce yourself. ### Assistant: ...


### 🚨 For now, manually upload the generated file (`train.parquet`) to the Hugging Face dataset repo "fotiecodes/jarvis-llama2-dataset-raw"

### Format the dataset for the Llama 2 model

In [36]:
from datasets import load_dataset
import re

# Fetch the raw dataset from the Hub, asking for the 'train' split directly
dataset = load_dataset('fotiecodes/jarvis-llama2-dataset-raw', split='train')

# Shuffle deterministically (seed=42), then keep at most the first 1000 rows
dataset = dataset.shuffle(seed=42).select(range(min(1000, len(dataset))))

# Define a function to transform the data
def transform_conversation(example):
    """Rewrite one '### Human: ... ### Assistant: ...' record in the Llama 2 template.

    The record's 'text' field is split on the '###' marker. Whatever precedes
    the first marker is ignored; the remaining segments are consumed in
    (human, assistant) pairs and each pair is rendered as
    '<s>[INST] {human} [/INST] {assistant} </s>'.

    A trailing human segment with no assistant reply is rendered as
    '<s>[INST] {human} [/INST] </s>'. Previously the loop stopped one segment
    early, so that branch was unreachable and such a turn was silently dropped.

    Args:
        example: Mapping with a 'text' key holding the raw conversation string.

    Returns:
        dict: {'text': <all rendered turns concatenated without a separator>}.
    """
    segments = example['text'].split('###')

    reformatted_segments = []

    # Start at 1 to skip the text before the first '###'; step by 2 so that
    # i is the human segment and i + 1 (when present) is the assistant segment.
    # Iterating up to len(segments) keeps an unpaired final human segment.
    for i in range(1, len(segments), 2):
        human_text = segments[i].strip().replace('Human:', '').strip()

        if i + 1 < len(segments):
            assistant_text = segments[i + 1].strip().replace('Assistant:', '').strip()
            reformatted_segments.append(f'<s>[INST] {human_text} [/INST] {assistant_text} </s>')
        elif human_text:
            # Unpaired final human turn. The emptiness check avoids emitting
            # an empty turn when the text merely ends with a '###' marker.
            reformatted_segments.append(f'<s>[INST] {human_text} [/INST] </s>')

    return {'text': ''.join(reformatted_segments)}


# Apply the transformation to every example; the returned 'text' value
# replaces the original 'text' column in the mapped dataset
transformed_dataset = dataset.map(transform_conversation)

Downloading readme: 100%|██████████| 21.0/21.0 [00:00<00:00, 245kB/s]
Downloading data: 100%|██████████| 16.4k/16.4k [00:00<00:00, 28.8kB/s]
Generating train split: 230 examples [00:00, 55633.79 examples/s]
Map: 100%|██████████| 230/230 [00:00<00:00, 13459.79 examples/s]


In [37]:
# Show the transformed dataset's summary (feature names and row count)
print(transformed_dataset)

Dataset({
    features: ['text'],
    num_rows: 230
})


### Push the dataset to the Hugging Face Hub

In [38]:
# Upload the formatted dataset to the Hub repo "fotiecodes/jarvis-llama2-dataset".
# NOTE(review): presumably requires an authenticated session with write access — confirm.
transformed_dataset.push_to_hub("fotiecodes/jarvis-llama2-dataset")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 605.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/fotiecodes/jarvis-llama2-dataset/commit/8e59d8ebe825853f91f119c45739da5f6bc3d12a', commit_message='Upload dataset', commit_description='', oid='8e59d8ebe825853f91f119c45739da5f6bc3d12a', pr_url=None, pr_revision=None, pr_num=None)