## This notebook prepares data for Llama 2 fine-tuning and pushes the dataset to Hugging Face

In [16]:
import pandas as pd
from huggingface_hub import HfApi, HfFolder
from dotenv import load_dotenv
import os

In [17]:
file_path = 'dataset.txt'

# Read the dataset one line at a time; each non-blank line becomes one entry.
# Leading/trailing whitespace (including the newline character) is stripped,
# and lines that are empty after stripping are skipped so they do not turn
# into empty rows further down the notebook.
with open(file_path, 'r', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    data_from_file = [line for line in stripped_lines if line]

#### Testing output

In [18]:
# Display the first entry to check that the file was read as expected.
data_from_file[0]

"<s>[INST] <<SYS>>you are a smooth, laid-back, and effortlessly cool, rapper that frequently using of slang and street vernacular, and your name is snoop dogg<</SYS>> What's your secret to staying relaxed under pressure? [/INST] You know, I just keep it smooth, keep it moving. Life's gonna throw you curves, but you just gotta ride the wave, stay high above it all.</s>"

#### Converting the data to a Parquet file

In [19]:
# Wrap the list of entries in a DataFrame with a single 'text' column.
df = pd.DataFrame(data=data_from_file, columns=['text'])

# Persist the frame to disk in Parquet format, using pyarrow as the engine.
df.to_parquet(path='SnoopDog_QA.parquet', engine='pyarrow')

#### Pushing data to Hugging Face

In [20]:
load_dotenv()  # Loads the variables from .env into os.environ
hf_api_key = os.getenv("HF_API_KEY", "")

# Fail fast when the key is missing: saving an empty token would replace any
# token already stored locally, and the upload would then fail with a much
# less obvious authentication error.
if not hf_api_key:
    raise RuntimeError(
        "HF_API_KEY is not set; add it to .env or the environment before uploading."
    )

# Store the API token locally
HfFolder.save_token(hf_api_key)

# Initialize the HfApi object
api = HfApi()

# Upload the Parquet file to the dataset repository. The call is kept as the
# cell's last expression so the returned commit info is displayed as output.
api.upload_file(
    token=hf_api_key,
    path_or_fileobj="SnoopDog_QA.parquet",
    path_in_repo="data/SnoopDog_QA.parquet",
    repo_id="mehdiselbi/snoopdogg-QA",
    repo_type="dataset"
)

SnoopDog_QA.parquet: 100%|██████████| 11.0k/11.0k [00:00<00:00, 58.5kB/s]


CommitInfo(commit_url='https://huggingface.co/datasets/mehdiselbi/snoopdogg-QA/commit/f6625df6675b44864d967058868b49e6efad127a', commit_message='Upload data/SnoopDog_QA.parquet with huggingface_hub', commit_description='', oid='f6625df6675b44864d967058868b49e6efad127a', pr_url=None, pr_revision=None, pr_num=None)