In [1]:
import os
os.chdir("../data")
print("Current working directory:", os.getcwd())

Current working directory: d:\WORK\Personal\Multi-Model-LLM-Router\data


In [2]:
import os
import json
import csv
import re
import random
import spacy
import pandas as pd

# Constants
SLEEP_DATA_FILE = "raw/training_qna_sleep.json"
CAR_DATA_FILE = "raw/training_qna_car.json"
OUTPUT_FOLDER = "processed"
TRAIN_SLEEP_FILE = "train_sleep.csv"
TRAIN_CAR_FILE = "train_car.csv"
TEST_SLEEP_FILE = "test_sleep.csv"
TEST_CAR_FILE = "test_car.csv"
TRAIN_RATIO = 0.9

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading en_core_web_sm model...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


def clean_text(text: str) -> str:
    return " ".join(token.text for token in nlp(text) if not token.is_space)


def process_json_file(json_file, output_folder, train_file, test_file, train_ratio):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Load data
    with open(json_file, "r") as f:
        data = json.load(f)

    # Shuffle the data
    random.shuffle(data["qna"])

    # Split data into training and testing sets
    split_index = int(len(data["qna"]) * train_ratio)
    train_data = data["qna"][:split_index]
    test_data = data["qna"][split_index:]

    def format_data(data):
        formatted = []
        for item in data:
            question = clean_text(item["question"])
            answer = clean_text(item["answer"])
            messages = [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
            formatted.append({"messages": messages})
        return formatted

    def save_to_csv(data, csv_file):
        with open(
            os.path.join(output_folder, csv_file),
            mode="w",
            newline="",
            encoding="utf-8",
        ) as file:
            writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL, escapechar="\\")
            writer.writerow(["messages"])
            for row in data:
                writer.writerow([json.dumps(row, ensure_ascii=False)])

    # Format and save training and testing data
    save_to_csv(format_data(train_data), train_file)
    save_to_csv(format_data(test_data), test_file)

    print(f"Training data saved to {os.path.join(output_folder, train_file)}.")
    print(f"Testing data saved to {os.path.join(output_folder, test_file)}.")


# Process sleep and car files separately
process_json_file(
    SLEEP_DATA_FILE, OUTPUT_FOLDER, TRAIN_SLEEP_FILE, TEST_SLEEP_FILE, TRAIN_RATIO
)
process_json_file(
    CAR_DATA_FILE, OUTPUT_FOLDER, TRAIN_CAR_FILE, TEST_CAR_FILE, TRAIN_RATIO
)

Training data saved to processed\train_sleep.csv.
Testing data saved to processed\test_sleep.csv.
Training data saved to processed\train_car.csv.
Testing data saved to processed\test_car.csv.


In [3]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
import numpy as np
from scipy import stats
import re

# Load the data
train_sleep = pd.read_csv("processed/train_sleep.csv")
train_car = pd.read_csv("processed/train_car.csv")

# Combine the datasets
all_data = pd.concat([train_sleep, train_car])

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def extract_text(row):
    try:
        messages = json.loads(row)["messages"]
        return " ".join([clean_text(msg["content"]) for msg in messages])
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic JSON: {row}")
        return ""


# Apply text extraction and get token lengths
all_data["text"] = all_data["messages"].apply(extract_text)
all_data["token_length"] = all_data["text"].apply(lambda x: len(tokenizer.encode(x)))

# Filter out rows with empty text
all_data = all_data[all_data["text"] != ""]

# Plot token length distribution
plt.figure(figsize=(12, 6))
sns.histplot(all_data["token_length"], kde=True)
plt.title("Distribution of Token Lengths")
plt.xlabel("Token Length")
plt.ylabel("Frequency")
plt.savefig("token_length_distribution.png")
plt.close()

# Plot normal probability plot
plt.figure(figsize=(12, 6))
stats.probplot(all_data["token_length"], dist="norm", plot=plt)
plt.title("Normal Probability Plot of Token Lengths")
plt.savefig("token_length_qq_plot.png")
plt.close()

# Calculate and print statistics
mean_length = np.mean(all_data["token_length"])
median_length = np.median(all_data["token_length"])
std_dev = np.std(all_data["token_length"])

print(f"Mean token length: {mean_length:.2f}")
print(f"Median token length: {median_length:.2f}")
print(f"Standard deviation of token length: {std_dev:.2f}")

# Plot box plot
plt.figure(figsize=(12, 6))
sns.boxplot(x=all_data["token_length"])
plt.title("Box Plot of Token Lengths")
plt.xlabel("Token Length")
plt.savefig("token_length_boxplot.png")
plt.close()

# Create a summary DataFrame
summary = pd.DataFrame(
    {
        "Statistic": ["Mean", "Median", "Standard Deviation"],
        "Value": [mean_length, median_length, std_dev],
    }
)

# Save summary to CSV
summary.to_csv("token_length_summary.csv", index=False)

print("Plots and summary have been saved.")

  from .autonotebook import tqdm as notebook_tqdm


Error decoding JSON: Expecting ',' delimiter: line 1 column 430 (char 429)
Problematic JSON: {"messages": [{"role": "user", "content": "How does sleep quality affect cognitive function , memory consolidation , and learning capacity ?"}, {"role": "assistant", "content": "Sleep quality affects cognitive function , memory consolidation , and learning capacity in several ways . A good sleep quality , particularly slow - wave sleep , is considered important for memory consolidation , which is sometimes referred to as \\" sleep - dependent memory processing . \\" Impaired memory consolidation has been seen in individuals with primary insomnia , who do not perform as well as those who are healthy in memory tasks following a period of sleep . Slow - wave sleep improves declarative memory , including semantic and episodic memory . On the other hand , chronic sleep deficiency or insomnia can lead to high anxiety , irritability , erratic behavior , poor cognitive functioning and performance , and

In [4]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get the Hugging Face token from the environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("Please set the HF_TOKEN environment variable in the .env file")

# Save the token to the Hugging Face folder
HfFolder.save_token(hf_token)

# Define the file paths
train_sleep_file = "processed/train_sleep.csv"
test_sleep_file = "processed/test_sleep.csv"
train_car_file = "processed/train_car.csv"
test_car_file = "processed/test_car.csv"

# Function to load and convert CSV to Dataset
def load_dataset(train_file, test_file):
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    return DatasetDict({"train": train_dataset, "test": test_dataset})


# Load the sleep and car datasets
sleep_dataset = load_dataset(train_sleep_file, test_sleep_file)
car_dataset = load_dataset(train_car_file, test_car_file)

# Push the sleep dataset to Hugging Face Hub
sleep_dataset.push_to_hub("thinkersloop/sleep-dataset-llm", private=False)
print("Sleep dataset successfully pushed to the Hugging Face Hub!")

# Push the car dataset to Hugging Face Hub
car_dataset.push_to_hub("thinkersloop/car-dataset-llm", private=False)
print("Car dataset successfully pushed to the Hugging Face Hub!")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 997.69ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<?, ?ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]


Sleep dataset successfully pushed to the Hugging Face Hub!


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1001.27ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 999.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s]


Car dataset successfully pushed to the Hugging Face Hub!
