In [15]:
# Execution switches for the guarded cells of this notebook.
# When True, the "Load Raw Data" cell downloads the raw topics into raw_data/discourse/.
save_raw_data = False
# When True, the last cell writes the train/test splits to data/*.parquet.
save_data = True

## Imports and Utils

In [16]:
from datetime import datetime as datetime_cls
from itertools import groupby
import os
import random
from random import shuffle

import pyarrow
import pyarrow.parquet as pq
import urllib3

## Load Raw Data

In [17]:
if save_raw_data:

    # Download the raw text of every topic id in [0, 5000) from the forum and
    # cache it on disk, one file per topic.
    base_url = "https://community.simplefoc.com/"

    # open() does not create missing parent directories.
    os.makedirs("raw_data/discourse", exist_ok=True)

    for i in range(5000):
        raw_topic_fp = f"raw_data/discourse/topic_{i}.txt"

        # Skip topics that are already cached *before* issuing the request, so
        # re-running the cell does not hit the server again for nothing.
        if os.path.exists(raw_topic_fp):
            continue

        topic_url = base_url + f"raw/{i}"
        resp = urllib3.request("GET", topic_url)

        # Any non-200 status (e.g. an id with no topic behind it) is ignored.
        if resp.status == 200:
            # The payload is decoded as UTF-8, so write it back as UTF-8 rather
            # than with the platform default encoding.
            with open(raw_topic_fp, "w", encoding="utf-8") as f:
                f.write(resp.data.decode("utf-8"))

## Clean the data and save it in Parquet format, mirroring the [UltraChat 200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/tree/main) dataset

In [18]:
# Authors whose posts get the "assistant" role; every other author is a "user".
ASSISTANT_USERNAMES = frozenset({
    "Antun_Skuric", "runger", "Valentine", "Owen_Williams", "JorgeMaker",
    "David_Gonzalez", "o_lampe", "VIPQualityPost", "Grizzly", "Candas1",
})

# Line that delimits two consecutive posts inside a raw topic file.
POST_SEPARATOR = "\n-------------------------\n"


def parse_topic(raw_topic):
    """Turn the raw text of one topic into a list of chat messages.

    Parameters
    ----------
    raw_topic : str
        Raw topic text. Posts are delimited by ``POST_SEPARATOR``; the first
        line of each post is a ``username | date | #number`` header and the
        post content starts two lines below it.

    Returns
    -------
    list of dict or None
        One ``{"role", "username", "content"}`` dict per message, where
        consecutive posts by the same author are merged into one message.
        ``None`` when the topic is filtered out: two posts or fewer, first
        post written by an assistant username, or fewer than two messages
        left after merging.

    Raises
    ------
    ValueError
        If a post header does not have exactly three ``|``-separated fields,
        or if its date or number cannot be parsed.
    """
    # Remove empty posts (such as the segment left after the last separator).
    # Filtering, unlike list.remove("\n"), does not raise when there is no
    # such segment and guarantees that no empty string reaches the parser.
    raw_posts = [post for post in raw_topic.split(POST_SEPARATOR) if post.strip()]

    # NOTE(review): the original comment said "remove topic with only one
    # post", but this threshold also drops two-post topics. Kept unchanged;
    # confirm which one is intended.
    if len(raw_posts) <= 2:
        return None

    topic_messages = []
    for raw_post in raw_posts:

        # Remove a single leading and a single trailing newline.
        if raw_post.startswith("\n"):
            raw_post = raw_post[1:]
        if raw_post.endswith("\n"):
            raw_post = raw_post[:-1]

        # Isolate post role, username, date, number and content.
        lines = raw_post.split("\n")
        username, posted_at, number = lines[0].split("|")
        username = username.strip(" ")
        role = "assistant" if username in ASSISTANT_USERNAMES else "user"
        # The date is parsed only to validate the header; it is not stored.
        # strip(" UTC") removes those characters from both ends of the field.
        datetime_cls.strptime(posted_at.strip(" UTC"), '%Y-%m-%d %H:%M:%S')
        number = int(number.strip(" #"))
        content = "\n".join(lines[2:])

        # Filter 1: discard topics created by moderators.
        if number == 1 and role != "user":
            return None

        # If the same username has provided several answers in a row, concatenate.
        if topic_messages and topic_messages[-1]["username"] == username:
            topic_messages[-1]["content"] += f"\n{content}"
        else:
            topic_messages.append({"role": role, "username": username, "content": content})

    # Filter 2: a conversation needs at least two messages.
    if len(topic_messages) < 2:
        return None

    return topic_messages


topics = []
for i in range(5000):
    raw_topic_fp = f"raw_data/discourse/topic_{i}.txt"

    # Topic ids without a raw file on disk are skipped.
    if not os.path.exists(raw_topic_fp):
        continue

    # The raw topics are UTF-8 text; do not rely on the platform default.
    with open(raw_topic_fp, "r", encoding="utf-8") as f:
        topic_messages = parse_topic(f.read())

    # parse_topic returns None for topics that do not pass our filters.
    if topic_messages is not None:
        topics.append(topic_messages)

In [19]:
# Number of topics kept by the cleaning cell above.
print(len(topics))

1299


In [14]:
if save_data:

    # pq.write_table does not create missing parent directories.
    os.makedirs("data", exist_ok=True)

    # train/test splits (80% / 20%).
    # Shuffle a copy with a dedicated seeded generator: the split is then
    # reproducible, `topics` keeps its order, and re-running this cell yields
    # the same files.
    shuffled_topics = list(topics)
    random.Random(42).shuffle(shuffled_topics)
    n_train = int(0.8 * len(shuffled_topics))

    # Save train and test splits in parquet format, one "messages" column
    # holding the list of message dicts of each topic.
    train_table = pyarrow.Table.from_arrays([shuffled_topics[:n_train]], names=["messages"])
    pq.write_table(train_table, 'data/train_sft.parquet')
    test_table = pyarrow.Table.from_arrays([shuffled_topics[n_train:]], names=["messages"])
    pq.write_table(test_table, 'data/test_sft.parquet')