In [None]:
# Install necessary libraries
!pip install inflect
!pip install -q datasets
!pip install --upgrade huggingface_hub

In [None]:
# Import libraries
import json
import re
import inflect
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
from huggingface_hub import login
from google.colab import userdata

# Disable column-width truncation so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Authenticate with Hugging Face Hub.
# The token is fetched through google.colab's `userdata.get` under the key
# 'HF_TOKEN' rather than being written into the notebook; HF_TOKEN is kept as
# a module-level name because the upload cell passes it on.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [None]:
def character_repetition(text):
    """
    Collapse over-long runs of a repeated character.

    An ASCII letter repeated three or more times in a row is shortened to two
    copies, and a punctuation mark from the set in the second pattern that is
    repeated two or more times is shortened to a single copy. Matching is
    case-sensitive, so "AAAaaa" is treated as two separate runs.

    Args:
        text (str): Text to normalise.

    Returns:
        str: The text with the over-long runs shortened.
    """
    # Letters first: "sooo" -> "soo".
    letters_collapsed = re.sub(r"([A-Za-z])\1{2,}", r"\1\1", text)

    # Then punctuation: "!!!" -> "!".
    return re.sub(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}', r'\1', letters_collapsed)

def extract_from_openai_summarize_comparisons_dataset(row):
    """
    Split a dataset row's prompt into post body, subreddit and title.

    The prompt is passed through `clean_text` and then cut at the first
    occurrence of the keyword "POST". Everything before the keyword is searched
    for the "SUBREDDIT" and "TITLE" fields; everything after it is the post.

    Args:
        row (pd.Series): A row from the dataset; only row["prompt"] is read.

    Returns:
        pd.Series: Three values in order: post content, subreddit, title.
            When "POST" is absent the original prompt is returned together
            with "unknown" for both labels. When "POST" is present but a field
            cannot be located, that field is None.
    """
    header, keyword, body = clean_text(row["prompt"]).partition("POST")

    # No "POST" keyword: hand back the untouched prompt with placeholder labels.
    if not keyword:
        return pd.Series([row["prompt"], "unknown", "unknown"])

    def _normalised_group(match):
        # First capture group, trimmed and lower-cased, or None without a match.
        return match.group(1).strip().lower() if match else None

    # Post body: carriage returns become spaces, then long character runs are
    # shortened.
    post = character_repetition(re.sub(r'\r', ' ', body))

    # The subreddit is whatever sits between "SUBREDDIT" and "TITLE"; the title
    # is the remainder of the header after "TITLE".
    subreddit_value = _normalised_group(
        re.search(r'SUBREDDIT\s+(.*?)(?=\s+TITLE)', header)
    )
    title_value = _normalised_group(re.search(r'TITLE\s*(.*)', header))

    return pd.Series([post, subreddit_value, title_value])


In [None]:
def remove_tldr(text):
    """
    Strip every occurrence of the marker 'TL;DR: ' from the text.

    The match is exact and case-sensitive, and it includes the trailing space,
    so variants such as 'TL;DR:' without the space are left in place.

    Args:
        text (str): Input text.

    Returns:
        str: The text without the marker.
    """
    marker = "TL;DR: "
    stripped = text.replace(marker, "")
    return stripped

def remove_citizens_for_the_republic(text):
    """
    Strip every occurrence of 'Citizens for the Republic' from the text.

    The match is exact and case-sensitive; surrounding whitespace is not
    touched, so removing the phrase from the middle of a sentence leaves the
    spaces on both sides in place.

    Args:
        text (str): Input text.

    Returns:
        str: The text without the phrase.
    """
    phrase = "Citizens for the Republic"
    stripped = text.replace(phrase, "")
    return stripped

def remove_html_tags(sentence):
    """
    Delete anything that looks like an HTML tag and trim the result.

    A "tag" here is the shortest span that starts with '<' and ends with '>'
    on the same line; no HTML parsing is performed.

    Args:
        sentence (str): Input sentence.

    Returns:
        str: The sentence without the tag-like spans and without leading or
            trailing whitespace.
    """
    without_tags = re.sub(r"<.*?>", "", sentence)
    return without_tags.strip()

# Shared inflect engine, created on first use by _get_inflect_engine().
_INFLECT_ENGINE = None


def _get_inflect_engine():
    """Return the shared inflect engine, creating it the first time it is needed."""
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()
    return _INFLECT_ENGINE


def clean_text(text):
    """
    Cleans text by removing special characters, HTML tags, and converting numbers to words.

    Steps, in order: strip HTML-like tags, turn line breaks into spaces, drop
    '#' characters, drop the "r/" prefix, drop anything inside square brackets,
    drop every character that is neither a word character nor whitespace, and
    finally spell out each standalone run of digits.

    Args:
        text (str): Input text.

    Returns:
        str: Cleaned text.
    """
    text = remove_html_tags(text)
    text = re.sub(r'\n|\r', ' ', text)  # Replace new lines with space
    text = re.sub(r'[#]', '', text)  # Remove hash characters
    text = re.sub(r'\br/\b', '', text)  # Remove "r/" prefix
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside square brackets
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    # Convert numbers to words. The engine is looked up per match but only
    # ever constructed once, instead of being rebuilt for every number found.
    # NOTE(review): this runs after punctuation removal, so any hyphens or
    # commas in the spelled-out numbers (e.g. "twenty-one") stay in the result.
    def number_to_words(match):
        return _get_inflect_engine().number_to_words(int(match.group(0)))

    text = re.sub(r'\b\d+\b', number_to_words, text)

    return text

def get_from_openai_summarize_comparisons(dataset_names=None):
    """
    Loads dataset splits from Hugging Face and returns a pandas DataFrame.

    Every requested split is validated before anything is read. The splits are
    then read and concatenated once, with a fresh 0..n-1 index so that row
    labels are unique across splits (label-based slicing such as
    ``df.loc[:100]`` fails on repeated labels).

    Args:
        dataset_names (list, optional): List of dataset splits (e.g., ["train", "test"]). Defaults to all.

    Returns:
        pd.DataFrame: Combined dataset with 'prompt', 'chosen', and 'rejected' columns.
            An empty DataFrame when `dataset_names` is an empty list.

    Raises:
        ValueError: If a requested split name is not one of the known splits.
    """
    if dataset_names is None:
        dataset_names = ["train", "test", "valid1", "valid2"]

    splits = {
        'train': 'data/train-00000-of-00001-3cbd295cedeecf91.parquet',
        'test': 'data/test-00000-of-00001-0845e2eec675b16a.parquet',
        'valid1': 'data/valid1-00000-of-00001-b647616a2be5f333.parquet',
        'valid2': 'data/valid2-00000-of-00001-2655c5b3621b6116.parquet'
    }

    # Validate everything up front so a typo does not surface after a download.
    for dataset_name in dataset_names:
        if dataset_name not in splits:
            raise ValueError(f"Invalid dataset name: {dataset_name}. Choose from {list(splits.keys())}.")

    frames = [
        pd.read_parquet("hf://datasets/CarperAI/openai_summarize_comparisons/" + splits[dataset_name])
        for dataset_name in dataset_names
    ]

    # pd.concat refuses an empty list; keep returning an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # Concatenate once (not inside the loop) and renumber the rows.
    return pd.concat(frames, ignore_index=True)

def preprocess_openai_summarize_comparisons_dataset(df, topic_list=None):
    """
    Preprocesses the dataset by extracting structured information and cleaning the summaries.

    The 'prompt' column is replaced by the extracted post content, and 'topic'
    and 'title' columns are derived from the prompt header. The 'chosen' and
    'rejected' summaries have the 'TL;DR: ' marker removed, are cleaned with
    `clean_text`, and have the 'Citizens for the Republic' phrase removed.

    Args:
        df (pd.DataFrame): Input dataset with 'prompt', 'chosen' and 'rejected' columns.
        topic_list (list, optional): List of topics to filter. Defaults to None,
            which keeps every row.

    Returns:
        pd.DataFrame: Preprocessed dataset with the 'prompt', 'chosen' and
            'rejected' columns. The input frame is not modified.
    """
    processed = df.copy()
    processed[["prompt", "topic", "title"]] = df.apply(
        extract_from_openai_summarize_comparisons_dataset, axis=1
    )

    for column in ["chosen", "rejected"]:
        processed[column] = (
            processed[column]
            # Must run before clean_text: clean_text strips ';' and ':', after
            # which the literal "TL;DR: " can no longer be found.
            .apply(remove_tldr)
            .apply(clean_text)
            .apply(remove_citizens_for_the_republic)
        )

    if topic_list is not None:
        processed = processed[processed["topic"].isin(topic_list)]

    return processed[["prompt", "chosen", "rejected"]]

def plot_topic_distribution(df):
    """
    Draw a horizontal bar chart of how often each topic occurs.

    The figure is saved as "topic_distribution.png" and then shown.

    Args:
        df (pd.DataFrame): DataFrame containing a 'topic' column.
    """
    topic_counts = df["topic"].value_counts()

    # pandas creates the figure; keep hold of the Axes it returns and label
    # the chart through it.
    ax = topic_counts.plot(kind="barh", figsize=(10, 6))
    ax.set_title("Topic Distribution", fontsize=14)
    ax.set_xlabel("Frequency", fontsize=12)
    ax.set_ylabel("Topic", fontsize=12)

    ax.get_figure().savefig("topic_distribution.png")
    plt.show()


def upload_dataset_to_huggingface(dataset, repo_name, token, private=None):
    """
    Upload a dataset to the Hugging Face Hub.

    Args:
        dataset (Dataset or pd.DataFrame): Data to upload. A pandas DataFrame
            is converted to a Hugging Face Dataset first (without its index),
            because only the latter provides `push_to_hub`.
        repo_name (str): Hugging Face repository name, e.g. "username/repo_name".
        token (str): Hugging Face access token, forwarded to `push_to_hub`.
        private (bool, optional): Visibility to request for the repository.
            Defaults to None, in which case the argument is not forwarded and
            the library's own default applies.

    Returns:
        str: Repository URL.
    """
    if isinstance(dataset, pd.DataFrame):
        # preserve_index=False keeps the row labels from becoming a data column.
        dataset = Dataset.from_pandas(dataset, preserve_index=False)

    push_kwargs = {"repo_id": repo_name, "token": token}
    if private is not None:
        push_kwargs["private"] = private

    # Push the dataset to the Hugging Face Hub.
    dataset.push_to_hub(**push_kwargs)

    print(f"Dataset '{repo_name}' successfully uploaded to Hugging Face Hub!")
    return f"https://huggingface.co/datasets/{repo_name}"

In [None]:
# Load the train split and both validation splits into one DataFrame.
df = get_from_openai_summarize_comparisons(["train", "valid1", "valid2"])

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
df.value_counts("topic")

In [None]:
# Topic Distribution Visualization
plot_topic_distribution(df)

In [None]:
# Preprocess the first 101 rows and keep only posts from the "relationships" topic.
# Positional slicing does not depend on the index labels being unique; a label
# slice such as df.loc[:100] raises when the label 100 occurs more than once.
# NOTE: this rebinds `df`, so re-run the load cell before re-running this one.
df = preprocess_openai_summarize_comparisons_dataset(df.iloc[:101], ["relationships"])

In [None]:
# Preview the preprocessed, topic-filtered rows.
df.head()

In [None]:
# Enter token and file information
pkl_file_path = "dataset.pkl"  # Path to the .pkl file to upload
repo_name = "username/repo_name"  # Hugging Face repository name
# Upload Dataset to Hugging Face Hub.
# `push_to_hub` is a method of datasets.Dataset, not of a pandas DataFrame, so
# convert first; preserve_index=False keeps the row labels out of the upload.
upload_dataset_to_huggingface(Dataset.from_pandas(df, preserve_index=False), repo_name, HF_TOKEN)