In [27]:
import pandas as pd
import re

def format_instructions(text):
    """Normalise the speaker labels and markup of one raw transcript.

    The following steps are applied in order:

    1. If the first line starts with ``##`` (a markdown heading) it is removed.
    2. The Catalan labels ``Usuari:`` / ``IA:`` become ``User:`` / ``AI:``.
    3. List numbering (``1. User:``, optionally preceded by ``>``) and ``*``
       emphasis around the labels are stripped, leaving ``User: `` / ``AI: ``.
    4. Every ``---`` separator is deleted.

    Args:
        text: The raw transcript. Values that are not ``str`` (for example
            the NaN pandas uses for an empty CSV cell) are returned unchanged
            instead of raising.

    Returns:
        The cleaned transcript, or ``text`` itself when it is not a string.
    """
    # Empty CSV cells arrive as float NaN; pass them through so that
    # Series.apply does not crash on a single missing value.
    if not isinstance(text, str):
        return text

    # Drop a leading markdown heading line, if present.
    first_line, _, remainder = text.partition('\n')
    if first_line.startswith('##'):
        text = remainder

    text = re.sub(r'Usuari:', 'User:', text)
    # Only rewrite a stand-alone "IA:" label. Without the look-behind the
    # substitution also fires inside longer upper-case words such as "MAGIA:".
    text = re.sub(r'(?<![^\W_])IA:', 'AI:', text)

    text = re.sub(r'>*\d+\.\s*User:\s*', 'User: ', text)
    text = re.sub(r'\**User:\s*\**', 'User: ', text)
    text = re.sub(r'\**AI:\s*\**', 'AI: ', text)

    text = re.sub(r'---', '', text)

    return text

# Read the raw instruction transcripts from disk.
df = pd.read_csv('revised_instructs_ultra.csv')

# Clean every transcript in the 'Instructions' column with format_instructions.
df = df.assign(Instructions=df['Instructions'].apply(format_instructions))

# NOTE: the formatted DataFrame is not written to disk in this cell; it stays in memory as `df`.



In [7]:
# Sanity check: (rows, columns) of `df` at this point.
df.shape

(4535, 3)

In [28]:
import numpy as np

def drop_half(df, name, random_state=None):
    """Return ``df`` without a randomly chosen half of the rows of one type.

    Rows whose ``'Type'`` column equals ``name`` are counted, and
    ``count // 2`` of them (picked without replacement) are removed. Rows of
    every other type are kept. The input frame is not modified.

    Args:
        df: DataFrame with a ``'Type'`` column.
        name: The ``'Type'`` value to down-sample.
        random_state: Optional integer seed. When given, the selection is
            drawn from a private ``np.random.RandomState(random_state)`` and
            is therefore reproducible. When ``None`` (the default) NumPy's
            global random state is used, so the result depends on any
            earlier ``np.random.seed`` call.

    Returns:
        A new DataFrame with the selected rows dropped.
    """
    list_gen_entries = df[df['Type'] == name]
    num_to_drop = len(list_gen_entries) // 2

    # Keep the default path on the global RNG so existing callers that seed
    # it themselves keep getting the same draws.
    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.RandomState(random_state).choice

    indices_to_drop = choose(list_gen_entries.index, size=num_to_drop, replace=False)
    return df.drop(indices_to_drop)

# Seed NumPy's global random state so that the rows removed below are the
# same on every run (drop_half draws from the global state by default).
np.random.seed(42)

# Instruction types that are down-sampled to half their size, in the order
# in which they are processed.
TYPES_TO_HALVE = [
    'List generation',
    'Open QA',
    'Format Following',
    'Learning and Educational Resources',
    'Step-by-Step Guidance',
    'Brainstorming',
    'Meta Reasoning',
    'Specific Constraints',
    'Plan Creation',
    'Generation',
    'Question Generation',
]

# NOTE: this rebinds `df`, so re-running the cell halves the types again.
for type_name in TYPES_TO_HALVE:
    df = drop_half(df, type_name)

In [29]:
# Sanity check: (rows, columns) of `df` after the drop_half calls.
df.shape

(4164, 3)

In [30]:
# Second source of instructions; only some of its types are carried over.
df2 = pd.read_csv('revised_instructs2.csv')

# Types whose rows are taken from df2 in full.
types_to_keep = [
    'Explanation',
    'Time-Bound',
    'Math World Problems',
    'Factual Recall',
    'Chat',
    'Translation',
    'Creative writing',
    'Code Generation',
    'Closed QA',
    'Semantics Questions',
    'Jokes and Riddles',
]

# Rows of df2 that belong to one of the fully kept types.
is_kept_type = df2['Type'].isin(types_to_keep)
filtered_df2 = df2.loc[is_kept_type]

# 'Generation' is treated separately: only a fixed-seed random half is used.
generation_entries = df2.loc[df2['Type'].eq('Generation')]
num_to_add = len(generation_entries) // 2
generation_sample = generation_entries.sample(n=num_to_add, random_state=42)

# Everything selected from df2: fully kept types plus the 'Generation' sample.
final_filtered_df2 = pd.concat([filtered_df2, generation_sample])

# Stack the df2 selection underneath `df` and renumber the rows from 0.
combined_df = pd.concat([df, final_filtered_df2], ignore_index=True)



In [31]:
# Save the combined DataFrame to a new CSV file
# (index=False: the row index is not written out as an extra column).
combined_df.to_csv('combined_dataset.csv', index=False)

In [32]:
import pandas as pd
import re

# Load the CSV file into a DataFrame
# (this rebinds `df` to the contents of 'combined_dataset.csv' as read from disk).
df = pd.read_csv('combined_dataset.csv')

def split_interactions(text):
    """Split a formatted transcript into ``[prompt, completion]`` pairs.

    The text is cut at every ``User:`` / ``AI:`` label. Each ``User:`` turn
    starts a new prompt and each ``AI:`` turn sets the completion of the
    current prompt. A pair is emitted only when both parts are non-empty
    after stripping whitespace. Consequently:

    * text before the first label is ignored;
    * a ``User:`` turn that is followed by another ``User:`` turn without an
      ``AI:`` turn in between is discarded;
    * when several ``AI:`` turns follow one prompt, the last one wins.

    Args:
        text: The transcript. Non-string values (for example the NaN that
            ``read_csv`` produces for an empty cell) yield an empty list
            instead of raising inside ``re.split``.

    Returns:
        A list of two-element lists ``[prompt, completion]``.
    """
    if not isinstance(text, str):
        return []

    # Because the pattern has a capturing group, re.split returns
    # [preamble, label, body, label, body, ...].
    pieces = re.split(r'(User:|AI:)', text)
    user_ai_pairs = []
    current_prompt = ""
    current_completion = ""

    for label, body in zip(pieces[1::2], pieces[2::2]):
        body = body.strip()
        if label == 'User:':
            # A new prompt begins: flush the previous exchange if complete.
            if current_prompt and current_completion:
                user_ai_pairs.append([current_prompt, current_completion])
            current_prompt = body
            current_completion = ""
        else:  # label == 'AI:'
            current_completion = body

    # Flush the final exchange, if it is complete.
    if current_prompt and current_completion:
        user_ai_pairs.append([current_prompt, current_completion])

    return user_ai_pairs

# One output row per (prompt, completion) pair found in each transcript.
new_data = []

for _, source_row in df.iterrows():
    for user_text, ai_text in split_interactions(source_row['Instructions']):
        # Start from a copy of the source row so its other columns carry over.
        pair_row = source_row.copy()
        pair_row['prompt'] = user_text
        pair_row['completion'] = ai_text
        new_data.append(pair_row)

# Assemble the rows and drop the original 'Instructions' column, which is
# now represented by the 'prompt' and 'completion' columns.
new_df = pd.DataFrame(new_data).drop(columns=['Instructions'])

# Write the pairs to disk without the row index.
new_df.to_csv('prompts_instructs_ultra.csv', index=False)


In [16]:
# Discard rows where either side of the exchange is missing.
new_df = new_df.dropna(subset=['prompt', 'completion'])

In [17]:
# Sanity check: (rows, columns) of `new_df` at this point.
new_df.shape

(6625, 4)

In [22]:
from fuzzywuzzy import fuzz

# Rows kept by the de-duplication loop below (rebound to a fresh list here).
new_data = []
# Prompts and completions of the kept rows, in matching order; every later
# row is compared against these.
seen_prompts = []
seen_completions = []

def is_similar(text1, text2, threshold=90):
    """Tell whether two strings are near-duplicates.

    The strings are scored with ``fuzz.ratio``; they count as similar only
    when the score is strictly greater than ``threshold``.
    """
    score = fuzz.ratio(text1, text2)
    return score > threshold

# Keep a row unless BOTH its prompt and its completion are similar to those
# of one already-kept row. Each row is compared against all kept rows.
for _, candidate in new_df.iterrows():
    prompt = candidate['prompt']
    completion = candidate['completion']
    for seen_prompt, seen_completion in zip(seen_prompts, seen_completions):
        if is_similar(prompt, seen_prompt) and is_similar(completion, seen_completion):
            print("one found")
            break
    else:
        # No kept row matched: keep this one and add it to the comparison pool.
        new_data.append(candidate.copy())
        seen_prompts.append(prompt)
        seen_completions.append(completion)

one found
one found
one found
one found
one found
one found
one found
one found
one found
one found
one found
one found
one found


In [23]:
# Rebuild `new_df` from the rows collected in `new_data`.
new_df = pd.DataFrame(new_data)

In [25]:
# Write `new_df` to disk without the row index.
new_df.to_csv('prompts_instructs_dedup.csv', index=False)


In [34]:
from huggingface_hub import HfApi, HfFolder, Repository, DatasetCard, DatasetCardData
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd

In [35]:
# Load the CSV with the Hugging Face "csv" builder (used below via its 'train' split).
# NOTE(review): 'prompts_instructs_ultra_revised.csv' is not written by any cell
# in this notebook (which saves 'prompts_instructs_ultra.csv' and
# 'prompts_instructs_dedup.csv') - confirm where this file comes from.
hf_dataset = load_dataset("csv", data_files="prompts_instructs_ultra_revised.csv")


Generating train split: 0 examples [00:00, ? examples/s]

In [39]:
# Peek at one record (position 20000 of the 'train' split) to check the columns.
hf_dataset['train'][20000]

{'Type': 'Multiple-Choice',
 'Words': "['dualism', 'sun', 'studies', 'success', 'space']",
 'prompt': "Durant l'Imperi Romà, quina d'aquestes ciutats NO estava situada a la península Ibèrica?\nA)  Tarragona\nB)  Cartago\nC)  Sagunt\nD)  Empúries",
 'completion': 'B)  Cartago'}

In [40]:
# Shuffle with a fixed seed so the uploaded row order is reproducible.
hf_dataset = hf_dataset.shuffle(seed=42)

In [41]:
# Sanity check: (rows, columns) per split.
hf_dataset.shape

{'train': (41176, 4)}

In [43]:
from huggingface_hub import create_repo

# Upload hf_dataset to the Hub dataset repository named below.
hf_dataset.push_to_hub("pauhidalgoo/patufet-premium-instruct")

# Reached only if push_to_hub returned without raising.
print("Dataset uploaded successfully to Hugging Face!")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/386 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Dataset uploaded successfully to Hugging Face!
