# Loading Datasets from Hugging Face

In [1]:
import torch
from datasets import load_dataset

In [2]:
# Load the 'lmsys/lmsys-chat-1m' chat dataset with datasets.load_dataset.
# NOTE(review): access to this dataset may require Hub authentication — confirm before a fresh run.
dataset1=load_dataset('lmsys/lmsys-chat-1m')

In [3]:
# Take the 'train' split and drop the metadata columns listed below; the
# 'conversation' column read later by preprocess_function is not in the list.
responses1 = dataset1["train"].remove_columns(["conversation_id", "model", "turn", "language", "openai_moderation", "redacted"])

In [4]:
# Load the Bitext customer-support dataset with datasets.load_dataset.
dataset2=load_dataset('bitext/Bitext-customer-support-llm-chatbot-training-dataset')

In [5]:
# Drop the 'flags', 'category' and 'intent' columns; 'instruction' and 'response'
# (read later by preprocess_function1) are not in the list.
# Note: the result overwrites dataset2, so re-running this cell alone would try
# to remove columns that are already gone.
dataset2=dataset2.remove_columns(['flags','category','intent'])

In [6]:
# Work with the 'train' split of the customer-support dataset from here on.
response2=dataset2['train']

# Data Preprocessing

In [7]:
import re

def find_duplicates(paragraphs):
    """Count repeated items in ``paragraphs``.

    The first occurrence of an item is never counted; every later occurrence
    of an already-seen item counts as one duplicate.

    Args:
        paragraphs: iterable of hashable, sized items (strings in this notebook).

    Returns:
        A tuple ``(duplicate_elements, duplicate_chars)``: how many repeated
        items were found and the summed ``len()`` of those repeats.
    """
    seen = set()
    repeat_count = 0
    repeat_chars = 0
    for item in paragraphs:
        if item not in seen:
            # First sighting: remember it and move on.
            seen.add(item)
            continue
        repeat_count += 1
        repeat_chars += len(item)
    return repeat_count, repeat_chars

In [8]:
def paragraph_repetition_filter(text):
    """Decide whether ``text`` is free of excessive paragraph repetition.

    The text is split into paragraphs on runs of two or more newlines and the
    repeats are counted with ``find_duplicates``. The text is rejected when
    more than 20% of its paragraphs are repeats, or when the repeated
    paragraphs account for more than 20% of the characters of ``text``.

    Args:
        text: the string to inspect.

    Returns:
        True if the text passes (should be kept), False if it is too repetitive.
        Empty input returns True: there is nothing that could be repeated,
        which matches the result already produced for whitespace-only input.
    """
    # Guard first: with an empty string, len(text) is 0 and the character
    # ratio below would raise ZeroDivisionError.
    if not text:
        return True

    paragraphs = re.compile(r'\n{2,}').split(text.strip())
    paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)

    # split() always yields at least one element, so this division is safe.
    if paragraphs_duplicates / len(paragraphs) > 0.2:
        return False
    # The denominator is the unstripped text length, as in the original check.
    if char_duplicates / len(text) > 0.2:
        return False
    return True

In [9]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Newlines and tabs count as whitespace, so the result is always a single line.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

In [10]:
def preprocess_function(examples):
    """Turn a batch of conversations into one-line '### Role: content' transcripts.

    Args:
        examples: a batch mapping whose 'conversation' entry is a list of
            conversations; each conversation is a sequence of turns with
            'role' and 'content' keys.

    Returns:
        ``{"text": [...]}`` with exactly one string per input conversation.
        Conversations rejected by ``paragraph_repetition_filter`` yield an
        empty string so the output stays aligned with the input batch.
    """
    texts = []
    for conversation in examples['conversation']:
        pieces = []
        for turn in conversation:
            # Only 'user' maps to Human; every other role is labelled Assistant.
            role = 'Human' if turn['role'] == 'user' else 'Assistant'
            content = turn['content']
            pieces.append(f'### {role}: {content}\n')
        combined_text = ''.join(pieces)

        # The repetition check runs on the raw transcript, before whitespace
        # is collapsed, because it relies on blank lines to find paragraphs.
        keep = paragraph_repetition_filter(combined_text)
        texts.append(clean_text(combined_text) if keep else "")

    return {"text": texts}

In [11]:
def preprocess_function1(examples):
    """Build '### Human: ... ### Assistant: ...' text from instruction/response pairs.

    Works for both calling conventions of ``Dataset.map``:

    * a single example, where 'instruction' and 'response' are plain strings
      (the result is then a one-element list, exactly as before), and
    * a batch, where both fields are equal-length lists (one text per row).

    Args:
        examples: mapping with 'instruction' and 'response' entries.

    Returns:
        ``{"text": [...]}``; a pair rejected by ``paragraph_repetition_filter``
        contributes an empty string. Note that for a single example the stored
        value is a list such as ``[""]``, which is truthy even when the text
        inside it is empty.
    """
    instructions = examples['instruction']
    responses = examples['response']

    # Normalise the single-example case to a batch of one so that one code
    # path serves both; previously list-valued fields crashed in clean_text.
    if isinstance(instructions, str):
        instructions, responses = [instructions], [responses]

    preprocessed_texts = []
    for instruction, response in zip(instructions, responses):
        clean_instruction = clean_text(instruction)
        clean_response = clean_text(response)

        # The blank line between the two parts makes them separate paragraphs
        # for the repetition check.
        combined_text = f"### Human: {clean_instruction}\n\n### Assistant: {clean_response}"

        if paragraph_repetition_filter(combined_text):
            # A second clean collapses the blank-line separator into one space.
            preprocessed_texts.append(clean_text(combined_text))
        else:
            preprocessed_texts.append("")

    return {"text": preprocessed_texts}

In [12]:
# Run preprocess_function over the chat data in batches; it adds a 'text' column
# (an empty string marks a conversation rejected by the repetition filter).
preprocessed_dataset = responses1.map(preprocess_function, batched=True)

In [13]:
# Map preprocess_function1 one example at a time (map's default mode).
# The former `batch_size=True` argument was removed: batch_size expects an
# integer and is only consulted when batched=True, so it had no effect here.
preprocessed_dataset2 = response2.map(preprocess_function1)

In [14]:
# The raw 'conversation' column is no longer needed once 'text' has been built.
preprocessed_dataset=preprocessed_dataset.remove_columns(['conversation'])

In [15]:
# Drop the source columns now that they have been merged into 'text'.
preprocessed_dataset2=preprocessed_dataset2.remove_columns(['instruction','response'])

In [16]:
# Remove rows whose 'text' is empty, i.e. the placeholders preprocess_function
# wrote for conversations that failed the repetition filter.
preprocessed_dataset = preprocessed_dataset.filter(lambda row: bool(row['text']))

In [17]:
# Remove rows whose text is empty. After the unbatched map above, 'text' holds a
# one-element list, and a non-empty list such as [""] is always truthy, so the
# strings inside the list are tested rather than the list itself.
preprocessed_dataset2 = preprocessed_dataset2.filter(
    lambda example: bool(
        ''.join(example['text']) if isinstance(example['text'], list) else example['text']
    )
)

In [18]:
# Report how many chat conversations survive the repetition and empty-text filters.
print(f"Number of examples after preprocessing and filtering: {len(preprocessed_dataset)}")

Number of examples after preprocessing and filtering: 976932


In [19]:
# Same report for the customer-support dataset.
print(f"Number of examples after preprocessing and filtering: {len(preprocessed_dataset2)}")

Number of examples after preprocessing and filtering: 26872


In [20]:
import urllib
from fasttext import load_model
from fasttext.FastText import _FastText
import os

def english_language_filter(ds):
    """Keep only the rows of ``ds`` whose 'text' is detected as English.

    A fastText model is loaded from ``lid.176.bin`` in the current working
    directory; the file must already be there, as this function does not
    download it. A row is kept when the top predicted label is 'en' and its
    score is above 0.4.

    Args:
        ds: a dataset exposing ``filter`` and a string 'text' column.

    Returns:
        The filtered dataset.
    """
    model_path = os.path.join(os.getcwd(), "lid.176.bin")
    model = load_model(model_path)

    def is_english(x):
        """Return True when the row's text is predicted to be English."""
        # Replace newlines with a space (not an empty string) so that words on
        # adjacent lines are not fused together before detection.
        labels, scores = model.predict(x['text'].replace('\n', ' '))

        # No prediction at all means the language cannot be confirmed.
        if len(labels) == 0:
            return False

        # Labels look like '__label__en'; the third '__'-separated piece is the code.
        language = labels[0].split("__")[2]

        # Compare the scalar score so the predicate returns a plain bool
        # rather than a one-element array.
        return bool(scores[0] > 0.4 and language == 'en')

    # NOTE(review): load_from_cache_file=False / num_proc=1 are kept as-is —
    # presumably because the closure captures the loaded model; confirm.
    return ds.filter(is_english, load_from_cache_file=False, num_proc=1)

In [21]:
# Keep only the chat conversations detected as English.
preprocessed_dataset=english_language_filter(preprocessed_dataset)



Filter:   0%|          | 0/976932 [00:00<?, ? examples/s]

In [22]:
def transform_dataset(example):
    """Flatten an example's 'text' into one newline-free string.

    A list value is joined with single spaces; a string is used as-is. Every
    newline in the result is then replaced by a space.

    Args:
        example: mapping with a 'text' entry that is a string or a list of strings.

    Returns:
        ``{'text': <flattened string>}``.
    """
    raw = example['text']
    flattened = ' '.join(raw) if isinstance(raw, list) else raw
    return {'text': flattened.replace('\n', ' ')}

In [23]:
# Flatten list-valued 'text' entries into plain strings (and strip newlines) so
# the language filter below, which calls str.replace on 'text', can run on them.
preprocessed_dataset2=preprocessed_dataset2.map(transform_dataset)

In [24]:
# Keep only the customer-support examples detected as English.
preprocessed_dataset2=english_language_filter(preprocessed_dataset2)

Filter:   0%|          | 0/26872 [00:00<?, ? examples/s]

In [25]:
# Report how many chat conversations remain after the English-only filter.
print(f"Number of examples after removing other language conversation: {len(preprocessed_dataset)}")

Number of examples after removing other language conversation: 784328


In [26]:
# Same report for the customer-support dataset.
print(f"Number of examples after removing other language conversation: {len(preprocessed_dataset2)}")

Number of examples after removing other language conversation: 26872


In [27]:
# Shuffle the chat data with a fixed seed so the row order is repeatable.
preprocessed_dataset=preprocessed_dataset.shuffle(seed=10)

In [28]:
import numpy as np

def select_random_subset(dataset, num_samples, seed=None):
    """Return ``num_samples`` distinct, randomly chosen rows of ``dataset``.

    Args:
        dataset: object supporting ``len()`` and ``select(indices)``.
        num_samples: number of rows to draw, without replacement.
        seed: optional integer. When given, a dedicated generator seeded with
            it is used, so the same rows are chosen on every run. When omitted,
            NumPy's global random state is used, as before.

    Returns:
        Whatever ``dataset.select`` returns for the drawn indices.

    Raises:
        ValueError: if ``num_samples`` exceeds the number of rows.
    """
    total_rows = len(dataset)

    # Fail early with a message that names both numbers; NumPy would raise the
    # same exception type, but with a less specific message.
    if num_samples > total_rows:
        raise ValueError(
            f"Cannot sample {num_samples} rows from a dataset with only {total_rows} rows"
        )

    # A private RandomState keeps seeded calls independent of global RNG state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    random_indices = rng.choice(total_rows, num_samples, replace=False)

    return dataset.select(random_indices)

In [29]:
# Seed NumPy's global RNG before sampling: the subset is drawn from that RNG,
# so without a fixed seed a different set of 52,000 rows is picked on every run.
np.random.seed(10)
preprocessed_dataset = select_random_subset(preprocessed_dataset, 52000)

In [30]:
# Display both datasets to check their columns and row counts before merging.
preprocessed_dataset,preprocessed_dataset2

(Dataset({
     features: ['text'],
     num_rows: 52000
 }),
 Dataset({
     features: ['text'],
     num_rows: 26872
 }))

In [31]:
from datasets import concatenate_datasets

# Append the customer-support examples to the sampled chat examples.
# The result overwrites preprocessed_dataset, so re-running this cell on its own
# would append preprocessed_dataset2 a second time.
preprocessed_dataset = concatenate_datasets([preprocessed_dataset, preprocessed_dataset2])

In [32]:
# Shuffle again with a fixed seed so rows from the two sources are interleaved
# instead of sitting in two contiguous runs.
preprocessed_dataset=preprocessed_dataset.shuffle(seed=42)

# Chat Prompt Template for Llama 2

In [33]:
# Llama 2 chat models expect the following prompt template:
# <s>[INST] <<SYS>>System Prompt<</SYS>> User Prompt [/INST] Model Answer </s>

In [34]:
#Define a function to transform the data
def transform_conversation(example):
    """Rewrite a '### Human: ... ### Assistant: ...' transcript as Llama 2 chat text.

    Each Human turn followed by an Assistant turn becomes
    ``<s>[INST] {human} [/INST] {assistant} </s>``; the pieces are concatenated
    without a separator.

    Args:
        example: mapping whose 'text' entry is the flattened transcript.

    Returns:
        ``{'text': <reformatted string>}``. The string is empty when the
        transcript contains no complete Human/Assistant pair.

    Notes:
        * A Human turn with no Assistant reply after it (for example a trailing
          question) is dropped, since it has no answer to learn from.
        * An Assistant turn with no preceding Human turn is ignored.
    """
    # Split on the role markers themselves rather than on a bare '###', so that
    # markdown headings inside a message ('### Title') stay part of that message.
    # The capturing group makes re.split return
    # [preamble, role, content, role, content, ...].
    parts = re.split(r'###\s*(Human|Assistant):', example['text'])

    reformatted_segments = []
    pending_prompt = None
    for role, content in zip(parts[1::2], parts[2::2]):
        content = content.strip()
        if role == 'Human':
            # Hold the prompt until its answer shows up.
            pending_prompt = content
        elif pending_prompt is not None:
            # Llama 2 instruction tags are [INST] ... [/INST].
            reformatted_segments.append(
                f'<s>[INST] {pending_prompt} [/INST] {content} </s>'
            )
            pending_prompt = None

    return {'text': ''.join(reformatted_segments)}


In [35]:
# Convert every merged example to the Llama 2 chat format, one example at a time.
transformed_dataset=preprocessed_dataset.map(transform_conversation)

Map:   0%|          | 0/78872 [00:00<?, ? examples/s]

In [37]:
# Persist the final dataset; './' means the files are written into the current
# working directory.
# NOTE(review): consider a dedicated subdirectory so the dataset files do not
# mix with the notebook and model file — confirm what downstream loaders expect.
transformed_dataset.save_to_disk('./')

Saving the dataset (0/1 shards):   0%|          | 0/78872 [00:00<?, ? examples/s]