In [1]:
import pandas as pd
import ast
import re
from tqdm import tqdm

# 1. Loading the data
# Read the three dataset splits in a fixed order: train, test, validation.
train_df, test_df, val_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'validation.csv')
)

# Stack the 'dialog' column of every split into one sequence with a fresh 0..N-1 index
all_dialogs = pd.concat(
    [split['dialog'] for split in (train_df, test_df, val_df)],
    ignore_index=True,
)

print(f"Loaded {len(all_dialogs)} total conversations.")

# 2. Cleaning the data
# Patterns are compiled once here so the per-dialog loop does not repeat the lookup.
# Matches a contraction suffix that was split off its word, e.g. "It ' s", "don ' t".
_DETACHED_CONTRACTION_RE = re.compile(r" ' ([smtd]|re|ve|ll)\b")
# Matches sentence-ending punctuation (plus any whitespace after it) that is
# immediately followed by a letter, covering both "time?I" and "time? I".
_SENTENCE_BOUNDARY_RE = re.compile(r"([?.!])\s*(?=[a-zA-Z])")


def clean_dialog_robust(dialog_str):
    """Turn one stringified dialog into a list of cleaned sentences.

    Args:
        dialog_str: Text of a Python literal, normally a list of utterance
            strings such as ``"['Hi . How are you ?']"``.

    Returns:
        A list of stripped sentences, in order. Lines of one character or
        less are dropped. An empty list is returned when ``dialog_str``
        cannot be parsed as a Python literal (including non-string input
        such as a missing value).
    """
    # A. Parse the stringified list safely (no code execution). Only the parse
    #    step is guarded: an unparsable row is skipped, while an unexpected
    #    error in the cleaning steps below is allowed to surface.
    try:
        parsed = ast.literal_eval(dialog_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    # B. Flatten to a single string. Items are passed through str() so that a
    #    stray non-string element does not discard the whole dialog.
    if isinstance(parsed, list):
        full_text = " ".join(map(str, parsed))
    else:
        full_text = str(parsed)

    # C. Text normalisation
    # 1. Chinese full stop -> ASCII period
    text = full_text.replace('。', '.')

    # 2. Right single smart quote -> plain apostrophe; the spaced form is
    #    handled first so "don ’ t" collapses to "don't"
    text = text.replace(" ’ ", "'").replace("’", "'")

    # 3. Re-attach detached contractions (e.g. "It ' s" -> "It's", "Let ' s" -> "Let's")
    text = _DETACHED_CONTRACTION_RE.sub(r"'\1", text)

    # 4. Put a line break after every sentence end that is followed by a letter
    text = _SENTENCE_BOUNDARY_RE.sub(r"\1\n", text)

    # D. Split into lines, trim whitespace, drop empty/one-character leftovers
    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if len(line) > 1]

# 3. Clean every dialog and write the result, one sentence per line
output_filename = "final_training_data_refined.txt"

print("Processing and cleaning...")
with open(output_filename, 'w', encoding='utf-8') as f:
    # tqdm wraps the iteration to show a progress bar
    for dialog in tqdm(all_dialogs):
        # Each cleaned sentence gets its own newline-terminated line
        f.writelines(f"{sentence}\n" for sentence in clean_dialog_robust(dialog))

print(f"Success! Clean data saved to '{output_filename}'")

Loaded 13118 total conversations.
Processing and cleaning...


100%|██████████| 13118/13118 [00:00<00:00, 20212.93it/s]

Success! Clean data saved to 'final_training_data_refined.txt'



