In [3]:
import pandas as pd
import os
import time
import csv
import re

In [4]:
def fix_all_formats(input_filepath, output_filepath):
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    
    data = []
    
    with open(input_filepath, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    
    entries = re.split(r'\n(?=[^",\n]+,")', content)
    
    if len(entries) <= 1:
        entries = re.split(r'\n"(?=[^"]+","[^"])', content)
    
    for entry in entries:
        entry = entry.strip()
        if not entry or 'judul' in entry.lower():
            continue
        
        judul, konten = parse_entry(entry)
        
        if judul and konten:
            judul = clean_text(judul)
            konten = clean_text(konten)
            
            data.append({'judul': judul, 'konten': konten})
    
    df = pd.DataFrame(data)
    
    df.to_csv(output_filepath, index=False, encoding='utf-8')
    
    print(f"✅ Formatting selesai!")
    print(f"   Data diproses: {len(data)} baris")
    print(f"   File disimpan: {output_filepath}")
    
    if len(df) > 0:
        print(f"\n📋 Sample hasil:")
        print("=" * 80)
        for i in range(min(3, len(df))):
            print(f"Data {i+1}:")
            print(f"  Judul: {df['judul'].iloc[i][:80]}...")
            print(f"  Konten: {df['konten'].iloc[i][:80]}...")
            print()

def parse_entry(entry):
    entry = entry.strip()
    
    if re.match(r'^[^"]+,"[^"]+', entry):
        first_quote = entry.find('"')
        judul = entry[:first_quote-1].strip() if first_quote > 0 else ""
        konten = entry[first_quote:].strip()
        
        if konten.startswith('"') and konten.endswith('"'):
            konten = konten[1:-1]
        elif konten.startswith('"'):
            konten = konten[1:]
        elif konten.endswith('"'):
            konten = konten[:-1]
            
        return judul, konten

    elif re.match(r'^"[^"]+","[^"]+', entry):
        parts = entry.split('","', 1)
        if len(parts) == 2:
            judul = parts[0][1:].strip() if parts[0].startswith('"') else parts[0].strip()
            konten = parts[1][:-1].strip() if parts[1].endswith('"') else parts[1].strip()
            return judul, konten
    
    elif entry.startswith('"""') and entry.endswith('"""'):
        entry = entry[3:-3] 
        return parse_entry(entry)  
    
   
    elif ',"""' in entry and entry.endswith('"""'):
        parts = entry.split(',"""', 1)
        if len(parts) == 2:
            judul = parts[0].strip()
            konten = parts[1][:-3].strip() 
            return judul, konten
    
   
    elif ',' in entry:
        first_comma = entry.find(',')
        judul = entry[:first_comma].strip()
        konten = entry[first_comma+1:].strip()
        
       
        judul = judul.strip('"')
        konten = konten.strip('"')
        
        return judul, konten
    
    return None, None

def clean_text(text):
    if not text:
        return ""
    
   
    text = re.sub(r'"+', '"', text)
    
   
    text = text.strip()
    while text.startswith('"') and text.endswith('"'):
        text = text[1:-1].strip()
    
   
    text = text.replace("–", "-")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    
   
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

def process_complex_file(input_filepath, output_filepath):
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    
    data = []
    current_entry = []
    
    with open(input_filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            
            if not line or 'judul' in line.lower():
                continue
            
           
            if re.match(r'^[^",]+,"[^"]+', line) and current_entry:
                process_current_entry(current_entry, data)
                current_entry = []
            
            current_entry.append(line)
        
       
        if current_entry:
            process_current_entry(current_entry, data)
    
    
    df = pd.DataFrame(data)
    
   
    df.to_csv(output_filepath, index=False, encoding='utf-8')
    
    print(f"✅ Complex formatting selesai!")
    print(f"   Data diproses: {len(data)} baris")
    print(f"   File disimpan: {output_filepath}")

def process_current_entry(entry_lines, data):
  
    full_entry = ' '.join(entry_lines)
    judul, konten = parse_entry(full_entry)
    
    if judul and konten:
        judul = clean_text(judul)
        konten = clean_text(konten)
        data.append({'judul': judul, 'konten': konten})


input_filepath = os.path.join('datasets', 'etd_ugm.csv')
output_filepath = os.path.join('datasets_format', 'etd_ugm.csv')

print("Memulai proses formatting...")
print(f"  Input : {input_filepath}")
print(f"  Output: {output_filepath}")
try:
    fix_all_formats(input_filepath, output_filepath)
except Exception as e:
    print(f"❌ Method 1 failed: {e}")
    print("Trying complex method...")
    try:
        process_complex_file(input_filepath, output_filepath)
    except Exception as e2:
        print(f"❌ All methods failed: {e2}")

Memulai proses formatting...
  Input : datasets\etd_ugm.csv
  Output: datasets_format\etd_ugm.csv
✅ Formatting selesai!
   Data diproses: 7948 baris
   File disimpan: datasets_format\etd_ugm.csv

📋 Sample hasil:
Data 1:
  Judul: Peningkatan Penggunaan Serangan Drone dalam Perang Melawan Teror di Afghanistan ...
  Konten: Drone secara umum dapat diklasifikasikan sebagai semua kendaraan baik darat, lau...

Data 2:
  Judul: PREDIKSI HARGA APARTEMEN DENGAN MEMPERTIMBANGKAN FAKTOR SOCIO ECONOMIC DAN CRIME...
  Konten: Properti di Indonesia saat ini merupakan bentuk investasi yang cukup menjanjikan...

Data 3:
  Judul: EFEK PEMBERIAN PROGRAM PENURUNAN BERAT BADAN BERBASIS DIGITAL OBi-MOBi TERHADAP ...
  Konten: Latar Belakang : Obesitas merupakan salah satu masalah kesehatan yang prevalensi...

