In [None]:
import pandas as pd
import time
import re
import os

from sklearn.model_selection import train_test_split
from datasets import Dataset, load_from_disk

In [None]:
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and the listing is not recursive. Names are
    returned without the directory prefix, in the order ``os.listdir`` yields
    them.
    """
    return [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]

def extract_number(filename):
    """Return the first integer found in ``filename``, or ``None`` if there is none.

    A ``-`` immediately before the digits is treated as a minus sign, so
    ``'a-3b'`` yields ``-3``.
    """
    found = re.search(r'-?\d+', filename)
    if found is None:
        return None
    return int(found.group())

def extract_non_number(filename):
    """Return ``'_'`` when ``filename`` contains an underscore, otherwise ``None``."""
    found = re.search(r'_', filename)
    if found is None:
        return None
    return found.group()

def find_opening_closing(files):
    """Pick the three lowest-numbered files and the highest-numbered file.

    Each name is parsed with ``extract_number``; names without a number are
    ignored.

    Args:
        files: Iterable of file names.

    Returns:
        A 4-tuple ``(third_lowest, second_lowest, lowest, highest)`` of file
        names. Note the three "opening" files come back in descending order
        of their number.

    Raises:
        ValueError: If fewer than three names contain a number.
    """
    # Parse every name once and keep only the ones that carry a number.
    numbered_files = []
    for file in files:
        number = extract_number(file)
        if number is not None:
            numbered_files.append((number, file))

    if len(numbered_files) < 3:
        raise ValueError(
            f'find_opening_closing needs at least 3 numbered files, '
            f'found {len(numbered_files)}'
        )

    # max() returns the first maximal entry, exactly as before.
    closing_file = max(numbered_files, key=lambda x: x[0])[1]

    # sorted() is stable, so ties keep their input order -- the same result
    # the previous repeated min()/remove() sequence produced.
    lowest_three = sorted(numbered_files, key=lambda x: x[0])[:3]
    opening_file, opening_file_1, opening_file_2 = (name for _, name in lowest_three)

    return opening_file_2, opening_file_1, opening_file, closing_file

def find_body_struktur(files):
    """Return the first name in ``files`` for which ``extract_non_number`` matches.

    Raises ``IndexError`` when no name matches.
    """
    matching_names = [name for name in files if extract_non_number(name) is not None]
    return matching_names[0]

In [None]:
def read_file(file_path):
    """Read a text file and return its full contents.

    Leading and trailing whitespace is stripped from ``file_path`` first. A
    path that is empty after stripping yields ``None``. Any error raised while
    opening or reading is printed and then re-raised unchanged.
    """
    cleaned_path = file_path.strip()
    if not cleaned_path:
        return None

    try:
        with open(cleaned_path, 'r') as handle:
            contents = handle.read()
    except Exception as err:
        # Surface the failure in the notebook output, then let it propagate.
        print(err)
        raise
    return contents

def read_after_first_blank_line(file_content):
    """Return the non-blank lines that follow the first blank line.

    Everything up to and including the first blank (whitespace-only) line is
    discarded. Later blank lines are dropped as well, and the remaining lines
    are joined with ``'\\n'``. If there is no blank line the result is ``''``.
    """
    remaining = iter(file_content.splitlines())

    # Consume the header: every line up to and including the first blank one.
    for line in remaining:
        if line.strip() == "":
            break

    return '\n'.join(line for line in remaining if line.strip() != "")

In [None]:
def process_dataset(part, new_df, base_dir='../dataset-surface-info'):
    """Convert a DataFrame to a ``datasets.Dataset`` and save it to disk.

    The whole frame is saved to ``{base_dir}/{part}/{part}``.

    Args:
        part: Name used for both the sub-directory and the dataset directory.
        new_df: DataFrame whose rows become the dataset.
        base_dir: Root output directory. The default is the location this
            notebook has always written to.
    """
    target = f'{base_dir}/{part}/{part}'

    print(f'Start Creating Dataset {part}...')
    # The previous single-iteration loop sliced rows 0..len(new_df), i.e. the
    # entire frame, so the frame is passed through directly.
    dataset = Dataset.from_pandas(new_df)

    print(f'Start Saving Dataset {part}...')
    print(f'Saving at {target}')
    dataset.save_to_disk(target)

In [None]:
def read_df_excluded():
    """Load ``amandemen.csv`` (relative to the working directory) as a DataFrame."""
    csv_path = 'amandemen.csv'
    print(f'Start Reading Files {csv_path}...')
    return pd.read_csv(csv_path)

def process_df_excluded(df, col_name):
    """Build the exclusion lookup table from the column ``col_name``.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame holding the regulation ids to exclude.
        col_name: Name of the column in ``df`` that contains those ids.

    Returns:
        A DataFrame with two columns, ``reg_id`` (the original values) and
        ``reg_id_lower`` (the same values lower-cased), sharing ``df``'s index.
        Missing values stay missing in ``reg_id_lower``.
    """
    reg_ids = df[col_name]
    return pd.DataFrame({
        'reg_id': reg_ids,
        'reg_id_lower': reg_ids.str.lower(),
    })

In [None]:
def read_df():
    """Load ``core/regulatory_map_surface_info.csv`` (relative to the working directory)."""
    csv_path = 'core/regulatory_map_surface_info.csv'
    print(f'Start Reading Files {csv_path}...')
    return pd.read_csv(csv_path)

def process_df(df, excluded_df):
    """Label each regulation and drop the ones listed in ``excluded_df``.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame with at least the columns ``regulatory``, ``file_txt``
            and ``file_ttl``.
        excluded_df: DataFrame with a ``reg_id_lower`` column of lower-cased
            ids to remove.

    Returns:
        A DataFrame with the columns ``regulatory``, ``label``, ``file_txt``
        and ``file_ttl``. ``label`` is the part of ``regulatory`` before the
        first underscore. Rows whose lower-cased ``regulatory`` appears in
        ``excluded_df['reg_id_lower']`` are removed; the original index
        labels of the surviving rows are kept.
    """
    regulatory = df['regulatory']
    labelled = df.assign(label=regulatory.str.split('_').str[0])

    # Match case-insensitively against the exclusion list.
    is_excluded = regulatory.str.lower().isin(excluded_df['reg_id_lower'])

    return labelled.loc[~is_excluded, ['regulatory', 'label', 'file_txt', 'file_ttl']]

In [None]:
# Maps each supported ``part`` to the N of the ``new_N_turtle_files`` directory
# that its triples are read from.
_TURTLE_DIR_INDEX = {'opening': 3, 'closing': 4, 'body struktur': 5}


def _read_stripped(paths):
    """Read every file in a Series of paths and strip surrounding whitespace."""
    return paths.apply(lambda path: read_file(path).strip())


def create_dataset_separate_surface(part, df):
    """Assemble the text/triples frame for one part of each regulation.

    For every row the turtle file path is redirected to the directory for
    ``part`` and its content after the first blank line becomes ``triples``.
    The text columns are read from the per-regulation split-text folder
    derived from ``file_txt``.

    Args:
        part: One of ``'opening'``, ``'closing'`` or ``'body struktur'``.
        df: DataFrame with the columns ``regulatory``, ``label``,
            ``file_txt`` and ``file_ttl``. It is copied, not modified.

    Returns:
        A DataFrame with ``regulatory``, ``label``, ``triples`` and:
        ``text``, ``text_1``, ``text_2`` for ``'opening'``;
        ``text`` for ``'closing'``;
        ``text``, ``text_1`` for ``'body struktur'``.

    Raises:
        ValueError: If ``part`` is not one of the supported values.
    """
    if part not in _TURTLE_DIR_INDEX:
        raise ValueError(
            f'Unknown part {part!r}; expected one of {sorted(_TURTLE_DIR_INDEX)}'
        )
    idx = _TURTLE_DIR_INDEX[part]

    df = df.copy()

    df['file_ttl'] = df['file_ttl'].apply(lambda x: x.replace('new_2_turtle_files', f'new_{idx}_turtle_files'))
    df['triples'] = df['file_ttl'].apply(lambda x: read_after_first_blank_line(read_file(x)))

    # NOTE(review): split('.')[0] keeps only the text before the FIRST dot in
    # the whole path, so a path containing an earlier dot (e.g. a leading
    # './') would be cut there -- confirm the CSV paths have no other dots.
    df['folder_txt'] = df['file_txt'].apply(lambda x: x.replace('new_1_text_files', 'new_split_txt').split('.')[0])

    if part == 'body struktur':
        # Fixed file names are used here, so the numbered opening/closing
        # files are not looked up for this part.
        df['text'] = _read_stripped(df['folder_txt'] + '/' + '_.txt')
        # NOTE(review): '*.txt' is opened literally (no glob expansion), so a
        # file actually named '*.txt' must exist in the folder -- confirm.
        df['text_1'] = _read_stripped(df['folder_txt'] + '/' + '*.txt')
        return df[['regulatory', 'label', 'text', 'text_1', 'triples']]

    df[['opening', 'opening_1', 'opening_2', 'closing']] = df['folder_txt'].apply(
        lambda x: pd.Series(find_opening_closing(list_files_in_directory(x)))
    )

    if part == 'opening':
        # find_opening_closing returns (third-lowest, second-lowest, lowest,
        # highest), so 'opening' holds the third-lowest-numbered file and
        # 'opening_2' the lowest. NOTE(review): confirm this order is intended.
        df['text'] = _read_stripped(df['folder_txt'] + '/' + df['opening'])
        df['text_1'] = _read_stripped(df['folder_txt'] + '/' + df['opening_1'])
        df['text_2'] = _read_stripped(df['folder_txt'] + '/' + df['opening_2'])
        return df[['regulatory', 'label', 'text', 'text_1', 'text_2', 'triples']]

    # Only 'closing' remains: the highest-numbered file in the folder.
    df['text'] = _read_stripped(df['folder_txt'] + '/' + df['closing'])
    return df[['regulatory', 'label', 'text', 'triples']]

## Opening

In [None]:
# Load the exclusion list and reduce it to the id plus its lower-cased form.
df_excluded = process_df_excluded(read_df_excluded(), 'regulatory')
df_excluded

In [None]:
# Load the regulation map, add labels and drop the excluded regulations.
df = process_df(read_df(), df_excluded)
df

In [None]:
# Number of remaining regulations per label.
df['label'].value_counts()

In [None]:
# Build the opening frame with a fresh 0..n-1 index.
new_df = create_dataset_separate_surface('opening', df).reset_index(drop=True)
new_df

In [None]:
# Save the opening frame as a dataset under ../dataset-surface-info/new-opening/new-opening.
process_dataset('new-opening', new_df)

## Closing

In [None]:
# Build the closing frame with a fresh 0..n-1 index.
new_df_2 = create_dataset_separate_surface('closing', df).reset_index(drop=True)
new_df_2

In [None]:
# Save the closing frame as a dataset under ../dataset-surface-info/new-closing/new-closing.
process_dataset('new-closing', new_df_2)

## Body struktur

In [None]:
# Build the body/struktur frame with a fresh 0..n-1 index.
new_df_3 = create_dataset_separate_surface('body struktur', df).reset_index(drop=True)
new_df_3

In [None]:
# Save the body/struktur frame as a dataset under ../dataset-surface-info/new-body-struktur/new-body-struktur.
process_dataset('new-body-struktur', new_df_3)

## Chunk

In [None]:
from collections import Counter

# process_dataset() writes each part to ../dataset-surface-info/{part}/{part}.
# Load from those same locations so a fresh top-to-bottom run picks up the
# datasets this notebook just saved (the opening/closing paths previously
# pointed at a separate new-dataset/ directory that nothing here writes to).
opening_dataset_name = "../dataset-surface-info/new-opening/new-opening"
opening_dataset = load_from_disk(opening_dataset_name)
closing_dataset_name = "../dataset-surface-info/new-closing/new-closing"
closing_dataset = load_from_disk(closing_dataset_name)
body_st_dataset_name = "../dataset-surface-info/new-body-struktur/new-body-struktur"
body_st_dataset = load_from_disk(body_st_dataset_name)

In [None]:
def split_stratify(dataset, stratify_by_column, init=None):
    """Split ``dataset`` into two stratified halves.

    The dataset is first sorted by ``'regulatory'``, then split 50/50 with
    seed 42, stratified on ``stratify_by_column``. When ``init`` is not
    ``None`` the column is class-encoded before splitting.

    Returns:
        ``(train, test)`` as produced by ``train_test_split``.
    """
    ordered = dataset.sort('regulatory')
    if init is not None:
        ordered = ordered.class_encode_column(stratify_by_column)

    halves = ordered.train_test_split(test_size=0.5, seed=42, stratify_by_column=stratify_by_column)
    return halves['train'], halves['test']


def create_chunk(dataset, part, stratify_by=None, min_count=20):
    """Split ``dataset`` into eight stratified chunks plus a remainder and save them.

    Rows whose ``stratify_by`` value occurs at least ``min_count`` times are
    halved three times in a row with ``split_stratify`` (2 -> 4 -> 8 chunks).
    All other rows form a ninth, unsplit chunk. The chunks are saved through
    ``process_dataset_chunk`` with indices 1..9, the remainder last.

    Args:
        dataset: Dataset to split.
        part: Name passed on to ``process_dataset_chunk`` for each chunk.
        stratify_by: Column to stratify on. Required despite the ``None``
            default, which is kept only for signature compatibility.
        min_count: Minimum number of rows a label needs to take part in the
            stratified splits. NOTE(review): 20 is presumably chosen so every
            label survives three halvings -- confirm.

    Raises:
        ValueError: If ``stratify_by`` is ``None``.
    """
    if stratify_by is None:
        raise ValueError('create_chunk requires stratify_by (the column to stratify on)')

    label_counts = Counter(dataset[stratify_by])
    labels_to_keep = [label for label, count in label_counts.items() if count >= min_count]
    filtered_dataset = dataset.filter(lambda x: x[stratify_by] in labels_to_keep)
    c_filtered_dataset = dataset.filter(lambda x: x[stratify_by] not in labels_to_keep)

    # Halve every chunk once per round. Only the first round passes init=True,
    # which makes split_stratify class-encode the column before splitting.
    chunks = [filtered_dataset]
    for round_no in range(3):
        init = True if round_no == 0 else None
        halved = []
        for chunk in chunks:
            halved.extend(split_stratify(chunk, stratify_by, init))
        chunks = halved

    chunks.append(c_filtered_dataset)

    for position, chunk in enumerate(chunks, start=1):
        process_dataset_chunk(part, chunk, position)

In [None]:
def process_dataset_chunk(part, dataset, idx):
    """Save one chunk to ``../dataset-surface-info/{part}/{part}-{idx}``.

    Progress is printed before the save; nothing is returned.
    """
    target = f'../dataset-surface-info/{part}/{part}-{idx}'
    print(f'Start Saving Dataset {part} {idx}...')
    print(f'Saving at {target}')
    dataset.save_to_disk(target)

In [None]:
# Split the opening dataset by label and save the chunks as new-new-opening-1 .. new-new-opening-9.
create_chunk(opening_dataset, 'new-new-opening', stratify_by='label')

In [None]:
# Split the closing dataset by label and save the chunks as new-new-closing-1 .. new-new-closing-9.
create_chunk(closing_dataset, 'new-new-closing', stratify_by='label')

In [None]:
# Split the body/struktur dataset by label and save the chunks as new-new-body-struktur-1 .. new-new-body-struktur-9.
create_chunk(body_st_dataset, 'new-new-body-struktur', stratify_by='label')