In [1]:
import re
import os
import json

In [1]:
def find_columns(data):
    """Return the column names declared on the first line of ``data``.

    Leading/trailing whitespace is stripped from ``data`` first.  The first
    remaining line is split on ``=`` and the segment right after the first
    ``=`` is split on whitespace to produce the list of names.

    Args:
        data: Full text of an annotated file.

    Returns:
        A list of column-name strings.

    Raises:
        IndexError: If the first line contains no ``=``.
    """
    header, *_ = data.strip().split('\n')
    declared = header.split('=')[1]
    # str.split() without arguments already ignores surrounding whitespace.
    return declared.split()

def parse_metadata(data, filename):
    """Extract the document-level header fields from an annotated file.

    After stripping ``data``, lines 1 through 7 (0-based) are each expected
    to have the form ``<key> = <value>``.  They are read positionally as
    document id, eurovoc descriptors, title, date, doctype, url and language.

    Args:
        data: Full text of an annotated file.
        filename: Name stored verbatim under the ``'filename'`` key.

    Returns:
        A dict with keys ``document_id``, ``eurovoc`` (list of
        whitespace-separated tokens), ``title``, ``date``, ``doctype``,
        ``url``, ``language`` and ``filename``; or ``-1`` when the input has
        fewer than the 8 lines needed to read all header fields.

    Raises:
        IndexError: If one of the header lines contains no ``=``.
    """
    lines = data.strip().split('\n')

    # Indices 1..7 are read below, so at least 8 lines are required.
    if len(lines) < 8:
        return -1

    def value_of(line):
        # Split on the first '=' only, so values that themselves contain '='
        # (e.g. URLs with query strings) are kept intact.
        return line.split('=', 1)[1].strip()

    return {
        'document_id': value_of(lines[1]),
        'eurovoc': value_of(lines[2]).split(),
        'title': value_of(lines[3]),
        'date': value_of(lines[4]),
        'doctype': value_of(lines[5]),
        'url': value_of(lines[6]),
        'language': value_of(lines[7]),
        'filename': filename,
    }

def parse_data_file(data, columns):
    """Split an annotated file into sentences with per-token annotations.

    The stripped text is cut at every ``# sent_id = ...`` / ``# text = ``
    header pair; whatever precedes the first pair is discarded.  For each
    sentence, the remainder of the ``# text = `` line is the sentence text
    and each following line with at least 10 tab-separated fields is one
    token (only the first 10 fields are kept).

    Sentences whose text is shorter than 10 characters, or that yield fewer
    than 5 tokens, are dropped.

    Args:
        data: Full text of an annotated file.
        columns: Accepted but not used by this function.

    Returns:
        A list of dicts, each with ``'text'`` (str) and ``'metadata_text'``
        (list of token dicts keyed by ID, FORM, LEMMA, UPOS, XPOS, FEATS,
        HEAD, DEPREL, DEPS, MISC).
    """
    field_names = ("ID", "FORM", "LEMMA", "UPOS", "XPOS",
                   "FEATS", "HEAD", "DEPREL", "DEPS", "MISC")

    # Element 0 of the split is the part before the first sentence header.
    chunks = re.split(r'# sent_id = .*?\n# text = ', data.strip())[1:]

    parsed = []
    for chunk in chunks:
        text, *token_lines = chunk.split('\n')
        if len(text) < 10:
            continue

        token_rows = []
        for raw in token_lines:
            fields = raw.strip().split('\t')
            if len(fields) >= 10:
                # zip stops after the 10 named fields; extras are ignored.
                token_rows.append(dict(zip(field_names, fields)))

        if len(token_rows) >= 5:
            parsed.append({'text': text, 'metadata_text': token_rows})
    return parsed

In [12]:
# Read the column declaration once from a reference file; the result is passed
# to parse_data_file for every document below.
file_name = 'ro-annotated/mj_00000G000BRZ93RU63D3NY0STH13Q23L.conllup'
with open(file_name, "r", encoding="utf-8") as file:
    data = file.read()
columns = find_columns(data)

directory = 'ro-annotated/'
output_directory = 'ro-parsed-single'
# Create the output directory up front so the first write cannot fail on it.
os.makedirs(output_directory, exist_ok=True)

# Not used below; retained in case other cells still reference it -- TODO confirm and remove.
parsed_data = []

i = 0
output_file = None  # stays None when no document passes the filters
# os.listdir returns entries in arbitrary order; sorting makes the
# document -> output-number mapping reproducible across runs.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)

    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    metadata = parse_metadata(data, filename)

    # parse_metadata signals an incomplete header with -1.
    if metadata == -1:
        continue

    # Skip documents whose date field, read as an integer, is before 2007.
    if int(metadata["date"]) < 2007:
        continue

    parsed_text = parse_data_file(data, columns)

    # One JSONL file per accepted document, numbered in processing order.
    i += 1
    output_file = f'{output_directory}/{i}.jsonl'
    with open(output_file, "w", encoding="utf-8") as json_file:
        for example_parsed in parsed_text:
            json.dump(example_parsed, json_file, ensure_ascii=False)
            json_file.write('\n')

if output_file is None:
    print("No documents passed the filters; nothing was written.")
else:
    print("Parsed data saved to", output_file)

Parsed data saved to ro-parsed-single/104180.jsonl


In [9]:
import os
import pandas as pd
from io import StringIO
import pyarrow as pa
import pyarrow.parquet as pq

def jsonl_files_to_dataframe(directory_path, output_dir='./ro-parsed-parquet/', batch_size=1000):
    """Convert a directory of JSONL files into numbered Parquet files.

    Files are read in sorted name order.  Every line of every file is parsed
    with ``pd.read_json(..., lines=True)``.  After each ``batch_size`` files
    the accumulated rows are concatenated and written to
    ``<output_dir>/<n>.parquet`` (``n`` starting at 1).  Files left over after
    the last full batch are written as a final, smaller batch, so no input is
    dropped.

    Args:
        directory_path: Directory containing the JSONL files.
        output_dir: Directory that receives the Parquet files; created if it
            does not exist.
        batch_size: Number of input files per Parquet file (positive integer).

    Returns:
        None.  The results are written to disk only.
    """
    os.makedirs(output_dir, exist_ok=True)

    def write_batch(frames, batch_number):
        # pd.concat raises on an empty list, so skip batches with no rows.
        if not frames:
            return
        df = pd.concat(frames, ignore_index=True)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, os.path.join(output_dir, f'{batch_number}.parquet'))

    frames = []
    files_in_batch = 0
    batch_number = 0
    # Sorted for a reproducible file -> batch assignment (os.listdir order is arbitrary).
    for file_name in sorted(os.listdir(directory_path)):
        jsonl_file_path = os.path.join(directory_path, file_name)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
        if not os.path.isfile(jsonl_file_path):
            continue

        with open(jsonl_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                frames.append(pd.read_json(StringIO(line), lines=True))

        files_in_batch += 1
        if files_in_batch == batch_size:
            batch_number += 1
            write_batch(frames, batch_number)
            frames = []
            files_in_batch = 0

    # Flush the trailing partial batch; without this, up to batch_size - 1
    # files at the end of the listing would never reach disk.
    if files_in_batch:
        batch_number += 1
        write_batch(frames, batch_number)

# Run the JSONL-to-Parquet conversion over the files in ./ro-parsed-single/.
directory_path = './ro-parsed-single/'
jsonl_files_to_dataframe(directory_path)