### Upload

In [None]:
# Import the required modules
import argparse
from xml.dom import minidom
import html
import re
import os

!python -m nltk.downloader punkt
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# INPUT FILES
# Set the input directory
from google.colab import files

# Create the Input folder
if not os.path.exists('/content/Input'):
    os.makedirs('/content/Input')

# Create the Output folder
if not os.path.exists('/content/Output'):
    os.makedirs('/content/Output')

# Upload the files
uploaded_files = files.upload()

# Set the input directory
input_dir = '/content/Input'

# Iterate over the uploaded files
for file_name, data in uploaded_files.items():
    # Construct the input file path
    input_path = os.path.join(input_dir, file_name)
    
    # Write the uploaded file to the input directory
    with open(input_path, 'wb') as f:
        f.write(data)

Saving de_gsd-ud_CLEAN.conllu to de_gsd-ud_CLEAN.conllu
Saving de_hdt-ud_CLEAN.conllu to de_hdt-ud_CLEAN.conllu
Saving de_pud-ud_CLEAN.conllu to de_pud-ud_CLEAN.conllu


## Conllu to XML

### Let's see if I can replace the charaters


In [None]:
# Set the input and output directories
args = argparse.Namespace()
args.input_dir = '/content/Input'
args.output_dir = '/content/Output'

# Iterate over all the .conllu files in the input directory
for filename in os.listdir(args.input_dir):
    if not filename.endswith('.conllu'):
        continue
    
    # Construct the full file path
    file_path = os.path.join(args.input_dir, filename)

    # Open the file in read mode
    with open(file_path, 'r') as f:
        conllu_string = f.read()
        
        # Split the conllu string into a list of lines
        conllu_lines = conllu_string.split('\n')

        # Create the xml document and root element
        xmldoc = minidom.Document()
        root = xmldoc.createElement('trf')
        xmldoc.appendChild(root)

        # Iterate over the list of lines and extract the values from each line
        conllu_parsed = []
        sentence = []
        id = 1

        for line in conllu_lines:
            if line.startswith('#'):
                continue
            elif line == '':
                conllu_parsed.append(sentence)
                sentence = []
            else:
                columns = line.split('\t')
                form = columns[1]
                upos = columns[3]
                feats = columns[5]
                token = {'form': form, 'upos': upos, 'feats': feats}
                sentence.append(token)            

        # Iterate over the list of sentences and create the xml elements
        for i, sentence in enumerate(conllu_parsed):

          # Create the qitext element and set the id attribute
          qitext = xmldoc.createElement('qitext')
          qitext.setAttribute('id', str(id))

          # Create the plain element and add it to the qitext element
          plain = xmldoc.createElement('plain')
          plain_text = ' '.join([token['form'] for token in sentence])
          plain.appendChild(xmldoc.createTextNode(plain_text))
          qitext.appendChild(plain)

          # Set the l attribute of the qitext element to the length of the plain text
          qitext.setAttribute('l', str(len(plain_text)))

          # Parse the plain text into a list of sentences using the nlp function
          sentences = sent_tokenize(plain_text, language='french')

          # Iterate over the list of tokens and create the qitoken elements
          start = 0
          sentence_counter = 0    

          for j, token in enumerate(sentence):
            qitoken = xmldoc.createElement('qitoken')
            qitoken_text = f"{token['form']} {token['upos']}:{token['feats']}"
            qitoken_text = re.sub(r'&(?!amp;|lt;|gt;)', '&amp;', qitoken_text)
            qitoken_text = re.sub(r'<http', '&lt;http', qitoken_text)
            qitoken_text = re.sub(r'<', '&lt;', qitoken_text)
            qitoken_text = re.sub(r'>', '&gt;', qitoken_text)
            qitoken_text = re.sub(r'<(?=[a-zA-Z])', '&lt;', qitoken_text)
            qitoken_text = re.sub(r'>(?=[a-zA-Z])', '&gt;', qitoken_text)
            qitoken.appendChild(xmldoc.createTextNode(qitoken_text))
            qitoken.setAttribute('start', str(start))
            start += len(token['form']) + 1
            qitoken.setAttribute('end', str(start - 1))
            qitoken.setAttribute('sentence', str(sentence_counter + 1))
            qitext.appendChild(qitoken)
            
            
            # Increment the sentence counter if the current token is the last in a sentence
            if sentence_counter < len(sentences) and token['form'] == sentences[sentence_counter][-1]:
              sentence_counter += 1

          root.appendChild(qitext)

          # Increment the id counter
          id += 1

        # Construct the output file path
        output_filename = filename.replace('.conllu', '.xml')
        output_path = os.path.join(args.output_dir, output_filename)
    
        # Write the xml document to the output file
        with open(output_path, 'w') as f:
          f.write(html.unescape(xmldoc.toprettyxml()))

        # Print the number of xml elements created
        print(f'{len(root.getElementsByTagName("qitext"))} qitext elements created')

185380 qitext elements created


### This version (for simple cases)

In [None]:
# Set the input and output directories
args = argparse.Namespace()
args.input_dir = '/content/Input'
args.output_dir = '/content/Output'

# Iterate over all the .conllu files in the input directory
for filename in os.listdir(args.input_dir):
    if not filename.endswith('.conllu'):
        continue
    
    # Construct the full file path
    file_path = os.path.join(args.input_dir, filename)

    # Open the file in read mode
    with open(file_path, 'r') as f:
        conllu_string = f.read()
        
        # Split the conllu string into a list of lines
        conllu_lines = conllu_string.split('\n')

        # Create the xml document and root element
        xmldoc = minidom.Document()
        root = xmldoc.createElement('trf')
        xmldoc.appendChild(root)

        # Iterate over the list of lines and extract the values from each line
        conllu_parsed = []
        sentence = []
        id = 1

        for line in conllu_lines:
            if line.startswith('#'):
                continue
            elif line == '':
                conllu_parsed.append(sentence)
                sentence = []
            else:
                columns = line.split('\t')
                form = columns[1]
                upos = columns[3]
                feats = columns[5]
                token = {'form': form, 'upos': upos, 'feats': feats}
                sentence.append(token)            

        # Iterate over the list of sentences and create the xml elements
        for i, sentence in enumerate(conllu_parsed):

          # Create the qitext element and set the id attribute
          qitext = xmldoc.createElement('qitext')
          qitext.setAttribute('id', str(id))

          # Create the plain element and add it to the qitext element
          plain = xmldoc.createElement('plain')
          plain_text = ' '.join([token['form'] for token in sentence])
          plain.appendChild(xmldoc.createTextNode(plain_text))
          qitext.appendChild(plain)

          # Set the l attribute of the qitext element to the length of the plain text
          qitext.setAttribute('l', str(len(plain_text)))

          # Parse the plain text into a list of sentences using the nlp function
          sentences = sent_tokenize(plain_text, language='french')

          # Iterate over the list of tokens and create the qitoken elements
          start = 0
          sentence_counter = 0

          for j, token in enumerate(sentence):
            qitoken = xmldoc.createElement('qitoken')
            qitoken_text = f"{token['form']} {token['upos']}:{token['feats']}"
            qitoken.appendChild(xmldoc.createTextNode(qitoken_text))
            qitoken.setAttribute('start', str(start))
            start += len(token['form']) + 1
            qitoken.setAttribute('end', str(start - 1))
            qitoken.setAttribute('sentence', str(sentence_counter + 1))
            qitext.appendChild(qitoken)
            
            
            # Increment the sentence counter if the current token is the last in a sentence
            if sentence_counter < len(sentences) and token['form'] == sentences[sentence_counter][-1]:
              sentence_counter += 1

          root.appendChild(qitext)

          # Increment the id counter
          id += 1

        # Construct the output file path
        output_filename = filename.replace('.conllu', '.xml')
        output_path = os.path.join(args.output_dir, output_filename)
    
        # Write the xml document to the output file
        with open(output_path, 'w') as f:
          f.write(html.unescape(xmldoc.toprettyxml()))

        # Print the number of xml elements created
        print(f'{len(root.getElementsByTagName("qitext"))} qitext elements created')


999 qitext elements created
15559 qitext elements created
185380 qitext elements created


### OLD version

In [None]:
from html import escape
import argparse
import os
from xml.dom import minidom

# Set the input and output directories
args = argparse.Namespace()
args.input_dir = '/content/Input'
args.output_dir = '/content/Output'

# Iterate over all the .conllu files in the input directory
for filename in os.listdir(args.input_dir):
    if not filename.endswith('.conllu'):
        continue
    
    # Construct the full file path
    file_path = os.path.join(args.input_dir, filename)

    # Open the file in read mode
    with open(file_path, 'r') as f:
        conllu_string = f.read()
        
        # Split the conllu string into a list of lines
        conllu_lines = conllu_string.split('\n')

        # Create the xml document and root element
        xmldoc = minidom.Document()
        root = xmldoc.createElement('root')

        # Add the XML declaration to the document
        #xmldoc.appendChild(xmldoc.createProcessingInstruction("xml", "version='1.0' encoding='UTF-8'"))

        # Append the root element to the xml document
        xmldoc.appendChild(root)

        # Iterate over the list of lines and extract the values from each line
        conllu_parsed = []
        sentence = []

        for line in conllu_lines:
            if line.startswith('#'):
                continue
            elif line == '':
                conllu_parsed.append(sentence)
                sentence = []
            else:
                columns = line.split('\t')
                form = columns[1]
                upos = columns[3]
                feats = columns[5]
                token = {'form': form, 'upos': upos, 'feats': feats}
                sentence.append(token)

        # Iterate over the list of sentences and create the xml elements
        for i, sentence in enumerate(conllu_parsed):
            # Create the qitext element and set the id attribute
            qitext = xmldoc.createElement('qitext')

            # Create the plain element and add it to the qitext element
            plain = xmldoc.createElement('plain')
            plain_text = ' '.join([token['form'] for token in sentence])
            plain_text = escape(plain_text)  # Escape special characters in plain_text
            plain.appendChild(xmldoc.createTextNode(plain_text))
            qitext.appendChild(plain)

            # Parse the plain text into a list of sentences using the nlp function
            sentences = sent_tokenize(plain_text, language='french')

            for j, token in enumerate(sentence):
                qitoken = xmldoc.createElement('qitoken')
                qitoken_text = f"{token['form']} {token['upos']}:{token['feats']}"
                qitoken.appendChild(xmldoc.createTextNode(qitoken_text))
                qitext.appendChild(qitoken)
            root.appendChild(qitext)

        # Construct the output file path
        output_filename = filename.replace('.conllu', '.xml')
        output_path = os.path.join(args.output_dir, output_filename)
    
        # Write the xml document to the output file
        with open(output_path, 'w') as f:
            f.write(html.unescape(xmldoc.toprettyxml()))

        # Print the number of xml elements created
        print(f'{len(root.getElementsByTagName("qitext"))} qitext elements created')


999 qitext elements created
11800 qitext elements created


# DOWNLOAD

In [None]:
from google.colab import files
def download_xml_files():
    # Set the input and output directories
    input_dir = '/content/Output'

    # Iterate over the .conllu files in the input directory
    for filename in os.listdir(input_dir):
        if not filename.endswith('.trf'):
            continue

        # Construct the full file path
        file_path = os.path.join(input_dir, filename)

        # Download the file
        files.download(file_path)

download_xml_files()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# XML2TRF

In [None]:
#not correct
import os

# Set the input and output file paths
input_file = '/content/Output/fr_pud-ud_DONE.xml'
output_file = '/content/fr_pud-ud.xml'#?TRF

# Open the input file and read the contents into a variable
with open(input_file, 'r') as f:
    xml_data = f.read()

# Replace the existing XML declaration with the new one
xml_data = xml_data.replace('<?xml version="1.0" ?>', '<?xml version=\'1.0\' encoding=\'UTF8\' ?>\n<format>3.0</format>')

# Write the modified data to the output file
with open(output_file, 'w') as f:
    f.write(xml_data)

In [None]:
#supposed to work but gives errror
import codecs
from lxml import etree

# Set the input and output directories
input_dir = '/content/Input'
output_dir = '/content/Outpu'

# Iterate over all files in the input directory
for xml_file in os.listdir(input_dir):
  # Check if the file is an XML file
  if xml_file.endswith('.xml'):
    # Get the full file path
    xml_file_path = os.path.join(input_dir, xml_file)

    # Get the TRF file name from the XML file name
    trf_file = os.path.splitext(xml_file)[0] + ".trf"

    # Open the XML file and ignore errors
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_file, parser=parser)
    root = tree.getroot()

    # Open the TRF file
    trf = codecs.open(trf_file, "w", "utf-8")

    # Write the TRF header
    trf.write('<?xml version=\'1.0\' encoding=\'UTF8\' ?>\n')
    trf.write("<format>3.0</format>\n")

    id_counter = 1

    # Get the qitext elements
    qitext_elements = root.findall("./qitext")

    # Iterate over the qitext elements
    for qitext_element in qitext_elements:
      # Get the text for the qitext element
      text_element = qitext_element.find("plain")
    if text_element is not None:
      text = text_element.text
    else:
      text = ""
    
    # Get the length of the text
    length = len(text) if text is not None else 0
    
    # Write the qitext element
    trf.write("<qitext id='{}' l='{}'>\n".format(id_counter, length))
    trf.write("<plain>{}</plain>\n".format(text))
    
    # Get the qitoken elements
    qitoken_elements = qitext_element.findall("qitoken")

    for token in qitoken_elements:
      start = token.attrib['start']
      end = token.attrib['end']
      sentence = token.attrib['sentence']
      trf.write("<qitoken start='{}' end='{}' sentence='{}'>{} WORD</qitoken>\n".format(start, end, sentence, token.text))

    #for token in qitoken_elements:
      #trf.write("<qitoken>{} WORD</qitoken>\n".format(token.text))

    # Write the end of the qitext element
    trf.write("</qitext>\n")

    # Increment the ID counter
    id_counter += 1

    # Close the TRF file
    trf.close()
    
    # Write the TRF file 
    print("TRF file created: " + trf_file)

OSError: ignored

# This one

Issues:
* & not printed
* numbers

#### & case

In [None]:
import os
import re

input_directory = "/content/Input"
output_directory = "/content/Output"

for filename in os.listdir(input_directory):
    if filename.endswith(".xml"):
        input_file_path = os.path.join(input_directory, filename)
        output_file_path = os.path.join(output_directory, filename)

        with open(input_file_path, "r") as f:
            xml_str = f.read()

        xml_str = re.sub(r'&(?!(amp|lt|gt);)', '&amp;', xml_str)

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        with open(output_file_path, "w") as f:
            f.write(xml_str)

#with open("/content/Input/de_gsd-ud_CLEAN.xml", "r") as f:
#    xml_str = f.read()

#xml_str = re.sub(r'&(?!(amp|lt|gt);)', '&amp;', xml_str)

#with open("modified_file.xml", "w") as f:
#    f.write(xml_str)


##To trf

In [None]:
import os
import codecs
from lxml import etree

# Get the XML file name from the directory
xml_file = '/content/Output/de_hdt-ud_CLEAN.xml'

# Get the TRF file name from the XML file name
trf_file = os.path.splitext(xml_file)[0] + ".trf"

# Open the XML file and ignore errors
parser = etree.XMLParser(recover=True)
tree = etree.parse(xml_file, parser=parser)
root = tree.getroot()

# Open the TRF file
trf = codecs.open(trf_file, "w", "utf-8")

# Write the TRF header
trf.write('<?xml version=\'1.0\' encoding=\'UTF8\' ?>\n')
trf.write("<format>3.0</format>\n")

# Initialize the ID counter
id_counter = 1

# Get the qitext elements
qitext_elements = root.findall("./qitext")

# Iterate over the qitext elements
for qitext_element in qitext_elements:
    # Get the text for the qitext element
    text_element = qitext_element.find("plain")
    if text_element is not None:
        text = text_element.text
    else:
        text = ""
    
    # Get the length of the text
    length = len(text) if text is not None else 0
    
    # Write the qitext element
    trf.write("<qitext id='{}' l='{}'>\n".format(id_counter, length))
    trf.write("<plain>{}</plain>\n".format(text))
    
    # Get the qitoken elements
    qitoken_elements = qitext_element.findall("qitoken")

    for token in qitoken_elements:
      start = token.attrib['start']
      end = token.attrib['end']
      #sentence = '1' # added the sentence variable
      trf.write("<qitoken start='{}' end='{}' sentence='1'>{} WORD OTHER</qitoken>\n".format(start, end, token.text))
      #trf.write("<qitoken start='{}' end='{}'>{} WORD</qitoken>\n".format(start, end, token.text))

    # Write the end of the qitext element
    trf.write("</qitext>\n")

    # Increment the ID counter
    id_counter += 1

# Close the TRF file
trf.close()

# Write the TRF file 
print("TRF file created: " + trf_file)

TRF file created: /content/Output/de_hdt-ud_CLEAN.trf


#Let's parse it

In [None]:
import os
import re

# Specify the folder containing the TRF files
folder = "/content/Output"

# Use the os.listdir() function to find all TRF files in the specified folder
for filename in os.listdir(folder):
    if filename.endswith(".trf"):
        filepath = os.path.join(folder, filename)

        # Open the TRF file
        with open(filepath, "r") as f:
            trf_file = f.read()

        # Use regular expressions to find all instances of "qitoken" tags and remove ":_" from the 2nd argument
        trf_file = re.sub(r'qitoken(.*?) (\w+):_', r'qitoken\1 \2', trf_file)

        # Save the modified TRF file
        with open(filepath, "w") as f:
            f.write(trf_file)

# REDUCE THE SIZE


In [15]:
def delete_lines(file_path, start_id):
    new_lines = []
    with open(file_path, "r") as file:
        lines = file.readlines()
        for line in lines:
            if "qitext id" in line and int(line.split("'")[1]) < start_id:
                new_lines.append(line)
            elif "qitext id" in line and int(line.split("'")[1]) >= start_id:
                break
            else:
                new_lines.append(line)

    with open(file_path, "w") as file:
        file.writelines(new_lines)

delete_lines("/content/es_ancora-ud.trf",10001)