In [5]:
import xml.etree.ElementTree as ET
from io import StringIO
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd

def extract_text(token):
    return token.find('orth').text

def extract_info_for_df(token):
    orth = token.find('orth').text
    base = token.find(".//lex/base").text
    ctag = token.find(".//lex/ctag").text
    return {'Orth': orth, 'Base': base, 'CTag': ctag}

def split_ctag(ctag):
    return ctag.split(':')

def process_text(train_data, window_size, prefix_len, suffix_len):
    x_train_data = []
    punct_list = [".", "?", "!"]

    for i in range(len(train_data)):
        prv = []
        nxt = []
        curr = train_data[i]["Orth"]
        count = -window_size

        for j in range(i - window_size, i + window_size + 1):
            if 0 <= j < len(train_data) and j != i:
                word = train_data[j]["Orth"]
                if word not in punct_list:
                    prv.append(word)

        while len(prv) < window_size:
            prv.insert(0, "_")
        while len(nxt) < window_size:
            nxt.append("_")

        prefix = curr[:prefix_len]
        suffix = curr[-suffix_len:]

        row = prv + [prefix] + [curr] + [suffix] + nxt
        x_train_data.append(row)

    column_names = [f"Column {i+1}" for i in range(2*window_size + 3)]
    df = pd.DataFrame(x_train_data, columns=column_names)
    return df


In [6]:
import pandas as pd
import xml.etree.ElementTree as ET
from io import StringIO
import gzip

# extract text from token
def extract_text(token):
    orth_text = token.find('orth').text
    return orth_text

# extract information for dataframe
def extract_info_for_df(token):
    orth = token.find('orth').text
    base = token.find(".//lex/base").text
    ctag = token.find(".//lex/ctag").text
    return {'Orth': orth, 'Base': base, 'CTag': ctag}

# split CTag
def split_ctag(ctag):
    ctag_parts = ctag.split(':')
    return ctag_parts

# process XML data
def process_xml(xml_file):
    with gzip.open(xml_file, 'rb') as f:
        xml_content = f.read()

    xml_content_decoded = xml_content.decode('utf-8')
    print(len(xml_content_decoded))

    tree = ET.parse(StringIO(xml_content_decoded))
    root = tree.getroot()
    train_data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            orth = tok.find('orth').text
            lex_base = tok.find(".//lex/base").text
            ctag = tok.find(".//lex/ctag").text
            train_data.append({"Orth": orth, "Lex_Base": lex_base, "Ctag": ctag})
            # print(f"Token: {orth}, Base: {lex_base}, CTag: {ctag}")

    corpus = []
    c = ""
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            text = extract_text(tok)
            corpus.append(text)
            c += text

    # Saving the corpus
    file_path = "../Code/corpus.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        for text in corpus:
            file.write(text + "\n")

    data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            data.append(extract_info_for_df(tok))

    df = pd.DataFrame(data)

    df['SplitCTag'] = df['CTag'].apply(split_ctag)

    split_ctags_flat = [tag for sublist in df['SplitCTag'] for tag in sublist]

    num_pos_labels = len(set(split_ctags_flat))

    print(f'There are {num_pos_labels} classes (POS labels).')

    labels = df['CTag']
    labels.to_csv("labels.csv", index=False)

    df.to_csv("dataframe.csv")
    return train_data

# process training data for different models
def process_training_data(train_data, window_size, output_file):
    if window_size == "baseline":
        train_df = baseline_train_df(train_data)
    elif window_size == 9:
        train_df = improved_9_train_df(train_data)
    elif window_size == 5:
        train_df = improved_5_train_df(train_data)
    elif window_size == 3:
        train_df = improved_3_train_df(train_data)
    else:
        print("Invalid window size.")
        return None

    train_df.to_csv(output_file, index=False)
    return train_df

# process XML data
xml_gz_file = "../Data/train.xml.gz"
train_data = process_xml(xml_gz_file)

# process training data for different models
baseline_train_df = process_training_data(train_data, "baseline", "X_train.csv")
improved_9_train_df = process_training_data(train_data, 9, "X_train_9.csv")
improved_5_train_df = process_training_data(train_data, 5, "X_train_5.csv")
improved_3_train_df = process_training_data(train_data, 3, "X_train_3.csv")


83596176
There are 71 classes (POS labels).


NameError: name 'baseline_train_df' is not defined

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from io import StringIO
import gzip

# extract text from token
def extract_text(token):
    orth_text = token.find('orth').text
    return orth_text

# extract information for dataframe
def extract_info_for_df(token):
    orth = token.find('orth').text
    base = token.find(".//lex/base").text
    ctag = token.find(".//lex/ctag").text
    return {'Orth': orth, 'Base': base, 'CTag': ctag}

# split CTag
def split_ctag(ctag):
    ctag_parts = ctag.split(':')
    return ctag_parts

# process XML data
def process_xml(xml_file):
    with gzip.open(xml_file, 'rb') as f:
        xml_content = f.read()

    xml_content_decoded = xml_content.decode('utf-8')
    print(len(xml_content_decoded))

    tree = ET.parse(StringIO(xml_content_decoded))
    root = tree.getroot()
    validation_data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            orth = tok.find('orth').text
            lex_base = tok.find(".//lex/base").text
            ctag = tok.find(".//lex/ctag").text
            validation_data.append({"Orth": orth, "Lex_Base": lex_base, "Ctag": ctag})

    val_corpus = []
    c = ""
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            text = extract_text(tok)
            val_corpus.append(text)
            c += text

    file_path = "../Code/validation_corpus.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        for text in val_corpus:
            file.write(text + "\n")

    data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            data.append(extract_info_for_df(tok))

    df = pd.DataFrame(data)

    df['SplitCTag'] = df['CTag'].apply(split_ctag)

    split_ctags_flat = [tag for sublist in df['SplitCTag'] for tag in sublist]

    num_pos_labels = len(set(split_ctags_flat))

    print(f'There are {num_pos_labels} classes (POS labels).')

    labels = df['CTag']
    labels.to_csv("validation_labels.csv", index=False)

    df.to_csv("validation_dataframe.csv")
    return validation_data

# Function to process validation data for different models
def process_validation_data(validation_data, window_size, output_file):
    if window_size == "baseline":
        val_df = baseline_train_df(validation_data)
    elif window_size == 9:
        val_df = improved_9_train_df(validation_data)
    elif window_size == 5:
        val_df = improved_5_train_df(validation_data)
    elif window_size == 3:
        val_df = improved_3_train_df(validation_data)
    else:
        print("Invalid window size.")
        return None

    val_df.to_csv(output_file, index=False)
    return val_df

# Process XML data
xml_gz_file = "../Data/validate.xml.gz"
validation_data = process_xml(xml_gz_file)

# Process validation data for different models
baseline_val_df = process_validation_data(validation_data, "baseline", "validation.csv")
improved_9_val_df = process_validation_data(validation_data, 9, "validation_9.csv")
improved_5_val_df = process_validation_data(validation_data, 5, "validation_5.csv")
improved_3_val_df = process_validation_data(validation_data, 3, "validation_3.csv")


28082182
There are 71 classes (POS labels).


In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from io import StringIO
import gzip

# Function to extract text from token
def extract_text(token):
    orth_text = token.find('orth').text
    return orth_text

# Function to extract information for dataframe
def extract_info_for_df(token):
    orth = token.find('orth').text
    base = token.find(".//lex/base").text
    ctag = token.find(".//lex/ctag").text
    return {'Orth': orth, 'Base': base, 'CTag': ctag}

# Function to split CTag
def split_ctag(ctag):
    ctag_parts = ctag.split(':')
    return ctag_parts

# Function to process XML data
def process_xml(xml_file):
    with gzip.open(xml_file, 'rb') as f:
        xml_content = f.read()

    xml_content_decoded = xml_content.decode('utf-8')
    print(len(xml_content_decoded))

    tree = ET.parse(StringIO(xml_content_decoded))
    root = tree.getroot()
    test_data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            orth = tok.find('orth').text
            lex_base = tok.find(".//lex/base").text
            ctag = tok.find(".//lex/ctag").text
            test_data.append({"Orth": orth, "Lex_Base": lex_base, "Ctag": ctag})

    test_corpus = []
    c = ""
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            text = extract_text(tok)
            test_corpus.append(text)
            c += text

    file_path = "../Code/test_corpus.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        for text in test_corpus:
            file.write(text + "\n")

    data = []
    for chunk in root.findall('.//chunk'):
        for tok in chunk.findall('.//tok'):
            data.append(extract_info_for_df(tok))

    df = pd.DataFrame(data)

    df['SplitCTag'] = df['CTag'].apply(split_ctag)

    split_ctags_flat = [tag for sublist in df['SplitCTag'] for tag in sublist]

    num_pos_labels = len(set(split_ctags_flat))

    print(f'There are {num_pos_labels} classes (POS labels).')

    labels = df['CTag']
    labels.to_csv("test_labels.csv", index=False)

    df.to_csv("test_dataframe.csv")
    return test_data

# Function to process test data for different models
def process_test_data(test_data, window_size, output_file):
    if window_size == "baseline":
        test_df = baseline_train_df(test_data)
    elif window_size == 9:
        test_df = improved_9_train_df(test_data)
    elif window_size == 5:
        test_df = improved_5_train_df(test_data)
    elif window_size == 3:
        test_df = improved_3_train_df(test_data)
    else:
        print("Invalid window size.")
        return None

    test_df.to_csv(output_file, index=False)
    return test_df

# Process XML data
xml_gz_file = "../Data/test-1-1.xml.gz"
test_data = process_xml(xml_gz_file)

# Process test data for different models
baseline_test_df = process_test_data(test_data, "baseline", "test.csv")
improved_9_test_df = process_test_data(test_data, 9, "test_9.csv")
improved_5_test_df = process_test_data(test_data, 5, "test_5.csv")
improved_3_test_df = process_test_data(test_data, 3, "test_3.csv")


27879758
There are 71 classes (POS labels).
