# Prepare Šolar corpus link data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.logging import get_logger
from solar_prepare_data import read_solar, SOLAR_DIRECTORY


In [None]:
# Module-level logger for this notebook, created with the project's get_logger
# helper under the name "Prepare Solar Corpus Link Data"; save_solar_link uses
# it to report that the csv file has been written.
solar_link_logger = get_logger("Prepare Solar Corpus Link Data")


In [None]:
def get_solar_link(data_xml=None):
    """
    Returns the links between source and target data of solar corpus as a pandas
    data frame.

    Every link element contributes one row. Its "target" attribute is split on
    whitespace; references containing "s." are collected in the source column
    and references containing "t." in the target column. Within a column the
    ids are joined with ";" and the first character of every reference (the
    "#" marker) is dropped.

    @param data_xml: optional, already loaded result of read_solar(); when None
        (default) the corpus is read with read_solar(). Only data_xml[2] is
        used: an iterable of link groups, each an iterable of elements that
        expose get("type") and get("target").
    @return: data frame with links between source and target data of solar
        corpus, with columns "type", "source" and "target"
    """
    # Read solar corpus data only when the caller did not supply it
    if data_xml is None:
        data_xml = read_solar()

    rows = []
    for link_group in data_xml[2]:
        for link in link_group:
            references = link.get("target").split()
            # Remove # from solar word id
            source_ids = ";".join(
                reference[1:] for reference in references if "s." in reference
            )
            target_ids = ";".join(
                reference[1:] for reference in references if "t." in reference
            )
            rows.append([link.get("type"), source_ids, target_ids])

    return pd.DataFrame(rows, columns=["type", "source", "target"])


In [None]:
def save_solar_link():
    """
    Writes the solar corpus links between source and target to a csv file.

    The data frame returned by get_solar_link() is stored at the path formed by
    appending "solar_link.csv" to SOLAR_DIRECTORY, and an info message is
    logged afterwards.
    """
    links = get_solar_link()
    csv_path = SOLAR_DIRECTORY + "solar_link.csv"

    # index=False: the row index of the data frame is not written to the file
    links.to_csv(csv_path, index=False)

    solar_link_logger.info("Solar links saved to csv file")
