[![Open in Colab][colab-badge]][extractor-notebook]

<!-- links -->
[colab-badge]: https://colab.research.google.com/assets/colab-badge.svg
[extractor-notebook]: https://colab.research.google.com/github/patricia-ternes/LMS_annotations_extractor/extractor.ipynb

## Variables Definition

Please fill in the necessary information below.

Note: 
- all the values should be placed between **quotation marks**
- talker_ID must have **2 digits**
- session_ID always **starts with S**
- talker_gender can be **F**, **M** or **N**

In [9]:
# --- Recording metadata (edit these three values for every upload) ---
talker_ID = "02"  # two-digit talker identifier
session_ID = "S1"  # session label, always starts with "S"
talker_gender = "M"  # one of "F", "M" or "N"

# --- Corpus layout ---
corpus = "LMS"

# Task code used in the output file names -> task name matched against the
# "task_name" column of the input file. Keeping both in one table guarantees
# that the two lists below stay aligned.
_TASKS = {
    "WOR": "word",
    "SNO": "sentence_normal",
    "SFA": "sentence_fast",
    "SCL": "sentence_clear",
}

task_IDs = list(_TASKS)
task_names = list(_TASKS.values())

In [None]:
# Colab-only module; this cell is meant to be run inside Google Colab
from google.colab import files
# Ask the user to upload the input file. The result is later indexed by file
# name and its value is read as raw bytes (see the main code cell).
uploaded = files.upload()

*Note: Uploaded files are stored temporarily in the `/content/` folder*

## Main Code

In [10]:
import pandas as pd
import os
import io
import shutil

# `uploaded` maps each uploaded file name to its content; only the first
# uploaded file is processed.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell first.")
input_name = next(iter(uploaded))  # automatically extract input name

# The two task lists are consumed pairwise, so they must line up
if len(task_IDs) != len(task_names):
    raise ValueError("task_IDs and task_names must have the same length.")

# Temporary folder to store the outputs. exist_ok=True makes re-running the
# cell safe while still surfacing real failures (e.g. permission errors).
output_path = "/content/processed/"
os.makedirs(output_path, exist_ok=True)

# Load necessary columns from input csv and store as pandas.DataFrame
columns = ["task_name", "item_text"]
data = pd.read_csv(io.BytesIO(uploaded[input_name]), usecols=columns)

# Repeat the following for every task
for task_ID, task_name in zip(task_IDs, task_names):
    # Select the text of the rows belonging to this task and drop empty rows
    # (the first row of every task has no text)
    task_text = data.loc[data["task_name"] == task_name, "item_text"].dropna()
    task_text = task_text.reset_index(drop=True)

    # Prefix every item with its 1-based position: "1. <text>", "2. <text>", ...
    task_lines = (task_text.index + 1).astype(str) + ". " + task_text

    # Define output name, based on the variables:
    # <corpus>_<task_ID>_<talker_ID>_<talker_gender>_<session_ID>.txt
    output_name = "_".join(
        [corpus, task_ID, talker_ID, talker_gender, session_ID + ".txt"]
    )

    # Define output path + name
    output = os.path.join(output_path, output_name)

    # Temporarily save the output, one item per line.
    # quoting=3 is csv.QUOTE_NONE: fields are never wrapped in quotes, and
    # characters that would otherwise require quoting (such as the tab
    # separator) are written with the "#" escape character instead.
    task_lines.to_csv(
        output, sep="\t", header=False, index=False, quoting=3, escapechar="#"
    )

    # Download the output
    files.download(output)

In [None]:
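# Optional cleanup: uncomment the lines below to delete the uploaded file
# and the temporary output folder.
# os.remove(input_name)
# shutil.rmtree(output_path)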