In [48]:
import xml.etree.ElementTree as ET
import os

# Function to parse moves XML file
def parse_moves_xml(xml_file):
    """Read a moves annotation file into a dict keyed by move id.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        dict mapping each ``<move>`` element's ``id`` attribute to a record
        with keys ``id``, ``label`` (``''`` when absent), ``text`` (stripped
        element text, ``''`` when absent) and ``children`` (list of
        referenced ids).
    """
    nite_ns = {'nite': 'http://nite.sourceforge.net/'}
    moves = {}
    for move in ET.parse(xml_file).getroot().findall('.//move'):
        referenced = []
        for link in move.findall('.//nite:child', namespaces=nite_ns):
            # Keep only the fragment after '#', e.g. "id(x)" or "id(x)..id(y)".
            target = link.attrib['href'].split('#')[-1]
            if '..' in target:
                first, last = target.split('..')
                # [3:-1] drops the leading "id(" and the trailing ")";
                # only the two endpoints of a range are recorded.
                referenced.extend((first[3:-1], last[3:-1]))
            else:
                referenced.append(target[3:-1])
        move_id = move.attrib.get('id')
        raw_text = move.text
        moves[move_id] = {
            'id': move_id,
            'label': move.attrib.get('label', ''),
            'text': raw_text.strip() if raw_text else '',
            'children': referenced,
        }
    return moves

def parse_transactions_xml(xml_file):
    """Read a transactions annotation file into a list of records.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        list with one dict per ``<transaction>`` element, in document order,
        holding ``type``, ``id``, ``map_start_point``, ``map_end_point``
        (each ``None`` when the attribute is absent) and ``children`` (list
        of referenced ids).
    """
    ns = {'nite': 'http://nite.sourceforge.net/'}
    root = ET.parse(xml_file).getroot()

    transactions = []
    for node in root.findall('.//transaction'):
        refs = []
        for link in node.findall('.//nite:child', namespaces=ns):
            pointer = link.attrib['href'].split('#')[-1]
            if '..' not in pointer:
                # Single reference "id(x)": strip the "id(" / ")" wrapper.
                refs.append(pointer[3:-1])
                continue
            # Range reference "id(x)..id(y)": record both endpoints only.
            range_start, range_end = pointer.split('..')
            refs += [range_start[3:-1], range_end[3:-1]]
        transactions.append({
            'type': node.get('type'),
            'id': node.get('id'),
            'map_start_point': node.get('map_start_point'),
            'map_end_point': node.get('map_end_point'),
            'children': refs,
        })
    return transactions


# Function to parse timed_units XML file
def parse_timed_units_xml(xml_file):
    """Read a timed-units annotation file into a dict keyed by unit id.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        dict mapping each ``<tu>`` element's ``id`` attribute to
        ``{'id': ..., 'text': ...}`` where ``text`` is the stripped element
        text (``''`` when the element has no text).
    """
    units = ET.parse(xml_file).getroot().findall('.//tu')
    # A later element with a repeated id replaces the earlier one.
    return {
        unit.attrib.get('id'): {
            'id': unit.attrib.get('id'),
            'text': (unit.text or '').strip(),
        }
        for unit in units
    }

In [49]:

# Input files for conversation q1ec1 (moves and timed units are the ".f." files)
moves_xml_path = "/home/pastav/TA/maptaskv2-1/Data/moves/q1ec1.f.moves.xml"
timed_units_xml_path = "/home/pastav/TA/maptaskv2-1/Data/timed-units/q1ec1.f.timed-units.xml"
transactions_xml_path = "/home/pastav/TA/maptaskv2-1/Data/transactions/q1ec1.transactions.xml"

# Parse the three annotation layers
moves = parse_moves_xml(moves_xml_path)
timed_units = parse_timed_units_xml(timed_units_xml_path)
transactions = parse_transactions_xml(transactions_xml_path)

# Show what was parsed (timed units are left out of the printout)
print("moves:",moves)
print("transactions:",transactions)

moves: {'q1ec1.f.move.3': {'id': 'q1ec1.f.move.3', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.2']}, 'q1ec1.f.move.5': {'id': 'q1ec1.f.move.5', 'label': 'check', 'text': '', 'children': ['q1ec1f.8', 'q1ec1f.14']}, 'q1ec1.f.move.8': {'id': 'q1ec1.f.move.8', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.17', 'q1ec1f.19']}, 'q1ec1.f.move.11': {'id': 'q1ec1.f.move.11', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.25', 'q1ec1f.27']}, 'q1ec1.f.move.13': {'id': 'q1ec1.f.move.13', 'label': 'check', 'text': '', 'children': ['q1ec1f.29', 'q1ec1f.32']}, 'q1ec1.f.move.18': {'id': 'q1ec1.f.move.18', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.37']}, 'q1ec1.f.move.21': {'id': 'q1ec1.f.move.21', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.47']}, 'q1ec1.f.move.23': {'id': 'q1ec1.f.move.23', 'label': 'reply_y', 'text': '', 'children': ['q1ec1f.50', 'q1ec1f.52']}, 'q1ec1.f.move.25': {'id': 'q1ec1.f.move.25', 'label': 'acknowledge', 'text': ''

In [50]:

# Connect transactions to moves and moves to timed units.
# Every move referenced by a transaction gets a 'timed_units' list holding the
# timed-unit records whose ids appear in the move's 'children'.
for transaction in transactions:
    print(transaction)
    for move_id in transaction['children']:
        print("move-id:",move_id)
        # Ids that are not present in `moves` come back as None and are skipped.
        move = moves.get(move_id)
        print(move)
        if move:
            move['timed_units'] = [timed_units[child_id] for child_id in move['children'] if child_id in timed_units]

# Print the connected data.
for transaction_idx, transaction in enumerate(transactions, 1):
    print(f"Transaction {transaction_idx}:")
    # Iterate the referenced move ids; iterating the transaction dict itself
    # would yield its keys ('type', 'id', ...) and never match a move.
    for move_id in transaction['children']:
        move = moves.get(move_id)
        if move:
            print(f"Move ID: {move['id']}, Label: {move['label']}, Text: {move['text']}")
            print("Timed Units:")
            # .get() keeps this loop usable even if the linking loop above was not run.
            for tu in move.get('timed_units', []):
                print(f"- ID: {tu['id']}, Text: {tu['text']}")

{'type': 'normal', 'id': 'q1ec1.transaction.1', 'map_start_point': '1', 'map_end_point': '2', 'children': ['q1ec1.g.move.1', 'q1ec1.g.move.2', 'q1ec1.f.move.3', 'q1ec1.g.move.3.5', 'q1ec1.f.move.4.5', 'q1ec1.g.move.4', 'q1ec1.g.move.5.5', 'q1ec1.f.move.5', 'q1ec1.f.move.7.5', 'q1ec1.g.move.6', 'q1ec1.g.move.8.5', 'q1ec1.f.move.8']}
move-id: q1ec1.g.move.1
None
move-id: q1ec1.g.move.2
None
move-id: q1ec1.f.move.3
{'id': 'q1ec1.f.move.3', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.2']}
move-id: q1ec1.g.move.3.5
None
move-id: q1ec1.f.move.4.5
None
move-id: q1ec1.g.move.4
None
move-id: q1ec1.g.move.5.5
None
move-id: q1ec1.f.move.5
{'id': 'q1ec1.f.move.5', 'label': 'check', 'text': '', 'children': ['q1ec1f.8', 'q1ec1f.14']}
move-id: q1ec1.f.move.7.5
None
move-id: q1ec1.g.move.6
None
move-id: q1ec1.g.move.8.5
None
move-id: q1ec1.f.move.8
{'id': 'q1ec1.f.move.8', 'label': 'acknowledge', 'text': '', 'children': ['q1ec1f.17', 'q1ec1f.19']}
{'type': 'normal', 'id': 'q1ec1.transacti

In [130]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Convert every timed-units XML file into a CSV with one row per child element
# of the document root.

# Define the columns for the DataFrame
cols = ["tu_id", "start", "end", "type", "utterance", "text"]

# Directory containing XML files
xml_dir = "/home/pastav/TA/maptaskv2-1/Data/timed-units/"

# Directory to save CSV files
csv_path = "/home/pastav/TA/csv_dataset/timed_units/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(csv_path, exist_ok=True)

# Iterate over XML files in the directory (sorted for a reproducible order)
for xml_file in sorted(os.listdir(xml_dir)):
    if not xml_file.endswith(".xml"):
        continue

    # Parse the XML file
    tree = ET.parse(os.path.join(xml_dir, xml_file))
    root = tree.getroot()

    rows = []
    for child in root:
        # Values are read straight into the row dict; the original bound a
        # module-level variable called `type`, which shadowed the builtin for
        # every cell executed afterwards.
        rows.append({
            "tu_id": child.attrib["id"],
            "start": child.attrib["start"],
            "end": child.attrib["end"],
            "type": child.attrib.get("type", ""),
            "utterance": child.attrib.get("utt", ""),
            "text": child.text.strip() if child.text else "",
        })

    # Create DataFrame
    df = pd.DataFrame(rows, columns=cols)

    # Save DataFrame to CSV, reusing the XML file's base name
    csv_file_name = os.path.splitext(xml_file)[0] + ".csv"
    output_csv = os.path.join(csv_path, csv_file_name)
    df.to_csv(output_csv, index=False)

    print(f"CSV file '{output_csv}' has been created.")


CSV file '/home/pastav/TA/csv_dataset/timed_units/q2nc2.g.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q7nc4.g.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q3nc2.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q7nc8.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q2nc4.g.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q7ec4.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q4nc8.g.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q6nc8.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q3nc4.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q3ec8.f.timed-units.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/timed_units/q7ec6.g.timed-units.

In [111]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Convert every moves XML file into a CSV with one row per child element of
# the document root.

# Define the columns for the DataFrame
cols = ["move_id", "label", "rep", "children"]

# Directory containing XML files
xml_dir = "/home/pastav/TA/maptaskv2-1/Data/moves/"

# Directory to save CSV files
csv_path = "/home/pastav/TA/csv_dataset/moves/"

# Iterate over XML files in the directory
for xml_file in os.listdir(xml_dir):
    if not xml_file.endswith(".xml"):
        continue

    # Parse the XML file
    tree = ET.parse(os.path.join(xml_dir, xml_file))
    root = tree.getroot()

    rows = []
    for child in root:
        move_id = child.attrib["id"]
        label = child.attrib.get("label", "")
        rep = child.attrib.get("rep", "none")
        # Collect the fragment after '#' of every nite:child reference.
        children = []
        for c in child.findall(".//nite:child", namespaces={"nite": "http://nite.sourceforge.net/"}):
            children.append(c.attrib["href"].split("#")[-1])

        rows.append({"move_id": move_id, "label": label, "rep":rep, "children": ", ".join(children)})

    # Create DataFrame
    df = pd.DataFrame(rows, columns=cols)

    # Save DataFrame to CSV, reusing the XML file's base name
    csv_file_name = os.path.splitext(xml_file)[0] + ".csv"
    output_csv = os.path.join(csv_path, csv_file_name)
    df.to_csv(output_csv, index=False)

    print(f"CSV file '{output_csv}' has been created.")


CSV file '/home/pastav/TA/csv_dataset/moves/q4ec4.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q3nc7.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q4ec1.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q3ec6.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q3nc8.g.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q7ec3.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q2nc6.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q8nc2.g.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q6nc1.g.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q1ec3.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q3ec5.f.moves.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/moves/q7nc6.f.moves.csv' has been created.
CSV file '/home/pastav/TA/cs

In [57]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Convert every transactions XML file into a CSV with one row per
# <transaction> element directly under the document root.

# Define the columns for the DataFrame
cols = ["transaction_id", "type", "map_start_point", "map_end_point", "children"]

# Directory containing XML files
xml_dir = "/home/pastav/TA/maptaskv2-1/Data/transactions/"

# Directory to save CSV files
csv_path = "/home/pastav/TA/csv_dataset/transactions/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(csv_path, exist_ok=True)

# Iterate over XML files in the directory (sorted for a reproducible order)
for xml_file in sorted(os.listdir(xml_dir)):
    if not xml_file.endswith(".xml"):
        continue

    # Parse the XML file
    tree = ET.parse(os.path.join(xml_dir, xml_file))
    root = tree.getroot()

    rows = []
    for child in root:
        if child.tag != "transaction":
            continue
        # Fragment after '#' of every nite:child reference, e.g. "id(x)..id(y)".
        children = [
            c.attrib["href"].split("#")[-1]
            for c in child.findall(".//nite:child", namespaces={"nite": "http://nite.sourceforge.net/"})
        ]
        # Values go straight into the row dict; the original bound a
        # module-level variable called `type`, which shadowed the builtin for
        # every cell executed afterwards.
        rows.append({
            "transaction_id": child.attrib["id"],
            "type": child.attrib["type"],
            "map_start_point": child.attrib.get("map_start_point", ""),
            "map_end_point": child.attrib.get("map_end_point", ""),
            "children": ", ".join(children),
        })

    # Create DataFrame
    df = pd.DataFrame(rows, columns=cols)

    # Save DataFrame to CSV, reusing the XML file's base name
    csv_file_name = os.path.splitext(xml_file)[0] + ".csv"
    output_csv = os.path.join(csv_path, csv_file_name)
    df.to_csv(output_csv, index=False)

    print(f"CSV file '{output_csv}' has been created.")


CSV file '/home/pastav/TA/csv_dataset/transactions/q7nc5.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q5nc4.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q1nc4.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q5ec1.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q6ec3.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q2nc7.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q1nc2.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q5ec3.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q4nc1.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q4nc3.transactions.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/transactions/q1nc6.transactions.

In [106]:
import pandas as pd

# Join the q1ec1 transactions with the labels/children of the moves they
# reference and write the result to final_data.csv.

# Read the transactions CSV file. fillna("") turns a missing 'children' cell
# into an empty string so it can still be split below.
transactions_df = pd.read_csv("/home/pastav/TA/csv_dataset/transactions/q1ec1.transactions.csv").fillna("")

# Read the moves CSV file (only the ".g." file is loaded, so references to
# moves stored in other files will not be found).
moves_df = pd.read_csv("/home/pastav/TA/csv_dataset/moves/q1ec1.g.moves.csv").fillna("")


def _lookup_move(move_id):
    """Return ``(label, children)`` for ``move_id`` from ``moves_df``, or ``None`` if absent."""
    match = moves_df[moves_df['move_id'] == move_id]
    if match.empty:
        return None
    return str(match['label'].iloc[0]), str(match['children'].iloc[0])


combined_data = []

# Loop over the children column of the transactions DataFrame
for children_str in transactions_df['children']:
    labels = []
    move_children = []
    for child in (part.strip() for part in children_str.split(',')):
        if not child:
            continue
        print(child)
        if '..' in child:
            # Range reference "id(x)..id(y)": resolve both endpoints.
            # The original code used start_data/end_data here without ever
            # defining them, which raises NameError on a fresh kernel.
            start, end = child.split('..')
            move_ids = [start[3:-1], end[3:-1]]
        else:
            # Single reference "id(x)"; [3:-1] strips the "id(" / ")" wrapper.
            move_ids = [child[3:-1]]

        for move_id in move_ids:
            found = _lookup_move(move_id)
            if found is None:
                continue
            labels.append(found[0])
            move_children.append(found[1])

    # Keep every resolved move instead of letting each one overwrite the
    # previous one; " | " is used because a move's children may contain commas.
    # NOTE(review): 'children' duplicates a column name already present in
    # transactions_df, so final_df ends up with two 'children' columns.
    combined_data.append({
        'label': " | ".join(labels),
        'children': " | ".join(move_children),
    })

# Create a new DataFrame with the combined data
combined_df = pd.DataFrame(combined_data, columns=['label', 'children'])

# Place the combined columns next to the transactions columns (row-aligned)
final_df = pd.concat([transactions_df, combined_df], axis=1)

# Save the final DataFrame
csv_path = "/home/pastav/TA/csv_dataset/final_data.csv"
final_df.to_csv(csv_path, index=False)



id(q1ec1.g.move.1)..id(q1ec1.g.move.2)
id(q1ec1.f.move.3)
id(q1ec1.g.move.3.5)
id(q1ec1.f.move.4.5)
id(q1ec1.g.move.4)..id(q1ec1.g.move.5.5)
id(q1ec1.f.move.5)..id(q1ec1.f.move.7.5)
id(q1ec1.g.move.6)..id(q1ec1.g.move.8.5)
id(q1ec1.f.move.8)
id(q1ec1.g.move.9)
id(q1ec1.f.move.10.5)
id(q1ec1.g.move.9.5)..id(q1ec1.g.move.11.5)
id(q1ec1.f.move.11)..id(q1ec1.f.move.12.5)
id(q1ec1.g.move.12)..id(q1ec1.g.move.13.5)
id(q1ec1.f.move.13)
id(q1ec1.g.move.14)..id(q1ec1.g.move.16)
id(q1ec1.g.move.17)..id(q1ec1.g.move.18.5)
id(q1ec1.f.move.18)..id(q1ec1.f.move.20.5)
id(q1ec1.g.move.19)..id(q1ec1.g.move.21.5)
id(q1ec1.f.move.21)..id(q1ec1.f.move.22.5)
id(q1ec1.g.move.22)
id(q1ec1.f.move.23)
id(q1ec1.g.move.23.5)
id(q1ec1.f.move.24.5)
id(q1ec1.g.move.24)
id(q1ec1.f.move.25)
id(q1ec1.g.move.26)..id(q1ec1.g.move.29.5)
id(q1ec1.f.move.27)..id(q1ec1.f.move.28.5)
id(q1ec1.g.move.30)..id(q1ec1.g.move.31.5)
id(q1ec1.f.move.29)..id(q1ec1.f.move.31)
id(q1ec1.g.move.32)
id(q1ec1.g.move.33)..id(q1ec1.g.move.34.

In [None]:
import pandas as pd
import re
import os
# Build one CSV per conversation with a row for every move referenced by a
# transaction: move label, concatenated timed-unit text and time span, the
# conversation-level devscore fields, and each speaker's dominant gaze type.
directory = "/home/pastav/TA/csv_dataset/transactions/"
transaction_files = [file for file in os.listdir(directory) if file.endswith(".csv")]

save_path = "/home/pastav/TA/dataset/"
devscore_df = pd.read_csv("/home/pastav/TA/devscore.csv")


def _read_gaze_csv(path):
    """Load a gaze CSV; return an empty frame with the gaze columns if the file is missing."""
    try:
        return pd.read_csv(path).fillna("")
    except FileNotFoundError:
        return pd.DataFrame(columns=['start', 'end', 'id', 'type'])


def _load_conversation_tables(conversation):
    """Read the moves, timed-unit and gaze CSVs of both speakers ('g' and 'f') once."""
    tables = {}
    for role in ("g", "f"):
        moves = pd.read_csv("/home/pastav/TA/csv_dataset/moves/" + conversation + "." + role + ".moves.csv").fillna("")
        tables[role] = {
            "moves": moves,
            "move_order": list(moves['move_id']),
            "timed_units": pd.read_csv("/home/pastav/TA/csv_dataset/timed_units/" + conversation + "." + role + ".timed-units.csv").fillna(""),
            "gaze": _read_gaze_csv("/home/pastav/TA/csv_dataset/gaze/" + conversation + "." + role + ".gaze.csv"),
        }
    return tables


def _speaker_role(move_id):
    """Return the second dot-separated field of a move id (e.g. 'g' or 'f'), '' if absent."""
    parts = move_id.split('.')
    return parts[1] if len(parts) > 1 else ""


def _expand_move_ids(children_cell, tables):
    """List the move ids referenced by a transaction's 'children' cell.

    A range "id(a)..id(b)" is expanded to every move from a through b in the
    row order of that speaker's moves CSV; if an endpoint cannot be located
    only the endpoints are returned.
    NOTE(review): assumes a range denotes all moves between its endpoints in
    file order - confirm against the corpus documentation.
    """
    move_ids = []
    for ref in str(children_cell).split(','):
        endpoints = re.findall(r"id\((.*?)\)", ref)
        if '..' in ref and len(endpoints) == 2:
            first, last = endpoints
            order = tables.get(_speaker_role(first), {}).get("move_order", [])
            if first in order and last in order:
                lo, hi = order.index(first), order.index(last)
                if lo <= hi:
                    move_ids.extend(order[lo:hi + 1])
                    continue
        move_ids.extend(endpoints)
    return move_ids


def _expand_timed_unit_ids(move_children):
    """List the timed-unit ids referenced by a move's 'children' cell.

    A range "id(p.3)..id(p.6)" becomes p.3, p.4, p.5, p.6: the prefix is the
    text before the first '.', the bounds are the integers after the last '.'.
    """
    tu_ids = []
    for ref in str(move_children).split(','):
        ref = ref.strip()
        if not ref:
            continue
        if '..' in ref:
            start, end = ref.split('..')
            prefix = start[3:-1].split('.')[0]
            first_num = int(start[3:-1].split('.')[-1])
            last_num = int(end[3:-1].split('.')[-1])
            tu_ids.extend(prefix + "." + str(i) for i in range(first_num, last_num + 1))
        else:
            # [3:-1] strips the "id(" / ")" wrapper.
            tu_ids.append(ref[3:-1])
    return tu_ids


def _collect_timed_units(move_children, timed_units_df):
    """Gather text, last unit type and the start/end span of a move's timed units.

    Returns ``(texts, unit_type, start, end)``; ``start``/``end`` stay ``None``
    when no referenced unit exists, and ``unit_type`` is ``''`` when the last
    referenced unit is missing (or there are none).
    """
    texts = []
    unit_type = ""
    unit_start = None
    unit_end = None
    for tu_id in _expand_timed_unit_ids(move_children):
        matches = timed_units_df[timed_units_df['tu_id'] == tu_id]
        if matches.empty:
            unit_type = ""
            continue
        unit = matches.iloc[0]
        unit_type = unit['type']
        # str() guards against cells that read_csv parsed as numbers.
        texts.append(str(unit['text']))
        if unit_start is None:
            unit_start = unit['start']
        unit_end = unit['end']
    return texts, unit_type, unit_start, unit_end


def _dominant_gaze_type(gaze_df, window_start, window_end):
    """Return the gaze type with the largest summed duration among looks overlapping the window.

    Durations are summed over whole looks (not clipped to the window). The
    stored type has its first three and last characters removed and only the
    part after the last '_' is returned; '' when no look has positive duration.
    """
    overlapping = gaze_df[
        ((gaze_df['start'] >= window_start) & (gaze_df['start'] <= window_end))
        | ((gaze_df['end'] >= window_start) & (gaze_df['end'] <= window_end))
        | ((gaze_df['start'] <= window_start) & (gaze_df['end'] >= window_end))
    ]
    best_type = None
    best_duration = 0
    for gaze_type in overlapping['type'].unique():
        of_type = overlapping[overlapping['type'] == gaze_type]
        total_duration = of_type['end'].sum() - of_type['start'].sum()
        if total_duration > best_duration:
            best_duration = total_duration
            best_type = gaze_type
    if best_type is None:
        return ""
    return best_type[3:-1].split("_")[-1]


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)

for file in transaction_files:
    file_path = os.path.join(directory, file)
    # Default output name from the file itself, so a transactions file without
    # rows cannot reuse (and overwrite) the previous conversation's name.
    file_name = file.split('.')[0]
    combined_data = []
    transactions_df = pd.read_csv(file_path).fillna("")
    # Per-conversation inputs are loaded once instead of once per transaction row.
    tables_by_conversation = {}

    for _, transaction_row in transactions_df.iterrows():
        transaction_id = transaction_row['transaction_id']
        file_name = transaction_id.split('.')[0]
        transaction_type = transaction_row['type']
        map_start_point = transaction_row['map_start_point']
        map_end_point = transaction_row['map_end_point']

        if file_name not in tables_by_conversation:
            tables_by_conversation[file_name] = _load_conversation_tables(file_name)
        tables = tables_by_conversation[file_name]

        for child in _expand_move_ids(transaction_row['children'], tables):
            role = _speaker_role(child)
            if role not in tables:
                # Skip instead of carrying on with the previous move's frames.
                print("incorrect child in transaction")
                continue
            moves_df = tables[role]["moves"]
            timed_units_df = tables[role]["timed_units"]

            move_row = moves_df[moves_df['move_id'] == child]
            if move_row.empty:
                print("move not found in moves file:", child)
                continue
            move_id = move_row['move_id'].iloc[0]
            move_label = move_row['label'].iloc[0]
            move_children = move_row['children'].iloc[0]

            children_texts, child_type, timed_unit_start, timed_unit_end = _collect_timed_units(
                move_children, timed_units_df
            )

            # Dominant gaze of each speaker during the move's time span
            if timed_unit_start is not None and timed_unit_end is not None:
                timed_unit_start = float(timed_unit_start)
                timed_unit_end = float(timed_unit_end)
                max_gaze_type_f = _dominant_gaze_type(tables["f"]["gaze"], timed_unit_start, timed_unit_end)
                max_gaze_type_g = _dominant_gaze_type(tables["g"]["gaze"], timed_unit_start, timed_unit_end)
            else:
                # No timed unit found: there is no span to match gaze against.
                max_gaze_type_f = ""
                max_gaze_type_g = ""

            # Drop empty/whitespace-only units before joining the text
            children_texts = [text for text in children_texts if text.strip()]
            combined_text = " ".join(children_texts)

            # Conversation-level fields; .iloc[0] raises IndexError when the
            # conversation id is not present in devscore.csv.
            tofind_devscore = move_id.split('.')[0]
            devscore_row = devscore_df[devscore_df['id'] == tofind_devscore]
            conversation_id = devscore_row['conversation'].iloc[0]
            eyecontact = devscore_row['eyecontact'].iloc[0]
            familiar = devscore_row['familiar'].iloc[0]
            map_value = devscore_row['map'].iloc[0]  # not named `map`: that would shadow the builtin
            quad = devscore_row['quad'].iloc[0]
            devscore_value = devscore_row['devscore'].iloc[0]

            combined_data.append({
                'conversation_id': conversation_id,
                'transaction_id': transaction_id,
                'transaction_type': transaction_type,
                'move_id': move_id,
                'label': move_label,
                'map_start_point': map_start_point,
                'map_end_point': map_end_point,
                'timed_units': combined_text,
                'timed_unit_type': child_type,
                'timed_unit_start': timed_unit_start,  # Add start time
                'timed_unit_end': timed_unit_end,  # Add end time
                'eyecontact': eyecontact,
                'familiar': familiar,
                'map': map_value,
                'quad': quad,
                'devscore': devscore_value,
                'f_gaze_type': max_gaze_type_f,
                'g_gaze_type': max_gaze_type_g
            })

    combined_df = pd.DataFrame(combined_data)

    combined_df.to_csv(os.path.join(save_path, file_name + ".csv"), index=False)
    print("saved: ",file_name)


# Convert the list of dictionaries to a DataFrame
# combined_df = pd.DataFrame(combined_data)


# Write the DataFrame to a CSV file
# combined_df.to_csv('combined_moves_transactions.csv', index=False)



saved:  q2ec1
saved:  q5nc3
saved:  q7ec1
saved:  q6nc4
saved:  q3nc4
saved:  q2nc7
saved:  q3nc1
saved:  q5ec6
saved:  q7ec3
saved:  q3nc5
saved:  q8ec8
saved:  q8ec3
saved:  q1nc5
saved:  q7ec4
saved:  q1ec5
saved:  q5nc4
saved:  q5nc7
saved:  q7nc5
saved:  q6nc8
saved:  q6ec1
saved:  q7ec5
saved:  q1ec7
saved:  q4ec8
saved:  q5ec5
saved:  q6ec5
saved:  q1ec3
saved:  q8ec6
saved:  q5ec7
saved:  q4ec5
saved:  q4ec1
saved:  q1ec6
saved:  q3ec8
saved:  q2ec4
saved:  q3ec3
saved:  q6nc2
saved:  q2nc4
saved:  q2nc6
saved:  q2nc2
saved:  q5nc5
saved:  q2ec7
saved:  q4nc8
saved:  q6nc1
saved:  q7ec8
saved:  q5ec3
saved:  q7nc7
saved:  q6nc3
saved:  q3ec6
saved:  q1nc4
saved:  q8nc1
saved:  q6ec7
saved:  q7nc6
saved:  q6ec6
saved:  q6nc7
saved:  q1nc2
saved:  q8ec5
saved:  q8nc2
saved:  q7nc8
saved:  q2nc1
saved:  q4ec4
saved:  q4ec3
saved:  q8nc6
saved:  q2ec6
saved:  q1ec2
saved:  q8nc7
saved:  q5ec1
saved:  q6nc6
saved:  q6ec3


In [149]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Export selected attributes of every child element of the corpus resource
# file's root into devscore.csv.

#Define the columns for the DataFrame
cols = ["conversation", "eyecontact", "familiar", "id", "map", "quad", "devscore"]
# Directory to save CSV files
csv_file_path = "/home/pastav/TA/csv_dataset/"
file_path = '/home/pastav/TA/maptaskv2-1/Data/corpus-resources/maptask-corpus.xml'  # Update with your file path

tree = ET.parse(os.path.join(file_path))
root = tree.getroot()

# One record per child element; attributes that are absent become "".
rows = [
    {col: child.attrib.get(col, "") for col in cols}
    for child in root
]

# Create DataFrame
df = pd.DataFrame(rows, columns=cols)

# The output goes to a path relative to the working directory; this replaces
# the directory assigned to csv_file_path above.
csv_file_path = 'devscore.csv'  # Update with desired file path
df.to_csv(csv_file_path, index=False)



In [63]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Convert every gaze XML file into a CSV with one row per <look> element.

# Define the columns for the DataFrame
cols = ["start", "end", "id", "type"]

# Directory containing XML files
xml_dir = "/home/pastav/TA/maptaskv2-1/Data/gaze/"

# Directory to save CSV files
csv_path = "/home/pastav/TA/csv_dataset/gaze/"

# Namespace mapping
namespace = {"nite": "http://nite.sourceforge.net/"}

# Iterate over XML files in the directory
for xml_file in os.listdir(xml_dir):
    if not xml_file.endswith(".xml"):
        continue

    # Parse the XML file
    tree = ET.parse(os.path.join(xml_dir, xml_file))
    root = tree.getroot()

    rows = []
    for look in root.findall(".//look"):
        look_id = look.attrib["id"]
        start = look.attrib["start"]
        end = look.attrib["end"]
        # The look's type is the fragment after '#' of its role="type"
        # pointer; looks without such a pointer get an empty type.
        type_elem = look.find(".//nite:pointer[@role='type']", namespace)
        if type_elem is None:
            type_value = ""
        else:
            type_value = type_elem.attrib["href"].split("#")[-1]

        rows.append({"start": start, "end": end, "id": look_id, "type": type_value})

    # Create DataFrame
    df = pd.DataFrame(rows, columns=cols)

    # Save DataFrame to CSV, reusing the XML file's base name
    csv_file_name = os.path.splitext(xml_file)[0] + ".csv"
    output_csv = os.path.join(csv_path, csv_file_name)
    df.to_csv(output_csv, index=False)

    print(f"CSV file '{output_csv}' has been created.")


CSV file '/home/pastav/TA/csv_dataset/gaze/q3nc4.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q4ec1.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q8ec1.f.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q3nc6.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q8ec2.f.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q3ec5.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q7ec1.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q8ec4.f.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q3nc6.f.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q8ec4.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q3ec8.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q3ec7.g.gaze.csv' has been created.
CSV file '/home/pastav/TA/csv_dataset/gaze/q4ec5.g.g