# In [9]:
import pandas as pd
import os
import numpy as np

_NULL = 'Null'
_EXPLICIT_COLUMN = 'Explicit Opinion phrase \n(adj)'
_IMPLICIT_COLUMN = 'Implicit Opinion'
_REQUIRED_COLUMNS = ('Review', 'Aspect', 'Category', 'Sentiment towards aspect')


def _opinion_phrase(row):
    """Return the explicit opinion phrase of *row*, else the implicit one."""
    explicit = row.get(_EXPLICIT_COLUMN, _NULL)
    # Missing cells have already been replaced by the truthy string 'Null',
    # so a plain `explicit or implicit` would never reach the implicit column.
    if explicit and explicit != _NULL:
        return explicit
    return row.get(_IMPLICIT_COLUMN, _NULL)


def _unique_annotations(group):
    """Return the distinct annotation lists of *group* in first-seen order."""
    rows = (
        (
            row['Aspect'],
            row['Category'],
            row['Sentiment towards aspect'],
            _opinion_phrase(row),
        )
        for _, row in group.iterrows()
    )
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # make the order of annotations vary from run to run.
    return [list(annotation) for annotation in dict.fromkeys(rows)]


def transform_excel_to_text_and_split(excel_path, output_folder):
    """Convert an annotated Excel sheet to text lines and split them 70/10/20.

    The workbook is read with the ``openpyxl`` engine (``pip install
    openpyxl``) and missing cells are replaced by the string ``'Null'``.
    Rows are grouped by the ``Review`` column; each group becomes one line of
    the form ``<review>####<annotations>`` where ``<annotations>`` is the
    ``repr`` of a list of de-duplicated
    ``[aspect, category, sentiment, opinion]`` lists.  ``opinion`` is the
    explicit opinion phrase when present, otherwise the implicit opinion.

    The lines are shuffled with a fixed seed (42) and written to
    ``train.txt`` (70%), ``dev.txt`` (10%) and ``test.txt`` (the remainder)
    inside ``output_folder``, which is created if necessary.

    Args:
        excel_path: Path of the Excel workbook to read.
        output_folder: Directory that receives the three split files.

    Raises:
        KeyError: If a required column is absent from the workbook.
    """
    excel_data = pd.read_excel(excel_path, engine='openpyxl')

    missing = [name for name in _REQUIRED_COLUMNS if name not in excel_data.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {excel_path!r}: {missing}")

    excel_data = excel_data.fillna(_NULL)

    transformed_data = [
        f"{review}####{_unique_annotations(group)}"
        for review, group in excel_data.groupby('Review')
    ]

    os.makedirs(output_folder, exist_ok=True)

    # A private RandomState seeded with 42 produces the same permutation as
    # seeding the global generator, without touching global RNG state.
    rng = np.random.RandomState(42)
    shuffled = rng.permutation(transformed_data).tolist()

    total_length = len(shuffled)
    train_end = int(total_length * 0.7)
    dev_end = train_end + int(total_length * 0.1)

    splits = {
        'train.txt': shuffled[:train_end],
        'dev.txt': shuffled[train_end:dev_end],
        'test.txt': shuffled[dev_end:],
    }
    for filename, items in splits.items():
        filepath = os.path.join(output_folder, filename)
        with open(filepath, 'w', encoding='utf-8') as handle:
            handle.writelines(f"{item}\n" for item in items)

# Demo invocation: read the annotated workbook and write the train/dev/test
# text files into the target directory.
excel_path = r'C:\Users\s4658894\Proj2Exp\Convert data\Data.xlsx'
output_folder = r'C:\Users\s4658894\Proj2Exp\Convert data\MyData'
transform_excel_to_text_and_split(
    excel_path=excel_path,
    output_folder=output_folder,
)


# Captured cell output: FileNotFoundError: [Errno 2] No such file or directory: 'P4.xlsx'