<a href="https://colab.research.google.com/github/niharikasingh3632/Mental-Health-Counseling-Summarization/blob/main/Processing_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
import os
import pandas as pd
import re

def preprocess_csv_files(input_path, output_path, train_file_path):
    os.makedirs(output_path, exist_ok=True)
    processed_file_names = []

    for filename in os.listdir(input_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(input_path, filename)

            try:
                df = pd.read_csv(file_path, encoding="utf-8")
                df.columns = df.columns.str.strip()

                required_columns = {"Utterance", "Type"}
                missing_columns = required_columns - set(df.columns)
                if missing_columns:
                    raise KeyError(f"Missing columns in {filename}: {', '.join(missing_columns)}")

                dialogue_df = df.iloc[:-3].copy()
                metadata_df = df.iloc[-3:].copy()

                def replace_I(row):
                    role = "Patient" if row["Type"] == "P" else "Therapist"
                    return re.sub(r'\bI\b', role, str(row["Utterance"]))

                dialogue_df["Utterance"] = dialogue_df.apply(replace_I, axis=1)

                processed_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)

                output_csv_path = os.path.join(output_path, filename)
                processed_df.to_csv(output_csv_path, index=False)

                # Save filename (without extension) to the list
                processed_file_names.append(os.path.splitext(filename)[0])
                print(f" Processed: {filename}")

            except Exception as e:
                raise RuntimeError(f"Failed to process file '{filename}': {e}")

    # Save processed file names to train.txt
    with open(train_file_path, "w") as f:
        for name in processed_file_names:
            f.write(name + "\n")

    print(f"\n All done! Saved {len(processed_file_names)} processed files.")
    print(f"File list written to: {train_file_path}")


In [7]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/train.txt"

preprocess_csv_files(input_path, output_path, train)


 Processed: 34.csv
 Processed: 48.csv
 Processed: 66.csv
 Processed: 107.csv
 Processed: 109.csv
 Processed: 110.csv
 Processed: 111.csv
 Processed: 1.csv
 Processed: 3.csv
 Processed: 5.csv
 Processed: 6.csv
 Processed: 7.csv
 Processed: 8.csv
 Processed: 10.csv
 Processed: 11.csv
 Processed: 12.csv
 Processed: 13.csv
 Processed: 14.csv
 Processed: 15.csv
 Processed: 17.csv
 Processed: 18.csv
 Processed: 20.csv
 Processed: 21.csv
 Processed: 22.csv
 Processed: 24.csv
 Processed: 25.csv
 Processed: 26.csv
 Processed: 27.csv
 Processed: 28.csv
 Processed: 29.csv
 Processed: 30.csv
 Processed: 32.csv
 Processed: 33.csv
 Processed: 35.csv
 Processed: 36.csv
 Processed: 37.csv
 Processed: 42.csv
 Processed: 43.csv
 Processed: 44.csv
 Processed: 45.csv
 Processed: 47.csv
 Processed: 49.csv
 Processed: 50.csv
 Processed: 53.csv
 Processed: 54.csv
 Processed: 55.csv
 Processed: 56.csv
 Processed: 57.csv
 Processed: 58.csv
 Processed: 60.csv
 Processed: 61.csv
 Processed: 63.csv
 Processed: 64

In [8]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/val.txt"

preprocess_csv_files(input_path, output_path, train)


 Processed: 16.csv
 Processed: 19.csv
 Processed: 40.csv
 Processed: 46.csv
 Processed: 69.csv
 Processed: 72.csv
 Processed: 75.csv
 Processed: 81.csv
 Processed: 102.csv
 Processed: 116.csv
 Processed: 71.csv
 Processed: 126.csv
 Processed: 127.csv
 Processed: 133.csv
 Processed: 148.csv
 Processed: 153.csv
 Processed: 154.csv
 Processed: 157.csv
 Processed: 198.csv
 Processed: 201.csv
 Processed: 214.csv

 All done! Saved 21 processed files.
File list written to: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/val.txt


In [9]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/test.txt"

preprocess_csv_files(input_path, output_path, train)


 Processed: 2.csv
 Processed: 4.csv
 Processed: 9.csv
 Processed: 23.csv
 Processed: 31.csv
 Processed: 38.csv
 Processed: 39.csv
 Processed: 41.csv
 Processed: 51.csv
 Processed: 52.csv
 Processed: 59.csv
 Processed: 79.csv
 Processed: 82.csv
 Processed: 86.csv
 Processed: 92.csv
 Processed: 97.csv
 Processed: 100.csv
 Processed: 104.csv
 Processed: 105.csv
 Processed: 108.csv
 Processed: 114.csv
 Processed: 124.csv
 Processed: 129.csv
 Processed: 132.csv
 Processed: 136.csv
 Processed: 137.csv
 Processed: 138.csv
 Processed: 142.csv
 Processed: 144.csv
 Processed: 145.csv
 Processed: 158.csv
 Processed: 159.csv
 Processed: 172.csv
 Processed: 176.csv
 Processed: 185.csv
 Processed: 186.csv
 Processed: 189.csv
 Processed: 191.csv
 Processed: 166.csv

 All done! Saved 39 processed files.
File list written to: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/test.txt
