<a href="https://colab.research.google.com/github/niharikasingh3632/Mental-Health-Counseling-Summarization/blob/main/Processing_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab runtime at /content/drive; the paths used
# by the processing cells further down all live under this mount point.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [12]:
!pip install spacy inflect
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [18]:
import os
import pandas as pd
import spacy
import inflect

# Shared NLP resources, built once at module level:
#   inflect_engine - an inflect engine instance
#   nlp            - the spaCy "en_core_web_sm" pipeline (tokens + POS tags)
inflect_engine = inflect.engine()
nlp = spacy.load("en_core_web_sm")

# Irregular present-tense forms, keyed by lower-cased token text. The clitic
# entries cover the pieces spaCy's tokenizer splits off contractions such as
# "I'm" / "I've" (with either a straight or a curly apostrophe).
_THIRD_PERSON_IRREGULAR = {
    "am": "is",
    "'m": "is",
    "\u2019m": "is",
    "have": "has",
    "'ve": "has",
    "\u2019ve": "has",
}


def _third_person_singular(verb):
    """Return the third-person-singular present form of a first-person verb."""
    lowered = verb.lower()
    irregular = _THIRD_PERSON_IRREGULAR.get(lowered)
    if irregular is not None:
        return irregular
    if not lowered.isalpha():
        # Unknown clitic or a mis-tagged non-word: safer to leave it untouched.
        return verb
    if lowered.endswith(("s", "sh", "ch", "x", "z", "o")):
        return verb + "es"  # miss -> misses, wish -> wishes, do -> does
    if len(lowered) > 1 and lowered.endswith("y") and lowered[-2] not in "aeiou":
        return verb[:-1] + "ies"  # try -> tries (but say -> says)
    return verb + "s"


def adjust_utterance(text, role):
    """Rewrite a first-person utterance so that it names the speaker's role.

    Every token "I" (case-insensitive) is replaced by ``role``. When the token
    that directly follows it is tagged ``VBP`` (non-3rd-person present verb),
    that verb is re-conjugated to agree with the singular role noun, e.g.
    "I'm tired" -> "Patient is tired", "I don't know" -> "Patient doesn't know".

    The result is rebuilt from each token's own trailing whitespace, so every
    part of the utterance that is not rewritten keeps its original spacing and
    punctuation instead of being re-joined with single spaces.

    Limitations: only a verb immediately after "I" is conjugated (an
    intervening adverb leaves the verb unchanged), and other first-person
    forms such as "me" / "my" are not rewritten.

    Args:
        text: The utterance to rewrite.
        role: Replacement for "I" (the callers in this file pass "Patient" or
            "Therapist").

    Returns:
        The rewritten utterance as a string ("" for empty input).
    """
    doc = nlp(text)
    pieces = []
    skip_next = False

    for i, token in enumerate(doc):
        if skip_next:
            # This token was already emitted together with the preceding "I".
            skip_next = False
            continue

        lowered = token.text.lower()

        # Defensive: a tokenizer that keeps "I'm" as a single token.
        if lowered in ("i'm", "i\u2019m"):
            pieces.append(role + " is" + token.whitespace_)
            continue

        if lowered == "i":
            following = doc[i + 1] if i + 1 < len(doc) else None
            if following is not None and following.tag_ == "VBP":
                verb = _third_person_singular(following.text)
                gap = token.whitespace_
                if not gap and verb != following.text:
                    # A clitic ("'m") was expanded to a full word ("is"), so a
                    # space is needed between the role and the verb.
                    gap = " "
                pieces.append(role + gap + verb + following.whitespace_)
                skip_next = True
            else:
                pieces.append(role + token.whitespace_)
            continue

        pieces.append(token.text + token.whitespace_)

    return "".join(pieces)

def _rewrite_dialogue_row(row):
    """Return the role-adjusted utterance for a single dialogue row."""
    utterance = row["Utterance"]
    if pd.isna(utterance):
        # Keep missing utterances missing instead of turning them into "nan".
        return utterance
    role = "Patient" if str(row["Type"]).strip() == "P" else "Therapist"
    return adjust_utterance(str(utterance), role)


def preprocess_csv_files(input_path, output_path, train_file_path, metadata_rows=3):
    """Rewrite the utterances of every CSV in a folder and log what was saved.

    For each ``*.csv`` in ``input_path`` (processed in sorted filename order):

    1. The trailing ``metadata_rows`` rows are split off and carried over
       unchanged; files shorter than that are treated as dialogue only.
       (What those rows contain is not visible here; presumably per-session
       annotations, to be confirmed against the dataset.)
    2. Dialogue rows whose "Sub topic" equals "inactive" (case-insensitive)
       are dropped, when that column exists.
    3. Each remaining "Utterance" is passed through ``adjust_utterance`` with
       role "Patient" when "Type" is "P" and "Therapist" otherwise.
    4. Dialogue and metadata rows are written to ``output_path`` under the
       same filename.

    Processing is best-effort: a file that fails is reported and skipped so
    the remaining files are still processed.

    Args:
        input_path: Directory containing the source CSV files.
        output_path: Directory for the processed CSVs (created if missing).
        train_file_path: Text file that receives one processed file stem per
            line; it is overwritten on every call.
        metadata_rows: Number of trailing rows to treat as metadata.

    Returns:
        None. Progress and errors are printed.
    """
    os.makedirs(output_path, exist_ok=True)
    processed_file_names = []

    # Sorted so the manifest order is reproducible across runs and file systems.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)
        try:
            df = pd.read_csv(file_path, encoding="utf-8")
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Type" not in df.columns:
                raise KeyError(f"Required columns missing in file: {filename}")

            if metadata_rows > 0 and len(df) >= metadata_rows:
                metadata_df = df.iloc[-metadata_rows:].copy()
                dialogue_df = df.iloc[:-metadata_rows].copy()
            else:
                metadata_df = None
                dialogue_df = df.copy()

            if "Sub topic" in dialogue_df.columns:
                original_len = len(dialogue_df)
                # astype(str): an all-empty column is read as float and has no
                # .str accessor, which previously made the whole file fail.
                is_inactive = dialogue_df["Sub topic"].astype(str).str.lower().eq("inactive")
                dialogue_df = dialogue_df.loc[~is_inactive].copy()
                removed = original_len - len(dialogue_df)
                print(f"{removed} 'inactive' rows removed in {filename}")

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows to process in: {filename}")
                continue

            dialogue_df = dialogue_df.assign(
                Utterance=dialogue_df.apply(_rewrite_dialogue_row, axis=1)
            )

            if metadata_df is not None and not metadata_df.empty:
                final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            else:
                final_df = dialogue_df.reset_index(drop=True)

            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)

            if os.path.exists(output_file):
                print(f"Saved: {output_file}")
                processed_file_names.append(os.path.splitext(filename)[0])
            else:
                print(f"Failed to save file: {output_file}")

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"Error processing {filename}: {e}")

    # Write processed filenames (make sure the manifest's folder exists first).
    os.makedirs(os.path.dirname(train_file_path) or ".", exist_ok=True)
    with open(train_file_path, "w", encoding="utf-8") as f:
        f.writelines(name + "\n" for name in processed_file_names)

    print(f"\n Done! {len(processed_file_names)} files saved to: {output_path}")
    print(f" Log saved to: {train_file_path}")


In [None]:
# Training split: read the raw CSVs, write the processed copies, and record
# the processed file stems in train.txt.
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/train.txt"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training"
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"

preprocess_csv_files(
    input_path=input_path,
    output_path=output_path,
    train_file_path=train,
)


In [None]:
# Validation split: same pipeline, with the processed file stems recorded in
# val.txt. Note that the manifest path is still stored in a variable named
# `train`.
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/val.txt"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating"
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"

preprocess_csv_files(
    input_path=input_path,
    output_path=output_path,
    train_file_path=train,
)


In [None]:
# Test split: same pipeline, with the processed file stems recorded in
# test.txt. Note that the manifest path is still stored in a variable named
# `train`.
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/test.txt"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing"
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"

preprocess_csv_files(
    input_path=input_path,
    output_path=output_path,
    train_file_path=train,
)
