<a href="https://colab.research.google.com/github/niharikasingh3632/Mental-Health-Counseling-Summarization/blob/main/Processing_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [12]:
!pip install spacy inflect
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [36]:
import os
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Label mapping
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    if not isinstance(text, str):
        return text

    replacements = [
        (r"\b[Ii]['’`]?m not\b", f"{role} is not"),
        (r"\b[Ii]['’`]?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (r"\b[Ii]\s+haven't\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (r"\b[Ii]\s+don't\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (r"\b[Ii]['’]?ve\b", f"{role} has"),
        (r"\b[Ii]['’]?d\b", f"{role} would"),
        (r"\b[Ii]['’]?ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Replaces "you" and "your" based on speaker role
def replace_you_and_your(text: str, role: str) -> str:
    if role == "Patient":
        other = "therapist"
    else:
        other = "patient"

    text = re.sub(r"\byou're\b", f"{other} is", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou'd\b", f"{other} had", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou've\b", f"{other} has", text, flags=re.IGNORECASE)
    text = re.sub(r"\byour\b", f"{other}'s", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou\b", other, text, flags=re.IGNORECASE)

    return text

# Sentiment analysis
def get_sentiment(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        scores = torch.nn.functional.softmax(output.logits, dim=1)
        predicted_label = torch.argmax(scores).item()
        return label_map[predicted_label]
    except Exception:
        return "neutral"

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    for filename in os.listdir(input_path):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            dialogue_df = dialogue_df[~dialogue_df["Sub topic"].str.lower().eq("inactive")]

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            def process_row(row):
                utterance = str(row["Utterance"])
                role = "Patient" if row["Type"] == "P" else "Therapist"

                # Transform text
                transformed = grammar_replace(utterance, role)
                transformed = replace_you_and_your(transformed, role)
                transformed = transformed.lower()

                # Get sentiment of original utterance (optional: could also analyze transformed)
                sentiment = get_sentiment(utterance)

                return pd.Series([transformed, sentiment])

            dialogue_df[["Utterance", "Sentiment"]] = dialogue_df.apply(process_row, axis=1)

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    with open(log_file_path, "w") as f:
        for name in processed_files:
            f.write(name + "\n")

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [53]:
import os
import re
import pandas as pd

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    if not isinstance(text, str):
        return text

    replacements = [
        (r"\b[Ii]['’`]?m not\b", f"{role} is not"),
        (r"\b[Ii]['’`]?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (r"\b[Ii]\s+haven't\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (r"\b[Ii]\s+don't\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (r"\b[Ii]['’]?ve\b", f"{role} has"),
        (r"\b[Ii]['’]?d\b", f"{role} would"),
        (r"\b[Ii]['’]?ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Replaces "you" and "your" based on speaker role
def replace_you_and_your(text: str, role: str) -> str:
    if role == "Patient":
        other = "therapist"
    else:
        other = "patient"

    text = re.sub(r"\byou're\b", f"{other} is", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou'd\b", f"{other} had", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou've\b", f"{other} has", text, flags=re.IGNORECASE)
    text = re.sub(r"\byour\b", f"{other}'s", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou\b", other, text, flags=re.IGNORECASE)

    return text

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    for filename in os.listdir(input_path):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            dialogue_df = dialogue_df[~dialogue_df["Sub topic"].str.lower().eq("inactive")]

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            def process_row(row):
                utterance = str(row["Utterance"])
                role = "Patient" if row["Type"] == "P" else "Therapist"

                transformed = grammar_replace(utterance, role)
                transformed = replace_you_and_your(transformed, role)
                transformed = transformed.lower()

                return transformed

            dialogue_df["Utterance"] = dialogue_df.apply(lambda row: process_row(row), axis=1)

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f" Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    with open(log_file_path, "w") as f:
        for name in processed_files:
            f.write(name + "\n")

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [54]:

input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/train.txt"

preprocess_csv_files(input_path, output_path, train)


 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/34.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/48.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/66.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/107.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/109.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/110.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/111.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/1.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/3.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/5.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/6.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/7.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training/8.csv
 Saved: /content/drive/MyDr

In [55]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/val.txt"

preprocess_csv_files(input_path, output_path, train)


 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/16.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/19.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/40.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/46.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/69.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/72.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/75.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/81.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/102.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/116.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/71.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/126.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating/127

In [56]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/test.txt"

preprocess_csv_files(input_path, output_path, train)


 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/2.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/4.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/9.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/23.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/31.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/38.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/39.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/41.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/51.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/52.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/59.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/79.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing/82.csv
 Saved: /content/drive/MyDrive/MEMO_KDD_2

In [43]:
import os
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Label mapping
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    if not isinstance(text, str):
        return text

    replacements = [
        (r"\b[Ii]['’`]?m not\b", f"{role} is not"),
        (r"\b[Ii]['’`]?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (r"\b[Ii]\s+haven't\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (r"\b[Ii]\s+don't\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (r"\b[Ii]['’]?ve\b", f"{role} has"),
        (r"\b[Ii]['’]?d\b", f"{role} would"),
        (r"\b[Ii]['’]?ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Verb conjugation for simple subject-verb agreement
def conjugate_verb(verb: str) -> str:
    if verb.endswith('y') and len(verb) > 1 and verb[-2] not in 'aeiou':
        return verb[:-1] + 'ies'
    elif verb.endswith(('s', 'sh', 'ch', 'x', 'z', 'o')):
        return verb + 'es'
    else:
        return verb + 's'

# Sentiment analysis
def get_sentiment(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        scores = torch.nn.functional.softmax(output.logits, dim=1)
        predicted_label = torch.argmax(scores).item()
        return label_map[predicted_label]
    except Exception:
        return "neutral"

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    for filename in os.listdir(input_path):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            # Split metadata
            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # Filter inactive rows
            dialogue_df = dialogue_df[~dialogue_df["Sub topic"].str.lower().eq("inactive")]

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            def process_row(row):
                utterance = str(row["Utterance"])
                role = "Patient" if row["Type"] == "P" else "Therapist"
                transformed = grammar_replace(utterance, role)
                sentiment = get_sentiment(utterance)
                return pd.Series([transformed, sentiment])

            dialogue_df[["Utterance", "Sentiment"]] = dialogue_df.apply(process_row, axis=1)

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # Save log
    with open(log_file_path, "w") as f:
        for name in processed_files:
            f.write(name + "\n")

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [48]:
import os
import re
import pandas as pd
from transformers import pipeline

# Sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    if not isinstance(text, str):
        return text

    replacements = [
        (r"\b[Ii]['’`]?m not\b", f"{role} is not"),
        (r"\b[Ii]['’`]?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (r"\b[Ii]\s+haven't\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (r"\b[Ii]\s+don't\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (r"\b[Ii]['’]?ve\b", f"{role} has"),
        (r"\b[Ii]['’]?d\b", f"{role} would"),
        (r"\b[Ii]['’]?ll\b", f"{role} will"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# You/Your replacement
def replace_you_and_your(text: str, role: str) -> str:
    if role == "Patient":
        other = "therapist"
    else:
        other = "patient"

    # Specific contractions first
    text = re.sub(r"\byou're\b", f"{other} is", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou'd\b", f"{other} had", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou've\b", f"{other} has", text, flags=re.IGNORECASE)

    # Then more general replacements
    text = re.sub(r"\byour\b", f"{other}'s", text, flags=re.IGNORECASE)
    text = re.sub(r"\byou\b", other, text, flags=re.IGNORECASE)

    return text


# Verb conjugation helper
def conjugate_verb(verb: str) -> str:
    if verb.endswith('y') and len(verb) > 1 and verb[-2] not in 'aeiou':
        return verb[:-1] + 'ies'
    elif verb.endswith(('s', 'sh', 'ch', 'x', 'z', 'o')):
        return verb + 'es'
    else:
        return verb + 's'

# Main preprocessing function
def preprocess_csv_files(input_path, output_path, log_file_path):
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    for filename in os.listdir(input_path):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # Filter out 'inactive' rows
            dialogue_df = dialogue_df[~dialogue_df["Sub topic"].str.lower().eq("inactive")]

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue in: {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            # Process each row
            def process_row(row):
                utterance = str(row["Utterance"]).strip()
                role = "Patient" if row["Type"] == "P" else "Therapist"
                utterance = grammar_replace(utterance, role)
                utterance = replace_you_and_your(utterance, role)
                return utterance.lower()

            dialogue_df["Utterance"] = dialogue_df.apply(process_row, axis=1)

            # Add sentiment
            sentiments = sentiment_pipeline(dialogue_df["Utterance"].tolist())
            dialogue_df["Sentiment"] = [s["label"].lower() for s in sentiments]

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # Save log file
    with open(log_file_path, "w") as f:
        for name in processed_files:
            f.write(name + "\n")

    print(f"\nDone! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [49]:

input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/train.txt"

preprocess_csv_files(input_path, output_path, train)


Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/34.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/48.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/66.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/107.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/109.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/110.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/111.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/1.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/3.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/5.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/6.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/7.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/tra

Token indices sequence length is longer than the specified maximum sequence length for this model (807 > 512). Running this sequence through the model will result in indexing errors


Error processing 26.csv: The size of tensor a (807) must match the size of tensor b (512) at non-singleton dimension 1
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/27.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/28.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/29.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/30.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/32.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/33.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/35.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/36.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/37.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/42.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training/43.csv
Saved: /content/drive/M

In [50]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/val.txt"

preprocess_csv_files(input_path, output_path, train)


Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/16.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/19.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/40.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/46.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/69.csv
Error processing 72.csv: The size of tensor a (732) must match the size of tensor b (512) at non-singleton dimension 1
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/75.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/81.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/102.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/116.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/71.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating/126.cs

In [51]:
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/test.txt"

preprocess_csv_files(input_path, output_path, train)


Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/2.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/4.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/9.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/23.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/31.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/38.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/39.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/41.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/51.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/52.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/59.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/79.csv
Saved: /content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing/82.csv
Sa