<a href="https://colab.research.google.com/github/niharikasingh3632/Mental-Health-Counseling-Summarization/blob/main/Processing_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# all live under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Install spaCy and inflect, then download the en_core_web_sm spaCy model.
# NOTE(review): no cell visible in this notebook imports spacy or inflect —
# confirm these installs are still needed.
!pip install spacy inflect
!python -m spacy download en_core_web_sm


In [None]:
import os
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model + tokenizer from the same checkpoint. Both are module-level
# globals that get_sentiment() reads on every call.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Label mapping: class index chosen by argmax in get_sentiment() -> label text.
# NOTE(review): the index order is assumed to match the checkpoint's label
# order — confirm against the model card.
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    """Rewrite first-person wording in *text* as third person about *role*.

    Regex substitutions are applied in list order: contractions and
    "I <verb>" forms first (so the verb can be conjugated), then the
    possessive/object pronouns, and a bare "I" last.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged.
        role: Speaker label inserted in place of the pronoun, e.g. "Patient".

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Apostrophe variants accepted inside contractions: ', ’ and `.
    apos = "['’`]"

    replacements = [
        (rf"\b[Ii]{apos}?m not\b", f"{role} is not"),
        (rf"\b[Ii]{apos}?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (rf"\b[Ii]\s+haven{apos}t\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (rf"\b[Ii]\s+don{apos}t\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (rf"\b[Ii]{apos}?ve\b", f"{role} has"),
        # The apostrophe is mandatory in the next two patterns: when it was
        # optional they also rewrote the ordinary words "id" and "ill".
        (rf"\b[Ii]{apos}d\b", f"{role} would"),
        (rf"\b[Ii]{apos}ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        # Catch-all for any remaining standalone "I"; must stay last.
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Replaces "you" and "your" based on speaker role
def replace_you_and_your(text: str, role: str) -> str:
    """Replace second-person words in *text* with the other party's label.

    When *role* is "Patient" the listener is "therapist"; for any other
    role the listener is "patient". Matching is case-insensitive.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged,
            matching ``grammar_replace``.
        role: Label of the speaker of *text*.

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    other = "therapist" if role == "Patient" else "patient"

    # Accept the same apostrophe variants as grammar_replace; a curly
    # apostrophe would otherwise fall through to the bare "you" rule and
    # produce text such as "therapist’re".
    apos = "['’`]"

    # Contractions first, then "your", then the bare "you" catch-all.
    substitutions = (
        (rf"\byou{apos}re\b", f"{other} is"),
        (rf"\byou{apos}d\b", f"{other} had"),
        (rf"\byou{apos}ve\b", f"{other} has"),
        (r"\byour\b", f"{other}'s"),
        (r"\byou\b", other),
    )

    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text

# Sentiment analysis
def get_sentiment(text: str) -> str:
    """Classify *text* with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Utterance to classify.

    Returns:
        The ``label_map`` entry for the highest-scoring class. "neutral" is
        returned for non-string or blank input, and also when tokenization
        or inference raises (the failure is printed, not re-raised, so one
        bad row does not abort a whole file).
    """
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        # An explicit max_length makes truncation effective even if the
        # tokenizer config carries no model maximum.
        # NOTE(review): 512 assumes a RoBERTa-base sized model — confirm.
        encoded_input = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = model(**encoded_input)
        # Softmax is monotonic, so argmax over the logits picks the same class.
        predicted_label = torch.argmax(output.logits, dim=1)[0].item()
        return label_map[predicted_label]
    except Exception as exc:
        print(f"Sentiment fallback to 'neutral' ({type(exc).__name__}: {exc})")
        return "neutral"

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    """Rewrite every dialogue CSV in *input_path* to third person with sentiment.

    For each ``.csv`` file: the last three rows are set aside and written
    back untouched (the code treats them as metadata rows), rows whose
    "Sub topic" is "inactive" are dropped, each remaining "Utterance" is
    rewritten by ``grammar_replace`` and ``replace_you_and_your`` and
    lower-cased, and a "Sentiment" column is filled by ``get_sentiment`` on
    the original utterance. A file that fails is reported with ``print``
    and skipped so the remaining files are still processed.

    Args:
        input_path: Directory containing the source CSV files.
        output_path: Directory for the rewritten CSVs (created if missing).
        log_file_path: Text file that receives one processed file stem per
            line; its parent directory is created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    def process_row(row):
        raw = row["Utterance"]
        if pd.isna(raw):
            # Keep a missing utterance missing instead of turning it into
            # the literal text "nan"; "neutral" is what get_sentiment
            # returns for non-text input.
            return pd.Series([raw, "neutral"])

        utterance = str(raw)
        # Tolerate stray whitespace / lower case in the speaker code.
        role = "Patient" if str(row["Type"]).strip().upper() == "P" else "Therapist"

        # Transform text
        transformed = grammar_replace(utterance, role)
        transformed = replace_you_and_your(transformed, role)

        # Sentiment is taken from the original wording, not the rewrite.
        return pd.Series([transformed.lower(), get_sentiment(utterance)])

    # Sorted so the processing order and the log are reproducible.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # astype(str) keeps the filter working when the column holds no
            # strings at all; .copy() avoids assigning into a slice below.
            sub_topic = dialogue_df["Sub topic"].astype(str).str.strip().str.lower()
            dialogue_df = dialogue_df[sub_topic != "inactive"].copy()

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            processed = dialogue_df.apply(process_row, axis=1)
            dialogue_df["Utterance"] = processed[0]
            dialogue_df["Sentiment"] = processed[1]

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")

    # Make sure the log can be written even if its folder does not exist yet.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in processed_files)

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [None]:
import os
import re
import pandas as pd

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    """Rewrite first-person wording in *text* as third person about *role*.

    Regex substitutions are applied in list order: contractions and
    "I <verb>" forms first (so the verb can be conjugated), then the
    possessive/object pronouns, and a bare "I" last.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged.
        role: Speaker label inserted in place of the pronoun, e.g. "Patient".

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Apostrophe variants accepted inside contractions: ', ’ and `.
    apos = "['’`]"

    replacements = [
        (rf"\b[Ii]{apos}?m not\b", f"{role} is not"),
        (rf"\b[Ii]{apos}?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (rf"\b[Ii]\s+haven{apos}t\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (rf"\b[Ii]\s+don{apos}t\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (rf"\b[Ii]{apos}?ve\b", f"{role} has"),
        # The apostrophe is mandatory in the next two patterns: when it was
        # optional they also rewrote the ordinary words "id" and "ill".
        (rf"\b[Ii]{apos}d\b", f"{role} would"),
        (rf"\b[Ii]{apos}ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        # Catch-all for any remaining standalone "I"; must stay last.
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Replaces "you" and "your" based on speaker role
def replace_you_and_your(text: str, role: str) -> str:
    """Replace second-person words in *text* with the other party's label.

    When *role* is "Patient" the listener is "therapist"; for any other
    role the listener is "patient". Matching is case-insensitive.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged,
            matching ``grammar_replace``.
        role: Label of the speaker of *text*.

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    other = "therapist" if role == "Patient" else "patient"

    # Accept the same apostrophe variants as grammar_replace; a curly
    # apostrophe would otherwise fall through to the bare "you" rule and
    # produce text such as "therapist’re".
    apos = "['’`]"

    # Contractions first, then "your", then the bare "you" catch-all.
    substitutions = (
        (rf"\byou{apos}re\b", f"{other} is"),
        (rf"\byou{apos}d\b", f"{other} had"),
        (rf"\byou{apos}ve\b", f"{other} has"),
        (r"\byour\b", f"{other}'s"),
        (r"\byou\b", other),
    )

    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    """Rewrite every dialogue CSV in *input_path* to third person.

    For each ``.csv`` file: the last three rows are set aside and written
    back untouched (the code treats them as metadata rows), rows whose
    "Sub topic" is "inactive" are dropped, and each remaining "Utterance"
    is rewritten by ``grammar_replace`` and ``replace_you_and_your`` and
    lower-cased. A file that fails is reported with ``print`` and skipped
    so the remaining files are still processed.

    Args:
        input_path: Directory containing the source CSV files.
        output_path: Directory for the rewritten CSVs (created if missing).
        log_file_path: Text file that receives one processed file stem per
            line; its parent directory is created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    def process_row(row):
        raw = row["Utterance"]
        if pd.isna(raw):
            # Keep a missing utterance missing instead of turning it into
            # the literal text "nan".
            return raw

        # Tolerate stray whitespace / lower case in the speaker code.
        role = "Patient" if str(row["Type"]).strip().upper() == "P" else "Therapist"

        transformed = grammar_replace(str(raw), role)
        transformed = replace_you_and_your(transformed, role)
        return transformed.lower()

    # Sorted so the processing order and the log are reproducible.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # astype(str) keeps the filter working when the column holds no
            # strings at all; .copy() avoids assigning into a slice below.
            sub_topic = dialogue_df["Sub topic"].astype(str).str.strip().str.lower()
            dialogue_df = dialogue_df[sub_topic != "inactive"].copy()

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            dialogue_df["Utterance"] = dialogue_df.apply(process_row, axis=1)

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f" Saved: {output_file}")

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")

    # Make sure the log can be written even if its folder does not exist yet.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in processed_files)

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [None]:

# Preprocess the Train split. `train` is passed as the third argument of
# preprocess_csv_files, i.e. it is the path of the log file.
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/train.txt"

preprocess_csv_files(input_path, output_path, train)


In [None]:
# Preprocess the Validation split.
# NOTE: despite its name, `train` holds this split's log-file path (val.txt).
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/val.txt"

preprocess_csv_files(input_path, output_path, train)


In [None]:
# Preprocess the Test split.
# NOTE: despite its name, `train` holds this split's log-file path (test.txt).
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing"
train = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/test.txt"

preprocess_csv_files(input_path, output_path, train)


In [None]:
import os
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model + tokenizer from the same checkpoint. Both are module-level
# globals that get_sentiment() reads on every call.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Label mapping: class index chosen by argmax in get_sentiment() -> label text.
# NOTE(review): the index order is assumed to match the checkpoint's label
# order — confirm against the model card.
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    """Rewrite first-person wording in *text* as third person about *role*.

    Regex substitutions are applied in list order: contractions and
    "I <verb>" forms first (so the verb can be conjugated), then the
    possessive/object pronouns, and a bare "I" last.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged.
        role: Speaker label inserted in place of the pronoun, e.g. "Patient".

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Apostrophe variants accepted inside contractions: ', ’ and `.
    apos = "['’`]"

    replacements = [
        (rf"\b[Ii]{apos}?m not\b", f"{role} is not"),
        (rf"\b[Ii]{apos}?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (rf"\b[Ii]\s+haven{apos}t\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (rf"\b[Ii]\s+don{apos}t\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (rf"\b[Ii]{apos}?ve\b", f"{role} has"),
        # The apostrophe is mandatory in the next two patterns: when it was
        # optional they also rewrote the ordinary words "id" and "ill".
        (rf"\b[Ii]{apos}d\b", f"{role} would"),
        (rf"\b[Ii]{apos}ll\b", f"{role} will"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        # Catch-all for any remaining standalone "I"; must stay last.
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# Verb conjugation for simple subject-verb agreement
def conjugate_verb(verb: str) -> str:
    """Return *verb* with a third-person-singular ending attached.

    Purely rule-based: a final "y" after a non-vowel becomes "ies"; verbs
    ending in s, sh, ch, x, z or o take "es"; everything else takes "s".
    Irregular verbs are not special-cased.
    """
    consonant_y = verb.endswith('y') and len(verb) > 1 and verb[-2] not in 'aeiou'
    if consonant_y:
        return f"{verb[:-1]}ies"

    suffix = 'es' if verb.endswith(('s', 'sh', 'ch', 'x', 'z', 'o')) else 's'
    return verb + suffix

# Sentiment analysis
def get_sentiment(text: str) -> str:
    """Classify *text* with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Utterance to classify.

    Returns:
        The ``label_map`` entry for the highest-scoring class. "neutral" is
        returned for non-string or blank input, and also when tokenization
        or inference raises (the failure is printed, not re-raised, so one
        bad row does not abort a whole file).
    """
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        # An explicit max_length makes truncation effective even if the
        # tokenizer config carries no model maximum.
        # NOTE(review): 512 assumes a RoBERTa-base sized model — confirm.
        encoded_input = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = model(**encoded_input)
        # Softmax is monotonic, so argmax over the logits picks the same class.
        predicted_label = torch.argmax(output.logits, dim=1)[0].item()
        return label_map[predicted_label]
    except Exception as exc:
        print(f"Sentiment fallback to 'neutral' ({type(exc).__name__}: {exc})")
        return "neutral"

# Preprocess CSV files
def preprocess_csv_files(input_path, output_path, log_file_path):
    """Rewrite first-person wording in every dialogue CSV and tag sentiment.

    For each ``.csv`` file: the last three rows are set aside and written
    back untouched (the code treats them as metadata rows), rows whose
    "Sub topic" is "inactive" are dropped, each remaining "Utterance" is
    rewritten by ``grammar_replace`` (case is preserved), and a "Sentiment"
    column is filled by ``get_sentiment`` on the original utterance. A file
    that fails is reported with ``print`` and skipped so the remaining
    files are still processed.

    Args:
        input_path: Directory containing the source CSV files.
        output_path: Directory for the rewritten CSVs (created if missing).
        log_file_path: Text file that receives one processed file stem per
            line; its parent directory is created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    def process_row(row):
        raw = row["Utterance"]
        if pd.isna(raw):
            # Keep a missing utterance missing instead of turning it into
            # the literal text "nan"; "neutral" is what get_sentiment
            # returns for non-text input.
            return pd.Series([raw, "neutral"])

        utterance = str(raw)
        # Tolerate stray whitespace / lower case in the speaker code.
        role = "Patient" if str(row["Type"]).strip().upper() == "P" else "Therapist"
        return pd.Series([grammar_replace(utterance, role), get_sentiment(utterance)])

    # Sorted so the processing order and the log are reproducible.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            # Split metadata
            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # Filter inactive rows. astype(str) keeps the filter working when
            # the column holds no strings at all; .copy() avoids assigning
            # into a slice below.
            sub_topic = dialogue_df["Sub topic"].astype(str).str.strip().str.lower()
            dialogue_df = dialogue_df[sub_topic != "inactive"].copy()

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue rows in {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            processed = dialogue_df.apply(process_row, axis=1)
            dialogue_df["Utterance"] = processed[0]
            dialogue_df["Sentiment"] = processed[1]

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")

    # Save log; make sure its folder exists first.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in processed_files)

    print(f"\n Done! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [None]:
import os
import re
import pandas as pd
from transformers import pipeline

# Sentiment analysis pipeline, created once at module level and reused by
# preprocess_csv_files() for every file.
# NOTE(review): no model is named here, so the checkpoint is whatever
# transformers selects by default — consider pinning one for reproducibility.
sentiment_pipeline = pipeline("sentiment-analysis")

# Grammar transformation using regex
def grammar_replace(text: str, role: str) -> str:
    """Rewrite first-person wording in *text* as third person about *role*.

    Regex substitutions are applied in list order: contractions and
    "I <verb>" forms first (so the verb can be conjugated), then the
    reflexive/possessive/object pronouns, and a bare "I" last.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged.
        role: Speaker label inserted in place of the pronoun, e.g. "Patient".

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Apostrophe variants accepted inside contractions: ', ’ and `.
    apos = "['’`]"

    replacements = [
        (rf"\b[Ii]{apos}?m not\b", f"{role} is not"),
        (rf"\b[Ii]{apos}?m\b", f"{role} is"),
        (r"\b[Ii]\s+am\b", f"{role} is"),
        (r"\b[Ii]\s+have\b", f"{role} has"),
        (rf"\b[Ii]\s+haven{apos}t\b", f"{role} has not"),
        (r"\b[Ii]\s+was\b", f"{role} was"),
        (r"\b[Ii]\s+do\b", f"{role} does"),
        (rf"\b[Ii]\s+don{apos}t\b", f"{role} doesn't"),
        (r"\b[Ii]\s+can\b", f"{role} can"),
        (r"\b[Ii]\s+will\b", f"{role} will"),
        (rf"\b[Ii]{apos}?ve\b", f"{role} has"),
        # The apostrophe is mandatory in the next two patterns: when it was
        # optional they also rewrote the ordinary words "id" and "ill".
        (rf"\b[Ii]{apos}d\b", f"{role} would"),
        (rf"\b[Ii]{apos}ll\b", f"{role} will"),
        (r"\b[Mm]yself\b", f"{role} himself"),
        (r"\b[Mm]y\b", f"{role}'s"),
        (r"\b[Mm]ine\b", f"{role}'s"),
        (r"\b[Mm]e\b", f"{role}"),
        # Catch-all for any remaining standalone "I"; must stay last.
        (r"\b[Ii]\b", f"{role}"),
    ]

    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)

    return text

# You/Your replacement
def replace_you_and_your(text: str, role: str) -> str:
    """Replace second-person words in *text* with the other party's label.

    When *role* is "Patient" the listener is "therapist"; for any other
    role the listener is "patient". Matching is case-insensitive.

    Args:
        text: Utterance to rewrite. Non-string values are returned unchanged,
            matching ``grammar_replace``.
        role: Label of the speaker of *text*.

    Returns:
        The rewritten string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    other = "therapist" if role == "Patient" else "patient"

    # Accept the same apostrophe variants as grammar_replace; a curly
    # apostrophe would otherwise fall through to the bare "you" rule and
    # produce text such as "therapist’re".
    apos = "['’`]"

    substitutions = (
        # Specific contractions first
        (rf"\byou{apos}re\b", f"{other} is"),
        (rf"\byou{apos}d\b", f"{other} had"),
        (rf"\byou{apos}ve\b", f"{other} has"),
        # Then more general replacements
        (r"\byour\b", f"{other}'s"),
        (r"\byou\b", other),
    )

    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text


# Verb conjugation helper
def conjugate_verb(verb: str) -> str:
    """Append a third-person-singular ending to *verb* using spelling rules.

    "y" preceded by a non-vowel turns into "ies"; endings s, sh, ch, x, z
    and o get "es"; any other verb gets a plain "s". No irregular forms.
    """
    es_endings = ('s', 'sh', 'ch', 'x', 'z', 'o')

    if len(verb) > 1 and verb[-1] == 'y' and verb[-2] not in 'aeiou':
        return verb[:-1] + 'ies'
    if verb.endswith(es_endings):
        return verb + 'es'
    return verb + 's'

# Main preprocessing function
def preprocess_csv_files(input_path, output_path, log_file_path):
    """Rewrite every dialogue CSV in *input_path* to third person with sentiment.

    For each ``.csv`` file: the last three rows are set aside and written
    back untouched (the code treats them as metadata rows), rows whose
    "Sub topic" is "inactive" are dropped, each remaining "Utterance" is
    stripped, rewritten by ``grammar_replace`` and ``replace_you_and_your``
    and lower-cased, and a "Sentiment" column is filled by running the
    module-level ``sentiment_pipeline`` on the rewritten utterances. A file
    that fails is reported with ``print`` and skipped so the remaining
    files are still processed.

    Args:
        input_path: Directory containing the source CSV files.
        output_path: Directory for the rewritten CSVs (created if missing).
        log_file_path: Text file that receives one processed file stem per
            line; its parent directory is created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    processed_files = []

    # Process each row
    def process_row(row):
        raw = row["Utterance"]
        if pd.isna(raw):
            # Keep a missing utterance missing instead of turning it into
            # the literal text "nan".
            return raw

        utterance = str(raw).strip()
        # Tolerate stray whitespace / lower case in the speaker code.
        role = "Patient" if str(row["Type"]).strip().upper() == "P" else "Therapist"
        utterance = grammar_replace(utterance, role)
        utterance = replace_you_and_your(utterance, role)
        return utterance.lower()

    # Sorted so the processing order and the log are reproducible.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_path, filename)

        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep=None, engine='python')
            df.columns = df.columns.str.strip()

            if "Utterance" not in df.columns or "Sub topic" not in df.columns:
                raise KeyError(f"Missing required columns in: {filename}")

            metadata_df = df.iloc[-3:].copy() if len(df) >= 3 else pd.DataFrame()
            dialogue_df = df.iloc[:-3].copy() if len(df) >= 3 else df.copy()

            # Filter out 'inactive' rows. astype(str) keeps the filter working
            # when the column holds no strings at all; .copy() avoids
            # assigning into a slice below.
            sub_topic = dialogue_df["Sub topic"].astype(str).str.strip().str.lower()
            dialogue_df = dialogue_df[sub_topic != "inactive"].copy()

            if dialogue_df.empty:
                print(f"⚠️ No valid dialogue in: {filename}")
                continue

            if "Type" not in dialogue_df.columns:
                raise KeyError(f"Missing 'Type' column in: {filename}")

            dialogue_df["Utterance"] = dialogue_df.apply(process_row, axis=1)

            # Add sentiment for rows that have text; rows without an
            # utterance keep an empty Sentiment cell.
            has_text = dialogue_df["Utterance"].notna()
            dialogue_df["Sentiment"] = None
            if has_text.any():
                # truncation=True keeps one over-long utterance from failing
                # the whole file.
                sentiments = sentiment_pipeline(
                    dialogue_df.loc[has_text, "Utterance"].tolist(),
                    truncation=True,
                )
                dialogue_df.loc[has_text, "Sentiment"] = [
                    s["label"].lower() for s in sentiments
                ]

            final_df = pd.concat([dialogue_df, metadata_df], ignore_index=True)
            output_file = os.path.join(output_path, filename)
            final_df.to_csv(output_file, index=False)
            processed_files.append(os.path.splitext(filename)[0])
            print(f"Saved: {output_file}")

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")

    # Save log file; make sure its folder exists first.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in processed_files)

    print(f"\nDone! {len(processed_files)} files processed.")
    print(f"Log saved to: {log_file_path}")


In [None]:

# Preprocess the Train split into the process_sentiment folder. `train` is
# passed as the third argument of preprocess_csv_files, i.e. the log-file path.
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Train"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/training"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/train.txt"

preprocess_csv_files(input_path, output_path, train)


In [None]:
# Preprocess the Validation split into the process_sentiment folder.
# NOTE: despite its name, `train` holds this split's log-file path (val.txt).
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Validation"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/validating"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/val.txt"

preprocess_csv_files(input_path, output_path, train)


In [None]:
# Preprocess the Test split into the process_sentiment folder.
# NOTE: despite its name, `train` holds this split's log-file path (test.txt).
input_path = "/content/drive/MyDrive/MEMO_KDD_2022/Test"
output_path = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/testing"
train = "/content/drive/MyDrive/MEMO_KDD_2022/process_sentiment/test.txt"

preprocess_csv_files(input_path, output_path, train)


CREATING DATASET

In [1]:
# Mount Google Drive at /content/drive; the dataset folders read and written
# by the cells below live under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import pandas as pd
import numpy as np

def process_csv_folder(folder_path, output_filename='train.csv',
                       output_dir="/content/drive/MyDrive/MEMO_KDD_2022/Processed_new"):
    """Collapse a folder of dialogue CSVs into one dataset with a row per file.

    For every ``.csv`` file in *folder_path*, the rows whose "Utterance" is
    'summary', 'primary_topic' or 'secondary_topic' supply that file's
    metadata, and all other non-missing utterances are joined with single
    spaces into one string.

    Args:
        folder_path: Directory containing the per-file CSVs.
        output_filename: Name of the combined CSV to write.
        output_dir: Directory the combined CSV is written to (created if
            missing). Defaults to the location that used to be hard-coded.

    Returns:
        DataFrame with columns 'utterances', 'summary', 'primary_topic' and
        'secondary_topic'; a metadata value is NaN when its row is absent.
    """
    metadata_keys = ['summary', 'primary_topic', 'secondary_topic']
    final_data = []

    # Sorted so the row order of the dataset is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(folder_path, filename))  # Normal comma-separated CSV

        # Extract summary, primary_topic, secondary_topic.
        # NOTE(review): the value is read from the column at position 1 of
        # the matching row — confirm that is where the metadata text lives.
        record = {}
        for key in metadata_keys:
            matches = df[df['Utterance'] == key]
            record[key] = matches.iloc[0, 1] if not matches.empty else np.nan

        # Drop the metadata rows, then combine all remaining utterances.
        dialogue = df[~df['Utterance'].isin(metadata_keys)]
        combined_utterances = ' '.join(dialogue['Utterance'].dropna().astype(str))

        final_data.append({'utterances': combined_utterances, **record})

    # Explicit columns keep the header present even when no CSV was found.
    combined_df = pd.DataFrame(final_data, columns=['utterances', *metadata_keys])

    # Save to CSV
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    combined_df.to_csv(output_path, index=False)

    print(f"Combined dataset saved to: {output_path}")
    return combined_df


In [6]:
# Folder of preprocessed training CSVs; read by the process_csv_folder call in the next cell.
path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/training"

In [None]:
# Build the combined training dataset from the folder in `path` (set in the
# previous cell; replace it with your actual folder path). No file name is
# passed, so the default 'train.csv' is used.
combined_df = process_csv_folder(path)


In [None]:
# Build the combined validation dataset and save it as valid.csv.
path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/validating"
combined_df = process_csv_folder(path, "valid.csv")

In [None]:
# Build the combined test dataset and save it as test.csv.
path = "/content/drive/MyDrive/MEMO_KDD_2022/Processed_new/testing"
combined_df = process_csv_folder(path, "test.csv")