In [None]:
!pip install transformers \sentencepiece

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
path_to_dataset_str = '/content/drive/My Drive/MEMO_KDD_2022/'
path_to_train_str = '/content/drive/My Drive/MEMO_KDD_2022/Train/'
path_to_validation_str = '/content/drive/My Drive/MEMO_KDD_2022/Validation/'
path_to_test_str = '/content/drive/My Drive/MEMO_KDD_2022/Test/'

In [16]:
import os

path_to_dataset = os.path.abspath(path_to_dataset_str)
path_to_train = os.path.abspath(path_to_train_str)
path_to_validation = os.path.abspath(path_to_validation_str)
path_to_test = os.path.abspath(path_to_test_str)

In [None]:
def assign_counseling_component(sub_topic):
  if sub_topic == "symp/reasoning":
    return "SH"  # Symptom & History
  elif sub_topic == "routine":
    return "RT"  # Reflecting
  elif sub_topic == "inactive":
    return "DF"  # Discussion Filler
  else:
    return "PD"  # Default to Patient Discovery if unknown


def create_dataset(input_path, output_path):
  final_data = []

  for filename in os.listdir(input_path):
    if filename.endswith(".csv"):
      # print(f"Processing file: {filename}")
      file_path = os.path.join(input_path, filename)
      df = pd.read_csv(file_path)

      # extracting targeted set, primary and secondary topic
      summary_text, primary_topic, secondary_topic = None, None, None
      for i in range(1, 4): 
        if "summary" in str(df.iloc[-i, 0]).lower():
          summary_text = df.iloc[-i, 1]
        elif "primary topic" in str(df.iloc[-i, 0]).lower():
          primary_topic = df.iloc[-i, 1]
        elif "secondary topic" in str(df.iloc[-i, 0]).lower():
          secondary_topic = df.iloc[-i, 1]
      # print(summary_text)
      # print(primary_topic)
      # print(secondary_topic)

      # Remove last three rows
      df_cleaned = df.iloc[:-3]
      df_cleaned = df_cleaned.reset_index(drop=True)
      df_cleaned = df_cleaned[["Utterance", "Sub topic"]]
      # print(df_cleaned)

      df_cleaned["Counseling_Component"] = df_cleaned["Sub topic"].apply(assign_counseling_component)

      # Combine utterances into a single conversation text
      full_conversation = " ".join(df_cleaned["Utterance"].astype(str).fillna("").tolist())

      t5_input = f"Summarize: {full_conversation}"
      t5_target = summary_text 
      # print(t5_input)
      # print(t5_target)

      final_data.append({"Index": len(final_data), "Input": t5_input, "Output": t5_target})
  df_final = pd.DataFrame(final_data)
  df_final.to_csv(output_path, index=False)
  print(f"Processed and saved all data to: {output_path}")



In [18]:
input_folder = path_to_train
output_file = "/content/drive/My Drive/MEMO_KDD_2022/processed_dataset.csv"
create_dataset(input_folder, output_file)

Processed and saved all data to: /content/drive/My Drive/MEMO_KDD_2022/processed_dataset.csv


In [19]:
input_folder = path_to_validation
validation_file = "/content/drive/My Drive/MEMO_KDD_2022/validation_dataset.csv"
create_dataset(input_folder, validation_file)

Processed and saved all data to: /content/drive/My Drive/MEMO_KDD_2022/validation_dataset.csv


In [20]:
input_folder = path_to_test
test_file = "/content/drive/My Drive/MEMO_KDD_2022/test_dataset.csv"
create_dataset(input_folder, test_file)

Processed and saved all data to: /content/drive/My Drive/MEMO_KDD_2022/test_dataset.csv


In [21]:
train_file = "/content/drive/My Drive/MEMO_KDD_2022/processed_dataset.csv"
validation_file = "/content/drive/My Drive/MEMO_KDD_2022/validation_dataset.csv"
test_file = "/content/drive/My Drive/MEMO_KDD_2022/test_dataset.csv"


In [None]:
if torch.cuda.is_available():
  device = torch.device("cuda")
  print("Using GPU:", torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")
  print("Using CPU")


In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

In [22]:
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(validation_file)
test_df = pd.read_csv(test_file)

In [40]:
train_df

Unnamed: 0,Index,Input,Output
0,0,"Summarize: Well, I could see these types of de...",The patient is involved in making decisions th...
1,1,"Summarize: Hi, Miss Watson, see you again. See...",The blood work of patient after checkup is dis...
2,2,"Summarize: Hi, Hazel Hey doing today. Okay. Ok...",The patient is in school for teaching and they...
3,3,"Summarize: Hello Kathy, how are you today? Oka...",The patient's goal is to lose weight by about ...
4,4,"Summarize: people your age, it's very differen...",The patient was discriminated based on their r...


In [None]:
# Replace NaN values with empty strings
train_df.fillna("", inplace=True)
val_df.fillna("", inplace=True)
test_df.fillna("", inplace=True)

# Verify there are no NaN values left
print(train_df.isnull().sum())
print(val_df.isnull().sum())
print(test_df.isnull().sum())


In [24]:
# Ensure all inputs & outputs are strings
train_df["Input"] = train_df["Input"].astype(str)
train_df["Output"] = train_df["Output"].astype(str)

val_df["Input"] = val_df["Input"].astype(str)
val_df["Output"] = val_df["Output"].astype(str)

test_df["Input"] = test_df["Input"].astype(str)
test_df["Output"] = test_df["Output"].astype(str)


In [25]:
# Save preprocessed data
train_df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/train_dataset_cleaned.csv", index=False)
val_df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/validation_dataset_cleaned.csv", index=False)
test_df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/test_dataset_cleaned.csv", index=False)


**Testing**

**T5**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
df = test_df

In [6]:
# Load the cleaned datasets
train_path = "/content/drive/My Drive/MEMO_KDD_2022/train_dataset_cleaned.csv"
val_path = "/content/drive/My Drive/MEMO_KDD_2022/validation_dataset_cleaned.csv"
test_path = "/content/drive/My Drive/MEMO_KDD_2022/test_dataset_cleaned.csv"

# Load into Pandas DataFrame
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-large" 
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_summary(text):
    input_text = "Summarize: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=150, num_beams=5, length_penalty=1.0, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

df["Generated_Summary"] = df["Input"].apply(generate_summary)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [24]:
df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/T5_large_result.csv", index=False)

In [26]:
df

Unnamed: 0,Index,Input,Output,Generated_Summary,BLEU_Score,BERTScore_F1
0,0,Summarize: You okay now kind of taking a look ...,The patient loves their job and has been very ...,you're afraid of making a mistake. You're afra...,0.04642162,0.183525
1,1,Summarize: Are you doing today? Good. How you ...,The patient has had to give weekly presentatio...,"?. I'm, I'm, I'm, I'm, I'm, I'm, I'm, I'm, I'm...",0.002173574,0.023282
2,2,Summarize: How are you? Not well? No what's go...,The patient has been at their job for 20 years...,? Summarize: How are you? Summarize: How are y...,0.003648984,0.030062
3,3,"Summarize: So, alright, let's take a step back...",The patient has been concerned about the work ...,let's take a step back and talk about competen...,0.0399977,0.035159
4,4,Summarize: Tell me about sure for any spiritua...,The patient was raised Catholic and they feel ...,you should? Summarize: Summarize: Summarize: S...,0.04779535,0.071528
5,5,"Summarize: Okay, I understand you've been havi...",The patient has been sleeping excessively and ...,I'm sorry. to ask you a few questions. Have yo...,0.004128046,-0.060781
6,6,Summarize: Afternoon. I'm glad you could get i...,The patient feels anxious. The patient hasn't ...,I'm glad you could get in today. I'm glad you ...,0.02199415,0.018165
7,7,"Summarize: In one day, and none of them worked...",The patient is frustrated that they are not ab...,"I went to three of these programs, and they we...",0.04810602,0.159621
8,8,Summarize: Hi You How you doing today. Good. H...,The patient's brother expressed concerns about...,. of. And I have of. of. I'm doing well. Good....,0.01679694,0.083731
9,9,Summarize: Alisa How you doing today? Good? Ho...,The patient has been experiencing anxiety. The...,"? I mean, I guess I can see it in that way tha...",0.002490281,0.038211


In [None]:

def compute_sentence_bleu(reference, hypothesis):
    reference_tokens = [reference.split()]
    hypothesis_tokens = hypothesis.split()
    smoothing = SmoothingFunction().method4 
    weights = (0.25, 0.25, 0.25, 0.25) 
    return sentence_bleu(reference_tokens, hypothesis_tokens, weights=weights, smoothing_function=smoothing)

# BLEU scores
df["BLEU_Score"] = df.apply(lambda row: compute_sentence_bleu(row["Output"], row["Generated_Summary"]), axis=1)

references = [[ref.split()] for ref in df["Output"].tolist()]
hypotheses = [hyp.split() for hyp in df["Generated_Summary"].tolist()]

final_bleu = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method4)

# Compute BERTScore
P, R, F1 = score(df["Generated_Summary"].tolist(), df["Output"].tolist(), lang="en", rescale_with_baseline=True)

df["BERTScore_F1"] = F1.tolist()

final_bert = F1.mean().item()

df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/T5_large_eval.csv", index=False)

print(f"Final Corpus BLEU Score: {final_bleu:.4f}")
print(f"Final BERTScore F1: {final_bert:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final Corpus BLEU Score: 0.0008
Final BERTScore F1: -0.2358


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from bert_score import score
import pandas as pd
import numpy as np

nltk.download("punkt")

df = pd.read_csv("/content/drive/My Drive/MEMO_KDD_2022/T5_large_result.csv")

def compute_sentence_bleu(reference, hypothesis):
    reference_tokens = [nltk.word_tokenize(reference)]  
    hypothesis_tokens = nltk.word_tokenize(hypothesis)  
    smoothing = SmoothingFunction().method4
    return sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smoothing)

# BLEU scores
df["BLEU_Score"] = df.apply(lambda row: compute_sentence_bleu(row["Output"], row["Generated_Summary"]), axis=1)

df["BLEU_Score"] = (df["BLEU_Score"] - df["BLEU_Score"].min()) / (df["BLEU_Score"].max() - df["BLEU_Score"].min())

references = [[nltk.word_tokenize(ref)] for ref in df["Output"].tolist()]
hypotheses = [nltk.word_tokenize(hyp) for hyp in df["Generated_Summary"].tolist()]
final_bleu = corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method4)

final_bleu = (final_bleu - 0) / (1 - 0)  # Already between 0 and 1

# BERT Score
P, R, F1 = score(df["Generated_Summary"].astype(str).tolist(), df["Output"].astype(str).tolist(), lang="en", rescale_with_baseline=True)

df["BERTScore_F1"] = (F1 - np.min(F1.numpy())) / (np.max(F1.numpy()) - np.min(F1.numpy()))
final_bert = df["BERTScore_F1"].mean()

df.to_csv("/content/drive/My Drive/MEMO_KDD_2022/evaluation_results_t5_large.csv", index=False)

print(f"Final Corpus BLEU Score (Scaled): {final_bleu:.4f}")
print(f"Final BERTScore F1 (Scaled): {final_bert:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final Corpus BLEU Score (Scaled): 0.0029
Final BERTScore F1 (Scaled): 0.5430


In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from bert_score import score

nltk.download("punkt")

df = test_df  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import torch

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_summary(text):
    input_text = "summarize: " + text 
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        summary_ids = model.generate(**inputs, max_length=100, num_beams=5, length_penalty=2.0)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

df["Generated_Summary"] = df["Input"].apply(generate_summary)

df.to_csv("t5_small_summarized_dataset.csv", index=False)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Summarization complete! Saved to t5_small_summarized_dataset.csv


In [32]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from bert_score import score

nltk.download("punkt")

df = pd.read_csv("t5_small_summarized_dataset.csv")

def compute_sentence_bleu(reference, hypothesis):
    reference_tokens = [nltk.word_tokenize(reference)]  
    hypothesis_tokens = nltk.word_tokenize(hypothesis)  
    smoothing = SmoothingFunction().method4
    return sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smoothing)

# BLEU scores
df["BLEU_Score"] = df.apply(lambda row: compute_sentence_bleu(row["Output"], row["Generated_Summary"]), axis=1)

df["BLEU_Score"] = (df["BLEU_Score"] - df["BLEU_Score"].min()) / (df["BLEU_Score"].max() - df["BLEU_Score"].min())

references = [[nltk.word_tokenize(ref)] for ref in df["Output"].tolist()]
hypotheses = [nltk.word_tokenize(hyp) for hyp in df["Generated_Summary"].tolist()]
final_bleu = corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method4)

final_bleu = (final_bleu - 0) / (1 - 0)  # Already between 0 and 1

# BERTScore
P, R, F1 = score(df["Generated_Summary"].astype(str).tolist(), df["Output"].astype(str).tolist(), lang="en", rescale_with_baseline=True)

df["BERTScore_F1"] = (F1 - np.min(F1.numpy())) / (np.max(F1.numpy()) - np.min(F1.numpy()))
final_bert = df["BERTScore_F1"].mean()

df.to_csv("evaluation_results_t5_small.csv", index=False)

print(f"Final Corpus BLEU Score (Scaled): {final_bleu:.4f}")
print(f"Final BERTScore F1 (Scaled): {final_bert:.4f}")
print("Evaluation complete! Scores saved in evaluation_results_t5_small.csv")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final Corpus BLEU Score (Scaled): 0.0184
Final BERTScore F1 (Scaled): 0.3388
Evaluation complete! Scores saved in evaluation_results_t5_small.csv
