In [1]:
import pandas as pd

# Read the data

# Load the data AnnoMI-full.csv into a data frame
df = pd.read_csv('AnnoMI-full.csv')

In [5]:
# Check how many unique values video_title column has
df['video_title'].nunique()

133

In [3]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [5]:
# Create separate dataframes for the therapist's utterances and the client's utterances
therapist_df = df[df['interlocutor'] == 'therapist'][['utterance_text', 'main_therapist_behaviour']].reset_index(drop=True)
client_df = df[df['interlocutor'] == 'client'][['utterance_text', 'client_talk_type']].reset_index(drop=True)

# Shift the client's dataframe up by one row
client_df = client_df.shift(-1)

# Concatenate the therapist's and client's dataframes horizontally
paired_df = pd.concat([therapist_df, client_df], axis=1)
paired_df.columns = ['therapist_utterance', 'main_therapist_behaviour', 'client_response', 'client_talk_type']

# Remove any rows with missing values
paired_df = paired_df.dropna()

# Pair the therapist's utterance with the client's response
paired_df['paired_utterances'] = paired_df['therapist_utterance'] + ' ' + paired_df['client_response']

# Clean the paired utterances
paired_df['paired_utterances'] = paired_df['paired_utterances'].apply(clean_text)

# Now you can use 'main_therapist_behaviour' and 'client_talk_type' as labels for your models


In [5]:
paired_df.head()

Unnamed: 0,therapist_utterance,main_therapist_behaviour,client_response,client_talk_type,paired_utterances
0,Thanks for filling it out. We give this form t...,question,Mm-hmm.,neutral,thanks for filling it out we give this form to...
1,"So, let's see. It looks that you put-- You dri...",therapist_input,Usually three drinks and glasses of wine.,neutral,so lets see it looks that you put you drink al...
2,-and you usually have three to four drinks whe...,therapist_input,Something like that.,neutral,and you usually have three to four drinks when...
3,Okay. That's at least 12 drinks a week.,therapist_input,Okay.,neutral,okay thats at least 12 drinks a week okay
4,"Okay. Just so you know, my role, um, when we t...",therapist_input,"Well, I usually drink when I'm at home trying ...",neutral,okay just so you know my role um when we talk ...


In [6]:
# Print the first row of the dataframe only the paired_utterances
print(paired_df['paired_utterances'].iloc[4])

okay just so you know my role um when we talk about alcohol use is just to share information about risk and to help patients who want help this is different than telling them what i think they should do i dont do that well i usually drink when im at home trying to unwind and i drink while im watching a movie and sometimes um i take a bath but i also drink when i take a bath sometimes


In [6]:
# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the vectorizer on our paired utterances
tfidf = tfidf_vectorizer.fit_transform(paired_df['paired_utterances'])

# Convert the tfidf matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df.head()

Unnamed: 0,000003,000005,000030,000040,000057,000058,000104,000112,000113,000118,...,yup,yyeah,yyou,yyoure,yyouve,zero,zerotolerance,zombie,zone,zyban
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score

In [18]:
# Prepare the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(paired_df['main_therapist_behaviour'])  

# Define the models
models = [
    ('Logistic Regression', make_pipeline(StandardScaler(), LogisticRegression(max_iter=5000))),
    ('SVM', svm.SVC(probability=True)),
    ('Random Forest', RandomForestClassifier())
]

# Perform k-fold cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, tfidf_df, labels, cv=kfold, scoring='accuracy')
    print(f'{name}: {cv_results.mean()} ({cv_results.std()})')

Logistic Regression: 0.547439503290172 (0.014571995918612891)
SVM: 0.6262601270077124 (0.015634765446156145)
Random Forest: 0.6096098227552538 (0.018776950330193394)


In [19]:
# do the same as above but for 'client_talk_type'

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(paired_df['client_talk_type'])

# Define the models
models = [
    ('Logistic Regression', make_pipeline(StandardScaler(), LogisticRegression(max_iter=5000))),
    ('SVM', svm.SVC(probability=True)),
    ('Random Forest', RandomForestClassifier())
]

# Perform k-fold cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, tfidf_df, labels, cv=kfold, scoring='accuracy')
    print(f'{name}: {cv_results.mean()} ({cv_results.std()})')

Logistic Regression: 0.5956299087242624 (0.013001408824994216)
SVM: 0.6841196136701337 (0.016511507441948625)
Random Forest: 0.6781723183329795 (0.013352687644476608)


In [1]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

2023-06-20 12:40:08.696966: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-20 12:40:08.747881: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-20 12:40:08.749007: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(paired_df['paired_utterances'])
sequences = tokenizer.texts_to_sequences(paired_df['paired_utterances'])

# Pad sequences so that they all have the same length
max_sequence_length = max(len(s) for s in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Binarize the labels
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(paired_df['main_therapist_behaviour'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(labels.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=64)

# Predict the labels
y_pred = model.predict(X_test)

# Convert the predictions and true labels to the original format
y_pred = label_binarizer.inverse_transform(y_pred)
y_test = label_binarizer.inverse_transform(y_test)

# Print the classification report
print(classification_report(y_test, y_pred))


2023-06-20 10:32:33.099205: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-20 10:32:33.116373: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-20 10:32:33.304425: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gra

Epoch 1/20


2023-06-20 10:32:33.561172: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:32:33.562207: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:32:33.563097: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-20 10:32:46.184772: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:32:46.185712: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:32:46.189110: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
 1/53 [..............................] - ETA: 16s

2023-06-20 10:36:35.581004: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:36:35.582145: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:36:35.583907: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

                 precision    recall  f1-score   support

          other       0.80      0.72      0.76       560
       question       0.67      0.71      0.69       452
     reflection       0.57      0.59      0.58       413
therapist_input       0.53      0.57      0.55       256

       accuracy                           0.66      1681
      macro avg       0.64      0.65      0.64      1681
   weighted avg       0.67      0.66      0.66      1681



In [10]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(paired_df['paired_utterances'])
sequences = tokenizer.texts_to_sequences(paired_df['paired_utterances'])

# Pad sequences so that they all have the same length
max_sequence_length = max(len(s) for s in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Binarize the labels
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(paired_df['client_talk_type'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(labels.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=64)

# Predict the labels
y_pred = model.predict(X_test)

# Convert the predictions and true labels to the original format
y_pred = label_binarizer.inverse_transform(y_pred)
y_test = label_binarizer.inverse_transform(y_test)

# Print the classification report
print(classification_report(y_test, y_pred))

Epoch 1/20


2023-06-20 10:36:37.694022: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:36:37.695055: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:36:37.695897: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-20 10:36:50.783699: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:36:50.785515: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:36:50.786389: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
 3/53 [>.............................] - ETA: 1s 

2023-06-20 10:40:36.238053: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-20 10:40:36.239512: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-20 10:40:36.240371: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

      change       0.50      0.48      0.49       410
     neutral       0.74      0.76      0.75      1042
     sustain       0.44      0.41      0.43       229

    accuracy                           0.65      1681
   macro avg       0.56      0.55      0.56      1681
weighted avg       0.64      0.65      0.64      1681



In [13]:
# Binarize the labels
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(paired_df['main_therapist_behaviour'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, labels, test_size=0.25, random_state=42)

# Define the model
model = Sequential()
model.add(Dense(64, input_dim=tfidf_df.shape[1], activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64)

# Predict the labels
y_pred = model.predict(X_test)

# Convert the predictions and true labels to the original format
y_pred = label_binarizer.inverse_transform(y_pred)
y_test = label_binarizer.inverse_transform(y_test)

# Print the classification report
print(classification_report(y_test, y_pred))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [14]:
# Binarize the labels
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(paired_df['client_talk_type'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, labels, test_size=0.25, random_state=42)

# Define the model
model = Sequential()
model.add(Dense(64, input_dim=tfidf_df.shape[1], activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64)

# Predict the labels
y_pred = model.predict(X_test)

# Convert the predictions and true labels to the original format
y_pred = label_binarizer.inverse_transform(y_pred)
y_test = label_binarizer.inverse_transform(y_test)

# Print the classification report
print(classification_report(y_test, y_pred))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

**Starting With Audio**

In [27]:
# Make new sub dataframe where the topic is "reducing alcohol consumption"

reducing_alcohol_df = df[df['topic'] == 'reducing alcohol consumption']

# Checking the unique number of video in this sub dataframe

reducing_alcohol_df['video_url'].nunique()

23

In [28]:
reducing_alcohol_df.head()

Unnamed: 0,mi_quality,transcript_id,video_title,video_url,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type
0,high,0,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,
1,high,0,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral
2,high,0,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,
3,high,0,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral
4,high,0,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,


In [46]:
# Drop all the instances for the video_title Doctor Uses Motivational Interviewing to Discuss Alcohol Use
reducing_alcohol_df = reducing_alcohol_df[reducing_alcohol_df['video_title'] != 'Doctor Uses Motivational Interviewing to Discuss Alcohol Use']

In [3]:
# Converting videos to audio

import os
import subprocess

# The directory containing your video files
video_dir = 'Video_Audio/Video'

# The directory where you want to save the audio files
audio_dir = 'Video_Audio/Audio'

# List of video extensions
video_extensions = ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.webm']

# Loop over all theme directories in the video directory
for theme in os.listdir(video_dir):
    theme_video_dir = os.path.join(video_dir, theme)
    theme_audio_dir = os.path.join(audio_dir, theme)
    
    # Make sure the theme directory exists in the audio directory
    os.makedirs(theme_audio_dir, exist_ok=True)

    # Loop over all files in the theme video directory
    for filename in os.listdir(theme_video_dir):
        # Check if the file is a video
        if os.path.splitext(filename)[1] in video_extensions:
            # Construct the full file paths
            video_path = os.path.join(theme_video_dir, filename)
            audio_path = os.path.join(theme_audio_dir, os.path.splitext(filename)[0] + '.wav')
            
            # Construct the command for extracting audio
            command = ['ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1', '-y', audio_path]
            
            # Run the command
            subprocess.run(command, check=True)


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [3]:
# Test on one audio file

import os
import pandas as pd
from pydub import AudioSegment

# Function to convert 'HH:MM:SS' format to seconds
def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s

# Filter the dataset to only include rows for a specific video
df_test = df[df['video_title'] == 'Using Open Ended Questions in Home Visiting-OARS-Motivational Interviewing']

# Reset the index of the DataFrame
df_test = df_test.reset_index(drop=True)

# Define the path to your audio file
audio_path = 'Video_Audio/Audio/taking medicine - following medical procedure/Using Open Ended Questions in Home Visiting-OARS-Motivational Interviewing.wav'

# Load the audio file
audio = AudioSegment.from_wav(audio_path)

# Create a directory for the audio segments
os.makedirs('Video_Audio/TestAudio', exist_ok=True)

# Loop over each row in the dataset
for index, row in df_test.iterrows():
    # Extract the audio segment based on the timestamps
    start_time = time_to_seconds(row['timestamp']) * 1000  # Convert to milliseconds
    if index < len(df_test) - 1:
        # If this is not the last row, the end time is the start time of the next utterance
        end_time = time_to_seconds(df_test.loc[index + 1, 'timestamp']) * 1000 # Convert to milliseconds
    else:
        # If this is the last row, the end time is the end of the audio file
        end_time = len(audio)
    segment = audio[start_time:end_time]

    # Construct the path to save the audio segment
    segment_path = os.path.join('Video_Audio/TestAudio', f'{row["interlocutor"]}_{row["video_title"]}_{index}.wav')

    # Save the audio segment
    segment.export(segment_path, format='wav')

In [35]:
import os
import pandas as pd
from pydub import AudioSegment

# Function to convert 'HH:MM:SS' format to seconds
def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s


# Define the path to your audio directory
audio_dir = 'Video_Audio/Audio'

# Create a directory for the audio segments
os.makedirs('Video_Audio/SplitAudio', exist_ok=True)

# Loop over each subdirectory in the audio directory
for subdir in os.listdir(audio_dir):
    subdir_path = os.path.join(audio_dir, subdir)
    
    # Check if the path is a directory
    if os.path.isdir(subdir_path):
        # Loop over each file in the subdirectory
        for filename in os.listdir(subdir_path):
            # Check if the file is a .wav file
            if filename.endswith('.wav'):
                # Define the path to the audio file
                audio_path = os.path.join(subdir_path, filename)
                
                # Load the audio file
                audio = AudioSegment.from_wav(audio_path)
                
                # Filter the dataset to only include rows for this video
                df_filtered = reducing_alcohol_df[reducing_alcohol_df['video_title'] == filename[:-4]]  # Remove the file extension from the filename
                
                # Reset the index of the DataFrame
                df_filtered = df_filtered.reset_index(drop=True)
                
                # Loop over each row in the dataset
                for index, row in df_filtered.iterrows():
                    # Extract the audio segment based on the timestamps
                    start_time = time_to_seconds(row['timestamp']) * 1000  # Convert to milliseconds
                    if index < len(df_filtered) - 1:
                        # If this is not the last row, the end time is the start time of the next utterance
                        end_time = time_to_seconds(df_filtered.loc[index + 1, 'timestamp']) * 1000  # Convert to milliseconds
                    else:
                        # If this is the last row, the end time is the end of the audio file
                        end_time = len(audio)
                    segment = audio[start_time:end_time]
                    
                    # Construct the path to save the audio segment
                    segment_path = os.path.join('Video_Audio/SplitAudio', f'{row["interlocutor"]}_{row["video_title"]}_{index}.wav')
                    
                    # Save the audio segment
                    segment.export(segment_path, format='wav')


In [47]:
# Create separate dataframes for the therapist's utterances and the client's utterances
therapist_df = reducing_alcohol_df[reducing_alcohol_df['interlocutor'] == 'therapist'][['utterance_text', 'main_therapist_behaviour', 'video_title']].reset_index(drop=True)
client_df = reducing_alcohol_df[reducing_alcohol_df['interlocutor'] == 'client'][['utterance_text', 'client_talk_type']].reset_index(drop=True)

# Shift the client's dataframe up by one row
client_df = client_df.shift(-1)

# Concatenate the therapist's and client's dataframes horizontally
paired_df = pd.concat([therapist_df, client_df], axis=1)
paired_df.columns = ['therapist_utterance', 'main_therapist_behaviour', 'video_title', 'client_response', 'client_talk_type']

# Remove any rows with missing values
reducing_alcohol_paired_df = paired_df.dropna()

# Pair the therapist's utterance with the client's response
reducing_alcohol_paired_df['paired_utterances'] = reducing_alcohol_paired_df['therapist_utterance'] + ' ' + reducing_alcohol_paired_df['client_response']

# Clean the paired utterances
reducing_alcohol_paired_df['paired_utterances'] = reducing_alcohol_paired_df['paired_utterances'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reducing_alcohol_paired_df['paired_utterances'] = reducing_alcohol_paired_df['therapist_utterance'] + ' ' + reducing_alcohol_paired_df['client_response']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reducing_alcohol_paired_df['paired_utterances'] = reducing_alcohol_paired_df['paired_utterances'].apply(clean_text)


In [38]:
reducing_alcohol_paired_df.head()

Unnamed: 0,therapist_utterance,main_therapist_behaviour,video_title,client_response,client_talk_type,paired_utterances
0,Thanks for filling it out. We give this form t...,question,"NEW VIDEO: Brief intervention: ""Barbara""",Mm-hmm.,neutral,thanks for filling it out we give this form to...
1,"So, let's see. It looks that you put-- You dri...",therapist_input,"NEW VIDEO: Brief intervention: ""Barbara""",Usually three drinks and glasses of wine.,neutral,so lets see it looks that you put you drink al...
2,-and you usually have three to four drinks whe...,therapist_input,"NEW VIDEO: Brief intervention: ""Barbara""",Something like that.,neutral,and you usually have three to four drinks when...
3,Okay. That's at least 12 drinks a week.,therapist_input,"NEW VIDEO: Brief intervention: ""Barbara""",Okay.,neutral,okay thats at least 12 drinks a week okay
4,"Okay. Just so you know, my role, um, when we t...",therapist_input,"NEW VIDEO: Brief intervention: ""Barbara""","Well, I usually drink when I'm at home trying ...",neutral,okay just so you know my role um when we talk ...


In [39]:
# Save as a CSV file
reducing_alcohol_paired_df.to_csv('reduce_alcohol_paired.csv', index=False)
    

In [64]:
# Import necessary libraries
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder

# Check if CUDA is available and set PyTorch to use GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the DataFrame
df = reducing_alcohol_paired_df

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the 'main_therapist_behaviour' column
df['main_therapist_behaviour'] = label_encoder.fit_transform(df['main_therapist_behaviour'])

class MultimodalDataset(Dataset):
    def __init__(self, df, audio_dir, tokenizer, max_audio_len=16000):
        self.df = df
        self.audio_dir = audio_dir
        self.tokenizer = tokenizer
        self.max_audio_len = max_audio_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['paired_utterances']
        audio_files = [f'therapist_{row["video_title"]}_{2*idx}.wav', 
                       f'client_{row["video_title"]}_{2*idx+1}.wav']
        waveforms = []
        for audio_file in audio_files:
            audio_path = os.path.join(self.audio_dir, audio_file)
            if os.path.exists(audio_path):
                waveform, _ = torchaudio.load(audio_path)
                if waveform.shape[1] < self.max_audio_len:
                    padding = torch.zeros((1, self.max_audio_len - waveform.shape[1]))
                    waveform = torch.cat([waveform, padding], dim=1)
                else:
                    waveform = waveform[:, :self.max_audio_len]
                waveforms.append(waveform)
            else:
                print(f"File not found: {audio_path}")
                waveforms.append(torch.zeros((1, self.max_audio_len)))  # Empty tensor as placeholder

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=200,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'waveforms': waveforms,  # This is now a list of waveforms
            'targets': torch.tensor(row['main_therapist_behaviour'], dtype=torch.long)
        }



# Define the Model
class MultimodalModel(nn.Module):
    def __init__(self):
        super(MultimodalModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(768, 256, batch_first=True)
        self.audio_transforms = torchaudio.transforms.MelSpectrogram()
        self.cnn = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.fc = nn.Linear(256 + 64, len(label_encoder.classes_))  # Update the output size to match the number of classes

    def forward(self, ids, mask, waveform):
        waveform = self.audio_transforms(waveform)
        waveform = waveform.unsqueeze(1)
        audio_features = self.cnn(waveform)
        audio_features = audio_features.view(audio_features.size(0), -1)
        sequence_output, _ = self.bert(ids, attention_mask=mask)
        text_features, _ = self.lstm(sequence_output)
        text_features = text_features[:, -1, :]
        features = torch.cat((text_features, audio_features), dim=1)
        output = self.fc(features)
        return output

Using device: cuda


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['main_therapist_behaviour'] = label_encoder.fit_transform(df['main_therapist_behaviour'])


In [65]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split the DataFrame into training and test sets based on unique video titles
video_titles = df['video_title'].unique()
train_titles, test_titles = train_test_split(video_titles, test_size=0.2, random_state=42)

train_df = df[df['video_title'].isin(train_titles)]
test_df = df[df['video_title'].isin(test_titles)]

# Create the Datasets
train_dataset = MultimodalDataset(train_df, 'Video_Audio/SplitAudio', tokenizer)
test_dataset = MultimodalDataset(test_df, 'Video_Audio/SplitAudio', tokenizer)

# Create the DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Initialize the model and optimizer
model = MultimodalModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
model.train()
criterion = nn.CrossEntropyLoss()
for epoch in range(10):
    print(f'Epoch {epoch+1}')
    for i, data in enumerate(tqdm(train_dataloader)):
        if data is None:
            continue
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        waveforms = [waveform.to(device) for waveform in data['waveforms']]  # This is now a list of waveforms
        targets = data['targets'].to(device)

        optimizer.zero_grad()
        # Concatenate the list of waveforms into a single tensor
        waveforms = torch.cat(waveforms, dim=0)
        outputs = model(ids, mask, waveforms)  # Pass the concatenated tensor to the model
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1


  0%|          | 0/41 [00:00<?, ?it/s]


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [32, 1, 1, 128, 81]

In [63]:
# Evaluate the model
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for i, data in enumerate(tqdm(test_dataloader)):
        if data is None:
            continue

        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        waveform = data['waveform'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask, waveform)
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f'Test Accuracy: {100 * correct / total}%')

  0%|          | 0/11 [00:00<?, ?it/s]


KeyError: 'waveform'

In [None]:
# Evaluate the model
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for i, data in enumerate(tqdm(test_dataloader)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        waveform = data['waveform'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask, waveform)
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f'Test Accuracy: {100 * correct / total}%')

In [None]:
# Check the rows of the video_title Doctor Uses Motivational Interviewing to Discuss Alcohol Use
reducing_alcohol_df[reducing_alcohol_df['video_title'] == 'Doctor Uses Motivational Interviewing to Discuss Alcohol Use']


Unnamed: 0,mi_quality,transcript_id,video_title,video_url,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type
458,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,0,therapist,00:00:49,So I know we've determined that you sprained y...,0,True,information,False,,True,closed,question,
459,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,0,therapist,00:00:49,So I know we've determined that you sprained y...,1,False,,False,,True,open,question,
460,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,0,therapist,00:00:49,So I know we've determined that you sprained y...,2,False,,False,,True,open,question,
461,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,0,therapist,00:00:49,So I know we've determined that you sprained y...,3,False,,False,,True,open,question,
462,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,0,therapist,00:00:49,So I know we've determined that you sprained y...,4,False,,True,simple,True,open,question,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,65,client,00:05:39,Yeah. Sounds good.,5,,,,,,,,change
1114,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,65,client,00:05:39,Yeah. Sounds good.,6,,,,,,,,neutral
1115,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,65,client,00:05:39,Yeah. Sounds good.,7,,,,,,,,neutral
1116,high,7,Doctor Uses Motivational Interviewing to Discu...,https://www.youtube.com/watch?v=_HpKn29oCD0,reducing alcohol consumption,65,client,00:05:39,Yeah. Sounds good.,8,,,,,,,,neutral
