In [41]:
import pandas as pd
import paths
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os

In [42]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [57]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [43]:
lexical_df = pd.read_csv('data/Labels/interview_transcripts_by_turkers.csv', header=None)
lexical_df.columns = ['InterviewID', 'Transcript']
lexical_df['InterviewID'] = lexical_df['InterviewID'].str.upper()
lexical_df.head()

Unnamed: 0,InterviewID,Transcript
0,P1,Interviewer: So how are you doing?|Interviewee...
1,P10,Interviewer: So how you doing?|Interviewee: G...
2,P11,Interviewer: So tell me about yourself. |Inte...
3,P12,Interviewer: So how are you doing today?|Inter...
4,P13,Interviewer: How are you doing today?|Intervie...


In [44]:
interviewee_responses = {}

# Iterate through the dataframe rows
for index, row in lexical_df.iterrows():
    transcript = row['Transcript']
    
    # Split the transcript into interview segments
    segments = transcript.split('|')
    
    # Extract interviewee responses
    interviewee_response = ""
    for segment in segments:
        if 'Interviewee:' in segment:
            interviewee_response += segment.replace('Interviewee:', '').strip() + " "
    
    # Append interviewee response to the list
    interviewee_responses[row['InterviewID']] = interviewee_response



In [45]:
interviewee_responses_df = pd.DataFrame.from_dict(interviewee_responses, orient='index')
interviewee_responses_df.columns = ['Transcript']
interviewee_responses_df.reset_index(level=0, inplace=True)
interviewee_responses_df.columns = ['InterviewID', 'Transcript']
interviewee_responses_df.head()

Unnamed: 0,InterviewID,Transcript
0,P1,Im pretty good. ok uhm so have you looked at...
1,P10,Great how about you? I'm a little [???] by th...
2,P11,Uhh I’m a junior at MIT uhh I’m double major...
3,P12,I'm good how are you? Ok so I'm a Junior at...
4,P13,Good. Ok umm I'm currently a junior at M.I.T...


In [46]:
df_labels = pd.read_csv(paths.labels_path)
df_labels = df_labels.rename(columns={'Participant': 'InterviewID'})
df_labels = df_labels.drop(columns=['Worker'])
df_labels = df_labels.groupby('InterviewID').mean().reset_index()
df_labels['InterviewID'] = df_labels['InterviewID'].str.upper()

medians = df_labels.median(numeric_only=True)

In [47]:
classify = {}
ids = df_labels['InterviewID']
for i in ids:
    if df_labels[df_labels['InterviewID'] == i]['StructuredAnswers'].values[0] >= medians['StructuredAnswers']:
        classify[i] = True
    else:
        classify[i] = False
classify_df = pd.DataFrame.from_dict(classify, orient='index')
classify_df.columns = ['StructuredAnswers']
classify_df.reset_index(level=0, inplace=True)
classify_df.columns = ['InterviewID', 'StructuredAnswers']
classify_df.head()

Unnamed: 0,InterviewID,StructuredAnswers
0,P1,True
1,P10,False
2,P11,True
3,P12,False
4,P13,True


In [48]:
# merge the two dataframes
df = pd.merge(interviewee_responses_df, classify_df, on='InterviewID')
df.head()

Unnamed: 0,InterviewID,Transcript,StructuredAnswers
0,P1,Im pretty good. ok uhm so have you looked at...,True
1,P10,Great how about you? I'm a little [???] by th...,False
2,P11,Uhh I’m a junior at MIT uhh I’m double major...,True
3,P12,I'm good how are you? Ok so I'm a Junior at...,False
4,P13,Good. Ok umm I'm currently a junior at M.I.T...,True


In [49]:
# Assuming 'df' is your DataFrame
transcripts = df['Transcript'].values
labels = df['StructuredAnswers'].values

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(transcripts)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(transcripts)

# Pad sequences for uniform length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')


In [50]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, encoded_labels, test_size=0.2, random_state=42)

In [52]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

In [None]:
ham_msg_text = " ".join(msg for msg in df[df['StructuredAnswers'] == False].Transcript)

ham_msg_cloud = WordCloud(width =520, height =260, stopwords = STOPWORDS, max_font_size = 50, background_color = "black", colormap = 'Pastel1').generate(ham_msg_text)
plt.figure(figsize=(16,10))
plt.imshow(ham_msg_cloud, interpolation = 'bilinear')
plt.axis('off') # turn off axis
plt.show()

In [60]:
with tf.device('/cpu:0'):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10


2023-11-30 15:31:46.907604: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at xla_ops.cc:512 : INVALID_ARGUMENT: Trying to access resource dense_1/kernel/36 (defined @ /home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/base_layer_utils.py:137) located in device /job:localhost/replica:0/task:0/device:GPU:0 from device /job:localhost/replica:0/task:0/device:CPU:0
 Cf. https://www.tensorflow.org/xla/known_issues#tfvariable_on_a_different_device
2023-11-30 15:31:46.907656: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at xla_ops.cc:512 : INVALID_ARGUMENT: Trying to access resource dense_1/bias/37 (defined @ /home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/base_layer_utils.py:137) located in device /job:localhost/replica:0/task:0/device:GPU:0 from device /job:localhost/replica:0/task:0/device:CPU:0
 Cf. https://www.tensorflow.org/xla/known_issues#tfvariable_on_a_different_device


InvalidArgumentError: Graph execution error:

Detected at node Adam/StatefulPartitionedCall_4 defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/malhaar/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 982, in launch_instance

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start

  File "/home/malhaar/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 600, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/home/malhaar/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes

  File "/home/malhaar/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code

  File "/tmp/ipykernel_10992/4211684127.py", line 3, in <module>

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1807, in fit

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1154, in train_step

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 544, in minimize

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 1223, in apply_gradients

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 652, in apply_gradients

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 1253, in _internal_apply_gradients

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 1345, in _distributed_apply_gradients_fn

  File "/home/malhaar/.local/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 1340, in apply_grad_to_update_var

Trying to access resource dense_1/kernel/36 (defined @ /home/malhaar/.local/lib/python3.10/site-packages/keras/src/engine/base_layer_utils.py:137) located in device /job:localhost/replica:0/task:0/device:GPU:0 from device /job:localhost/replica:0/task:0/device:CPU:0
 Cf. https://www.tensorflow.org/xla/known_issues#tfvariable_on_a_different_device
	 [[{{node Adam/StatefulPartitionedCall_4}}]] [Op:__inference_train_function_14345]

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

In [None]:
# Assuming 'new_data' is a new set of transcripts
new_sequences = tokenizer.texts_to_sequences(new_data)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length, padding='post')

predictions = model.predict(new_padded_sequences)