In [2]:
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the BERT sequence-classification model from the checkpoint
# "model__v1_t3.model".
# NOTE(review): presumably a locally saved fine-tuned checkpoint whose output
# classes correspond to the `labels` list defined below - confirm.
model = BertForSequenceClassification.from_pretrained("model__v1_t3.model")

# Label names used by the classifier; a label's position in the list is its
# integer class id.
labels = ["dummy", "state", "inform", "validate", "reject", "inquire", "direct"]
# Lookup tables in both directions: class id -> name and name -> class id.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

In [4]:
# Quick prototype: each raw NEK19 chat log is converted by the sed-based
# convert_nek.sh script into a '.mod' file that is then read as JSON.
# TODO: replace the shell script with Python's re module.
import subprocess
import os
import shlex
import warnings

import re

cwd = os.getcwd()

# Plain (unescaped) paths; shell quoting is applied only when the command line
# is built, so directory names containing spaces need no manual backslashes.
chat_log_dir = os.path.join(cwd, 'data', 'Project_RED', 'Cost of Conflict', 'chat log data')
convert_script = os.path.join(cwd, 'convert_nek.sh')

session_frames = []
for session in range(1, 9):
    f = os.path.join(chat_log_dir, 'NEK19_{}.txt'.format(session))
    f_out = f + '.mod'

    # Still run through the shell (so the script works with or without a
    # shebang line), but with every argument quoted by shlex.
    cmd = ' '.join(shlex.quote(part) for part in (convert_script, f, f_out))
    result = subprocess.run(cmd, shell=True, capture_output=True)
    stderr_text = result.stderr.decode(errors='replace').strip()

    if result.returncode != 0:
        # Do not abort: the script may still have produced usable output.
        warnings.warn(
            'convert_nek.sh exited with status {} for session {}: {}'.format(
                result.returncode, session, stderr_text))
    if not os.path.exists(f_out):
        # Fail here with the script's own error text instead of letting
        # pd.read_json raise a less informative error.
        raise RuntimeError(
            'convert_nek.sh did not produce {!r} (exit status {}): {}'.format(
                f_out, result.returncode, stderr_text))

    s_df = pd.read_json(f_out)
    s_df = s_df.drop(columns=['_id', 'creationDateTime', 'timeZone'])
    s_df['session'] = session
    session_frames.append(s_df)

# Concatenate once (instead of growing a frame inside the loop) and renumber
# the rows 0..n-1 across all sessions.
nek19_df = pd.concat(session_frames, ignore_index=True)
nek19_chat = Dataset.from_pandas(nek19_df)

In [5]:
# Display the 'content' column (the message text) of every loaded chat row.
nek19_chat['content']

['Hello! Please confirm when the crew is online and ready to begin.',
 'Hydrogeologist: Hello Geo!\n',
 'Session start',
 'Hello! Please confirm when the crew is online and ready to begin. ',
 'Hello! Please confirm when the crew is online and ready to begin. ',
 'Hello! Please confirm when the crew is online and ready to begin. ',
 'Hello! Please confirm when the crew is online and ready to begin. ',
 'hi',
 'Greetings',
 'test',
 'Structural Geologist: Testing',
 'Hi!',
 'Hi, baby',
 'ok',
 'Hello\n',
 'O, Yeah!\n',
 'hello\n',
 'official start',
 'Hello! ',
 'Good afternoon!',
 'Hi!\n',
 'Hydrogeologist: hello!',
 'Hello, we are ready!\n',
 'Maintenance Specialist: Good afternoon!',
 'Biochemical Engineer: Hi team, any preference on where we begin?',
 'hi\n',
 'Hi!',
 'Hi!\n',
 'Hi Justin, this is Harrison',
 'Sedimentologist: is someone logging in as the drilling specialist at Sirius?\n',
 'Hello! \n',
 'Hi!\n',
 'Sydney; this is Harrison',
 'Operations Specialist: Hello, no, drill

In [6]:
# Tokenizer used to encode the chat messages, loaded from "bert-base-uncased".
# NOTE(review): presumably the same tokenizer the classifier checkpoint was
# trained with - confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preproccess(samples):
    """Tokenize one chat row and attach the encoded tensors to it.

    Args:
        samples: A single dataset row (mapping) whose 'content' entry holds
            the message text.

    Returns:
        The same ``samples`` object with 'input_ids', 'token_type_ids' and
        'attention_mask' added. Each message is truncated or padded to 32
        tokens and encoded as PyTorch tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=32,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
        padding="max_length",
    )
    encoded = tokenizer.encode_plus(samples['content'], **encode_options)

    # Copy the model inputs from the encoding onto the row.
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        samples[field] = encoded[field]
    return samples

# Tokenize every row, then make the dataset return PyTorch tensors on access.
nek19_chat = nek19_chat.map(preproccess)
nek19_chat.set_format('torch')

                                                               

In [7]:
from torch import tensor


def classify(samples):
    """Predict the label of one tokenized chat row and store it on the row.

    Args:
        samples: A single dataset row providing 'input_ids', 'token_type_ids',
            'attention_mask' and the raw message text under 'content'.
            Assumes the tensors carry a leading batch dimension of one, as
            only the first row of the logits is stored - TODO confirm.

    Returns:
        The same ``samples`` object with three entries added:
        'logits' (the scores for each label), 'labels' (integer class id) and
        'labels_h' (the matching name from ``labels``). Any message containing
        "lease confirm when" is forced to class 0 / 'dummy' regardless of the
        model's prediction; the stored logits remain the model's own.
    """
    # Inference only: skip autograd bookkeeping for every row.
    with torch.no_grad():
        out = model(
            samples['input_ids'],
            token_type_ids=samples['token_type_ids'],
            attention_mask=samples['attention_mask'],
        )
    logits = out.logits.detach().cpu().numpy()
    # Compute the prediction once and keep it as a plain int so that both
    # branches below store the same type.
    predicted = int(logits.argmax())

    samples['logits'] = logits[0]
    samples['labels_h'] = labels[predicted]
    samples['labels'] = predicted

    # Substring test rather than the regex "^.*lease confirm when.*$": '.' does
    # not match newlines and '$' only matches at the end of the string, so the
    # regex missed the phrase in multi-line messages.
    if "lease confirm when" in samples['content']:
        samples['labels'] = 0
        samples['labels_h'] = 'dummy'
    return samples
# Classify every row; adds the 'logits', 'labels_h' and 'labels' columns.
nek19_chat = nek19_chat.map(classify)

                                                             

In [8]:
# Spot-check a single classified row (index 5).
nek19_chat[5]

{'content': 'Hello! Please confirm when the crew is online and ready to begin. ',
 'recipient': 'con3',
 'sender': 'vc3',
 'sentOnBehalfOf': 'vc3',
 'session': tensor(1),
 'input_ids': tensor([[  101,  7592,   999,  3531, 12210,  2043,  1996,  3626,  2003,  3784,
           1998,  3201,  2000,  4088,  1012,   102,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]),
 'logits': tensor([-5.3299, -5.0041, -5.4921, -6.3570, -7.7601,  3.4182, -3.4893]),
 'labels_h': 'dummy',
 'labels': tensor(0)}

In [16]:
# Taken from Megan's code
from pathlib import Path
from string import punctuation



# Read every transcript file and keep a filtered list of its lines, keyed by
# the file name without extension.
tran_path = Path('data/Project_RED/Cost of Conflict/transcript data')
# Path.glob yields files in no guaranteed order. Sort them so the insertion
# order of `transcripts` (and therefore "the first transcript") is the same
# on every run and machine.
tran_list = sorted(tran_path.glob('*.txt'))
transcripts = {}
for filepath in tran_list:
    name = filepath.stem
    with open(filepath, 'r', encoding='utf-8') as my_file:
        data = my_file.readlines()

    if "MAG" in name:
        # MAG files: keep lines that start with a letter, except those
        # starting with "Joy ".
        # NOTE(review): presumably "Joy " marks non-dialogue lines - confirm.
        clean = [
            line for line in data
            if line[:1].isalpha() and not line.startswith("Joy ")
        ]
    else:
        # Other files: keep lines that contain a colon and do not start with
        # a digit.
        # NOTE(review): presumably "Speaker: text" turns vs. timestamp lines
        # - confirm.
        clean = [
            line for line in data
            if ":" in line and not line[:1].isnumeric()
        ]
    transcripts[name] = clean

def check_trans(word_list, messages):
    """Count word occurrences in messages and matches against a pattern list.

    Each message is split on single spaces; every token is stripped of leading
    and trailing punctuation and lower-cased. Tokens of length 0 or 1 are
    ignored.

    Args:
        word_list: Iterable of patterns. Each one is passed to ``re.match`` as
            a regular expression. A pattern whose last character is "*" is
            treated as a prefix entry; any other pattern must match the whole
            word.
        messages: Iterable of message strings.

    Returns:
        A tuple ``(counter, dict_counter, all_words)``:
        ``counter`` maps each matched word to its number of matches,
        ``dict_counter`` maps each pattern to its number of matches, and
        ``all_words`` maps each alphabetic word to its number of occurrences.
        A non-alphabetic token that does not start with a digit (for example a
        hyphenated word) contributes its alphabetic parts of length > 1
        instead.
    """
    all_words = {}
    counter = {}  # includes actual words from conversation
    dict_counter = {}  # includes words from dictionary

    for message in messages:
        for token in message.split(" "):
            word = token.strip(punctuation).lower()
            if len(word) <= 1:
                continue

            if word in all_words:
                all_words[word] += 1
            elif word.isalpha():
                all_words[word] = 1
            elif not word[0].isnumeric():
                # Split on each punctuation symbol present and count the
                # alphabetic pieces. The previous code tested and updated
                # `word` here instead of `section`; since `word` is neither
                # in `all_words` nor alphabetic in this branch, nothing was
                # ever counted.
                for symbol in punctuation:
                    if symbol not in word:
                        continue
                    for section in word.split(symbol):
                        if len(section) <= 1:
                            continue
                        if section in all_words:
                            all_words[section] += 1
                        elif section.isalpha():
                            all_words[section] = 1

            for check in word_list:
                find = re.match(check, word)
                if find is None:
                    continue
                if check[-1] != "*":
                    # Exact entry: the match has to cover the whole word.
                    if len(word) > find.span()[1]:
                        continue
                elif len(word) < len(check):
                    # Prefix entry: skip words shorter than the entry text.
                    continue
                # NOTE(review): because `check` is used as a regex, a trailing
                # "*" quantifies the preceding character rather than acting as
                # a wildcard. "hat*" therefore matches "happy" (via "ha"), and
                # the length test above (which counts the "*") rejects "hat"
                # itself. Left unchanged because `word_list` is not visible
                # here; confirm the intended semantics.
                counter[word] = counter.get(word, 0) + 1
                dict_counter[check] = dict_counter.get(check, 0) + 1
    return counter, dict_counter, all_words


In [43]:
# Empty frame; it is not populated in this cell.
trans_df = pd.DataFrame()

# Only the first transcript (by insertion order of `transcripts`) is handled
# for now.
# for session in transcripts:
session = transcripts[list(transcripts)[0]]

# One row per kept transcript line, with surrounding whitespace stripped.
df = pd.DataFrame.from_dict(session).apply(lambda column: column.str.strip())
df

Unnamed: 0,0
0,Legend:
1,Ashley: And then //press the red button.
2,Oleg: //[UI]
3,"Ashley: Okay. Today is November 30th, 2021. It..."
4,All: //Mark.
...,...
1004,Oleg: DeChurch ten. …Test 2C? [UI]
1005,Ashley: Ten. DeChurch ten.
1006,Oleg: Yes.
1007,"Vika: During the test, can’t we be looking in ..."
