In [1]:
from datasets import Dataset
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer
import json

In [None]:
# Load the BERT sequence-classification model saved under the local models/ directory.
model = BertForSequenceClassification.from_pretrained("models/model_mrda_v2_t1.model/")

# Class names in the order of the model's output indices, plus lookups in both directions.
labels = ["statement", "disruption", "backchannel", "follow-me", "question"]
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

In [None]:
#quick prototyping using sed, TODO change to python re module
import subprocess
import os

import re
from dateutil import parser

import shlex

cwd = os.getcwd()

# Number of equal-width time buckets each chat session is divided into.
num_blocks = 3


def _time_block(timestamp, start, span, blocks):
    """Return the 0-based bucket of `timestamp` within [start, start + span].

    `span` is the time between the first and last row of a session. A zero
    span (single message, or identical timestamps) maps everything to bucket 0
    instead of dividing by zero.
    """
    if span.total_seconds() == 0:
        return 0
    v = int(((timestamp - start) / span) * blocks)
    # The last message sits exactly on the upper edge; fold it into the final bucket.
    return blocks - 1 if v == blocks else v


# Sessions 1-19 without session 2 (the reason for skipping it is not visible here).
session_numbers = list(range(1,20))
session_numbers.remove(2)

converter = cwd + '/convert_nek21.sh'
session_frames = []
for session in session_numbers:
    f = cwd + "/data/Project_RED/Cost of Conflict/chat log data/NEK21/{:02d}.csv".format(session)
    f_out = f + '.mod'
    # Quote every argument so spaces in the paths (or in cwd) survive the shell.
    cmd = ' '.join(shlex.quote(part) for part in (converter, f, f_out))
    # check=True: fail here if the conversion fails, rather than reading a stale or missing .mod file.
    subprocess.run(cmd, shell=True, capture_output=True, check=True)

    s_df = pd.read_csv(f_out)
    s_df = s_df.drop(columns=['_id', 'timeZone'])
    s_df['creationDateTime'] = s_df['creationDateTime'].apply(parser.parse)
    # Uses the first/last rows as the session bounds - assumes rows are in chronological order (TODO confirm).
    start = s_df['creationDateTime'].iloc[0]
    delt = s_df['creationDateTime'].iloc[-1] - start
    s_df['time'] = s_df['creationDateTime'].map(lambda ts: _time_block(ts, start, delt, num_blocks))
    s_df['session'] = session
    session_frames.append(s_df)

# Concatenate once instead of growing the frame inside the loop.
nek21_chat_df = pd.concat(session_frames).drop('creationDateTime',axis=1).reset_index(drop=True)
nek21_chat = Dataset.from_pandas(nek21_chat_df)

In [None]:
# Display the raw message text column of the chat dataset.
nek21_chat['content']

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preproccess(samples):
    """Tokenize one example's 'content' and attach the model input fields to it.

    The text is padded/truncated to 32 tokens and returned as PyTorch tensors
    (return_tensors='pt'). The same `samples` mapping is returned with
    'input_ids', 'token_type_ids' and 'attention_mask' added.
    """
    encoded = tokenizer.encode_plus(
        samples['content'],
        add_special_tokens=True,
        max_length=32,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
        padding="max_length",
    )
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        samples[field] = encoded[field]
    return samples

# Tokenize every chat message, then expose the columns as torch tensors.
nek21_chat = nek21_chat.map(preproccess)
nek21_chat.set_format('torch')

In [None]:
from torch import tensor


def classify(samples):
    """Run the classifier on one tokenized example and attach its prediction.

    Adds 'logits' (first row of the model output), 'labels' (predicted class
    index) and 'labels_h' (the matching entry of `labels`). Messages matching
    "...lease confirm when..." are overridden to label 0 / 'dummy'.
    """
    # Inference only: no_grad avoids building an autograd graph for every example.
    with torch.no_grad():
        out = model(samples['input_ids'], token_type_ids=samples['token_type_ids'], attention_mask=samples['attention_mask'])
    logits = out.logits.detach().cpu().numpy()
    # Flat argmax; equals the class index as long as logits holds a single row
    # (assumes one example per call - TODO confirm if map() is ever batched).
    predicted = logits.argmax()

    samples['logits'] = logits[0]
    samples['labels_h'] = labels[predicted]
    samples['labels'] = predicted
    if(re.match("^.*lease confirm when.*$", samples['content'])):
        # NOTE(review): these rows keep numeric label 0, the same id as "statement" - confirm intended.
        samples['labels'] = tensor(0)
        samples['labels_h'] = 'dummy'
    return samples
nek21_chat = nek21_chat.map(classify)

In [None]:
# Inspect one classified chat example (index 5).
nek21_chat[5]

In [None]:
# Taken from Megan's code
from pathlib import Path
from string import punctuation


tran_path = Path('data/Project_RED/Cost of Conflict/transcript data/NEK21/')
# Sorted so the file order (and therefore downstream row order) is the same on every machine.
tran_list = sorted(tran_path.glob('*.txt'))
# Maps file stem -> list of kept transcript lines.
transcripts = {}
# Running count of timestamp lines seen so far; it is NOT reset between files.
time = 0
for filepath in tran_list:
    name = filepath.stem
    with open(filepath,'r',encoding='utf-8') as my_file:
        data = my_file.readlines()
    if "MAG" in name:
        # MAG files: keep lines starting with a letter, except those beginning with "Joy ".
        # These lines are kept raw (no "&&<time>" suffix is appended).
        clean = [line for line in data if line[:1].isalpha() and "Joy " not in line[:4]]
    else:
        clean = []
        for line in data:
            if ":" not in line:
                continue
            if line[:1].isnumeric():
                # A line with ':' that starts with a digit advances the time counter.
                time += 1
            else:
                # Any other line with ':' is kept, tagged with the current counter value.
                clean.append(line.strip() + "&&" + str(time))
    transcripts[name] = clean

def check_trans(word_list, messages):
    """Count words in `messages` and how often they match entries of `word_list`.

    Each message is split on single spaces; every token is stripped of
    surrounding punctuation and lower-cased, and tokens of length <= 1 are
    ignored.

    Args:
        word_list: patterns handed to `re.match`. An entry not ending in "*"
            must match the whole word; an entry ending in "*" is accepted when
            the word is at least as long as the entry (including the "*").
            NOTE(review): entries are treated as regular expressions, so a
            trailing "*" is a regex quantifier, not a glob wildcard - confirm.
        messages: iterable of message strings.

    Returns:
        (counter, dict_counter, all_words):
        counter - matched word -> number of matches,
        dict_counter - word_list entry -> number of matches,
        all_words - alphabetic word -> occurrences. A non-alphabetic token that
        does not start with a digit is split on each punctuation symbol it
        contains and its alphabetic pieces are counted instead.

    NOTE(review): a second `check_trans` is defined later in this notebook and
    replaces this one once that cell has run.
    """
    all_words = {}
    counter = {} # includes actual words from conversation
    dict_counter = {} # includes words from dictionary

    for message in messages:
        for token in message.split(" "):
            word = token.strip(punctuation).lower()
            if len(word) <= 1:
                continue

            if word in all_words:
                all_words[word] += 1
            elif word.isalpha():
                all_words[word] = 1
            elif not word[0].isnumeric():
                for symbol in punctuation:
                    if symbol not in word:
                        continue
                    for section in word.split(symbol):
                        if len(section) <= 1:
                            continue
                        # Count the piece itself; testing the unsplit word here
                        # could never succeed, so nothing was ever recorded.
                        if section in all_words:
                            all_words[section] += 1
                        elif section.isalpha():
                            all_words[section] = 1

            for check in word_list:
                find = re.match(check, word)
                if find is None:
                    continue
                if check[-1] != "*":
                    # Exact entry: reject words that continue past the match.
                    if len(word) > find.span()[1]:
                        continue
                elif len(word) < len(check):
                    continue
                counter[word] = counter.get(word, 0) + 1
                dict_counter[check] = dict_counter.get(check, 0) + 1
    return counter, dict_counter, all_words

# session = transcripts[list(transcripts.keys())[0]]
last_time = 0
# Start from a known value so a g_time left over from an earlier run is never picked up.
g_time = last_time

def trans_proccess(c):
    """Turn one "<speaker>: <content>&&<time>" row into speaker/content/block/session.

    `c` is a row whose column 0 holds the tagged transcript line. Reads the
    globals `last_time` and `session`, and stores the parsed time in the global
    `g_time` whenever a time tag is found. `block` is the parsed time minus
    `last_time`, or a negative number when no time tag could be parsed.
    """
    global g_time
    x = c[0].strip()
    # print(x)
    # Capture the text before the first ':' directly, so the speaker stays intact
    # whatever whitespace follows the colon.
    speaker_match = re.search(r"^([^:]*):", x)
    speaker = speaker_match.group(1) if speaker_match else ""
    content_match = re.search(r":(.*)&&", x)
    content = content_match.group(1) if content_match else ""
    try:
        stamp = int(re.search(r"&&.*", x).group()[2:])
        g_time = stamp
    except (AttributeError, ValueError):
        # No "&&" tag (search returned None) or a non-integer tag.
        stamp = -1

    if(speaker == "" and content == ""):
        # Mark unparseable rows so dropna() removes them below.
        speaker = None
        content = None

    return pd.Series({
        'speaker': speaker,
        'content': content,
        'block': stamp - last_time,
        'session': session,
    })

session_frames = []
for session in transcripts:
    s_df = pd.DataFrame.from_dict(transcripts[session])
    session_frames.append(s_df.apply(trans_proccess, axis=1))
    # The next session's blocks are measured from the last time seen so far.
    last_time = g_time
    # print(str(session) + ": " + str(last_time))
nek21_trans_df = pd.concat(session_frames) if session_frames else pd.DataFrame()
nek21_trans_df = nek21_trans_df.dropna()
nek21_trans_df = nek21_trans_df[nek21_trans_df['block'] >= 0].reset_index(drop=True)
import math
# Rescale block to 0-10 relative to the largest block over ALL sessions.
nek21_trans_df['block'] = (nek21_trans_df['block'] / nek21_trans_df['block'].max() * 10).apply(math.floor)
nek21_trans_df['session'] = nek21_trans_df['session'].astype('int')
nek21_trans_df

In [None]:
# Taken from Megan's code
from pathlib import Path
from string import punctuation


tran_path_mag = Path('data/Project_RED/Cost of Conflict/transcript data/NEK21_MAG/')
# Read from the MAG directory defined above (the NEK21 `tran_path` was globbed here before);
# sorted so the file order is the same on every machine.
tran_list_mag = sorted(tran_path_mag.glob('*.txt'))
# Maps file stem -> list of kept transcript lines, each tagged with "&&<time>".
transcripts_mag = {}
# Running count of timestamp lines seen so far; it is NOT reset between files.
time = 0
for filepath in tran_list_mag:
    name = filepath.stem
    with open(filepath,'r',encoding='utf-8') as my_file:
        data = my_file.readlines()
    clean = []
    for line in data:
        if ":" not in line:
            continue
        if line[:1].isnumeric():
            # A line with ':' that starts with a digit advances the time counter.
            time += 1
        else:
            # Any other line with ':' is kept, tagged with the current counter value.
            clean.append(line.strip() + "&&" + str(time))
    transcripts_mag[name] = clean

def check_trans(word_list, messages):
    """Count words in `messages` and how often they match entries of `word_list`.

    Each message is split on single spaces; every token is stripped of
    surrounding punctuation and lower-cased, and tokens of length <= 1 are
    ignored.

    Args:
        word_list: patterns handed to `re.match`. An entry not ending in "*"
            must match the whole word; an entry ending in "*" is accepted when
            the word is at least as long as the entry (including the "*").
            NOTE(review): entries are treated as regular expressions, so a
            trailing "*" is a regex quantifier, not a glob wildcard - confirm.
        messages: iterable of message strings.

    Returns:
        (counter, dict_counter, all_words):
        counter - matched word -> number of matches,
        dict_counter - word_list entry -> number of matches,
        all_words - alphabetic word -> occurrences. A non-alphabetic token that
        does not start with a digit is split on each punctuation symbol it
        contains and its alphabetic pieces are counted instead.

    NOTE(review): an earlier cell of this notebook already defines
    `check_trans`; this later definition is the one in effect after both run.
    """
    all_words = {}
    counter = {} # includes actual words from conversation
    dict_counter = {} # includes words from dictionary

    for message in messages:
        for token in message.split(" "):
            word = token.strip(punctuation).lower()
            if len(word) <= 1:
                continue

            if word in all_words:
                all_words[word] += 1
            elif word.isalpha():
                all_words[word] = 1
            elif not word[0].isnumeric():
                for symbol in punctuation:
                    if symbol not in word:
                        continue
                    for section in word.split(symbol):
                        if len(section) <= 1:
                            continue
                        # Count the piece itself; testing the unsplit word here
                        # could never succeed, so nothing was ever recorded.
                        if section in all_words:
                            all_words[section] += 1
                        elif section.isalpha():
                            all_words[section] = 1

            for check in word_list:
                find = re.match(check, word)
                if find is None:
                    continue
                if check[-1] != "*":
                    # Exact entry: reject words that continue past the match.
                    if len(word) > find.span()[1]:
                        continue
                elif len(word) < len(check):
                    continue
                counter[word] = counter.get(word, 0) + 1
                dict_counter[check] = dict_counter.get(check, 0) + 1
    return counter, dict_counter, all_words

last_time = 0
# Start from a known value so a g_time left over from an earlier cell is never picked up.
g_time = last_time

def trans_proccess(c):
    """Turn one "<speaker>: <content>&&<time>" row into speaker/content/block/session.

    `c` is a row whose column 0 holds the tagged transcript line. Reads the
    globals `last_time` and `session_mag`, and stores the parsed time in the
    global `g_time` whenever a time tag is found. `block` is the parsed time
    minus `last_time`, or a negative number when no time tag could be parsed.
    """
    global g_time
    x = c[0].strip()
    # print(x)
    # Capture the text before the first ':' directly, so the speaker stays intact
    # whatever whitespace follows the colon.
    speaker_match = re.search(r"^([^:]*):", x)
    speaker = speaker_match.group(1) if speaker_match else ""
    content_match = re.search(r":(.*)&&", x)
    content = content_match.group(1) if content_match else ""
    try:
        stamp = int(re.search(r"&&.*", x).group()[2:])
        g_time = stamp
    except (AttributeError, ValueError):
        # No "&&" tag (search returned None) or a non-integer tag.
        stamp = -1

    if(speaker == "" and content == ""):
        # Mark unparseable rows so dropna() removes them below.
        speaker = None
        content = None

    return pd.Series({
        'speaker': speaker,
        'content': content,
        'block': stamp - last_time,
        # Tag the row with the MAG file being processed, not the loop variable
        # left behind by the NEK21 cell.
        'session': session_mag,
    })

def strip_mag(s):
    """Keep only the digits of a transcript file name."""
    return re.sub("[^0-9]", "", s)

session_frames = []
for session_mag in transcripts_mag:
    s_df = pd.DataFrame.from_dict(transcripts_mag[session_mag])
    session_frames.append(s_df.apply(trans_proccess, axis=1))
    # The next session's blocks are measured from the last time seen so far.
    last_time = g_time
    # print(str(session) + ": " + str(last_time))
mag21_trans_df = pd.concat(session_frames) if session_frames else pd.DataFrame()
# NOTE(review): the session number is taken to be the digits of the file stem
# (e.g. "MAG07" -> 7); confirm against the real file names.
mag21_trans_df['session'] = mag21_trans_df['session'].map(strip_mag).astype(int)
mag21_trans_df = mag21_trans_df.dropna()
mag21_trans_df = mag21_trans_df[mag21_trans_df['block'] >= 0].reset_index(drop=True)
import math
# Rescale block to 0-10 relative to the largest block over ALL sessions.
mag21_trans_df['block'] = (mag21_trans_df['block'] / mag21_trans_df['block'].max() * 10).apply(math.floor)
mag21_trans_df

In [None]:
# Check the Python type of one 'session' value (row label 4) in the MAG frame.
type(mag21_trans_df['session'][4])

In [None]:

# mag21_trans_df

In [None]:
# categorize

def _third_of_session(blocks):
    """Label each block of one session 0, 1 or 2 by thirds of that session's largest block.

    Blocks below max/3 get 0, blocks below 2*max/3 get 1, everything else 2.
    """
    delt = blocks.max()
    thirds = pd.Series(2, index=blocks.index)
    thirds[blocks < 2*delt/3] = 1
    thirds[blocks < delt/3] = 0
    return thirds

# Group on the 'session' column itself rather than on the dict keys of
# `transcripts`: the keys are file-name strings and never equal the integer
# session ids, so the per-session selection used to come back empty.
nek21_trans_df['time'] = nek21_trans_df.groupby('session')['block'].transform(_third_of_session)
# nek21_trans_df =nek21_trans_df.drop('block', axis=1)

mag21_trans_df['time'] = mag21_trans_df.groupby('session')['block'].transform(_third_of_session)

In [None]:
# Collapse the block index into three groups (presumably 11 = number of block values 0-10; verify).
for frame in (mag21_trans_df, nek21_trans_df):
    frame['time'] = (frame['block'] / 11 * 3).map(math.floor)
# Show the MAG rows that fall in block 10.
mag21_trans_df[mag21_trans_df['block'] == 10]

In [None]:
# Wrap both transcript frames as Dataset objects so they can be passed through .map().
nek21_trans = Dataset.from_pandas(nek21_trans_df)
mag21_trans = Dataset.from_pandas(mag21_trans_df)

In [None]:
# Tokenize the NEK21 transcript lines, expose the columns as torch tensors,
# then attach the model's prediction to every row.
nek21_trans = nek21_trans.map(preproccess)
nek21_trans.set_format('torch')

nek21_trans = nek21_trans.map(classify)

In [None]:
# Same pipeline for the MAG transcript lines: tokenize, switch to torch format, classify.
mag21_trans = mag21_trans.map(preproccess)
mag21_trans.set_format('torch')

mag21_trans = mag21_trans.map(classify)

In [None]:
# Display the classified MAG transcript dataset.
mag21_trans

In [None]:
# Inspect a single classified transcript row (index 3008).
nek21_trans[3008]

In [None]:
# Check accuracy by taking a random sample of 20 (10 translated, 10 msgs)

import random 
from operator import itemgetter

n = 10
# Fixed seed so re-running the cell rewrites samples.csv with the same rows.
SAMPLE_SEED = 0
sample_rng = random.Random(SAMPLE_SEED)
# Never ask for more rows than a dataset holds.
trans_sample_ind = sample_rng.sample(range(0, len(nek21_trans)), min(n, len(nek21_trans)))
chat_sample_ind = sample_rng.sample(range(0, len(nek21_chat)), min(n, len(nek21_chat)))
# pd.concat([
#     pd.DataFrame.from_dict(nek21_trans[trans_sample]),
#     pd.DataFrame.from_dict(nek21_chat[chat_sample])
#                  ])

# Index one row at a time so the result is always a list, including when n == 1.
trans_sample = [nek21_trans[ind] for ind in trans_sample_ind]
chat_sample = [nek21_chat[ind] for ind in chat_sample_ind]

# One row per sampled message: where it came from, its text, and the model's label.
sample_rows = [["spoken", t['content'], t['labels_h']] for t in trans_sample]
sample_rows += [["chat", c['content'], c['labels_h']] for c in chat_sample]
samples = pd.DataFrame(sample_rows, columns=["source", "content", "label"])

print("Unique Labels in Set: " + str(len(pd.unique(samples['label']))))
samples.to_csv('samples.csv')
samples

In [None]:
# Refresh the frame from the classified dataset so it carries the prediction columns.
nek21_trans_df = nek21_trans.to_pandas()
# For every label: how many transcript lines received it, plus one random example.
for i in range(0, len(labels)):
    # Text of every transcript line the model assigned to label i.
    df = nek21_trans_df[nek21_trans_df['labels'] == i]['content']
    content = "__NONE__"
    if len(df.index) > 0:
        content = df.iloc[random.randint(0, len(df.index)-1)]

    print_str = labels[i] + ": " + str(len(df.index)) + " : " + content
    print(print_str)


In [None]:
# NOTE(review): this cell repeats the cell before it; each run prints a fresh random example.
nek21_trans_df = nek21_trans.to_pandas()
# For every label: how many transcript lines received it, plus one random example.
for i in range(0, len(labels)):
    # Text of every transcript line the model assigned to label i.
    df = nek21_trans_df[nek21_trans_df['labels'] == i]['content']
    content = "__NONE__"
    if len(df.index) > 0:
        content = df.iloc[random.randint(0, len(df.index)-1)]

    print_str = labels[i] + ": " + str(len(df.index)) + " : " + content
    print(print_str)


In [None]:
# TODO NEK21
# perf_df = pd.read_csv('data/Project_RED/calculated performance data/NEKMTSCalcs.csv')
# perf_df

In [None]:
# Show the transcript rows that fall in block 5.
nek21_trans_df[nek21_trans_df['block']==5]

In [None]:
import matplotlib.ticker as mtick
import numpy as np
import matplotlib.pyplot as plt


# Number of transcript files; later plotting cells use it as the session count.
n = len(transcripts.keys())
fig, axs = plt.subplots(5,4)

fig.set_size_inches(24,20)
fig.suptitle("Transcripts")
# One panel per session number 1..20 (one per axis of the 5x4 grid).
for session, ax in enumerate(axs.flat, start=1):
    ax.set(xlabel='Time', ylabel='Count', title="NEK21 Session: " + str(session))
    ax.axes.xaxis.set_ticklabels([])
    df = nek21_trans_df[nek21_trans_df['session'] == (session)]
    # Cover every block up to the largest one, so a missing block id in the
    # middle does not cut the later blocks off the plot.
    bs = range(0, int(df['block'].max()) + 1) if len(df.index) else range(0)
    # distss[i][b] = number of lines with label i in block b (each block is approx <5 mins)
    distss = [
        [len(df[(df['block'] == b) & (df['labels'] == i)].index) for b in bs]
        for i in range(0, len(labels))
    ]

    ax.stackplot(bs, distss)
fig.legend(labels)
# plt.show() renders under notebook backends, where Figure.show() only emits a warning.
plt.show()

In [None]:
import matplotlib.ticker as mtick
import numpy as np
import matplotlib.pyplot as plt

# Refresh the chat frame from the classified dataset; its 'time' bucket doubles as 'block'.
nek21_chat_df=nek21_chat.to_pandas()
nek21_chat_df['block'] = nek21_chat_df['time']

# Number of transcript files; later plotting cells use it as the session count.
n = len(transcripts.keys())
fig, axs = plt.subplots(5,4)

fig.set_size_inches(24,20)
fig.suptitle("Chat")
# One panel per session number 1..20 (one per axis of the 5x4 grid).
for session, ax in enumerate(axs.flat, start=1):
    ax.set(xlabel='Time', ylabel='Count', title="NEK21 Session: " + str(session))
    ax.axes.xaxis.set_ticklabels([])
    df = nek21_chat_df[nek21_chat_df['session'] == (session)]
    # Cover every block up to the largest one, so a missing block id in the
    # middle does not cut the later blocks off the plot.
    bs = range(0, int(df['block'].max()) + 1) if len(df.index) else range(0)
    # distss[i][b] = number of messages with label i in block b
    distss = [
        [len(df[(df['block'] == b) & (df['labels'] == i)].index) for b in bs]
        for i in range(0, len(labels))
    ]

    ax.stackplot(bs, distss)
fig.legend(labels)
# plt.show() renders under notebook backends, where Figure.show() only emits a warning.
plt.show()

In [None]:
import matplotlib as mpl
from labellines import labelLine, labelLines
# fig, axs = plt.subplots(3,2)
fig = plt.figure()
fig.set_size_inches(24,10)
fig.suptitle("Transcripts")
spec = mpl.gridspec.GridSpec(ncols=3, nrows=2)
# Every block index from 0 to the largest one present, even if some index has no rows.
bs = range(0, int(nek21_trans_df['block'].max()) + 1 if len(nek21_trans_df.index) else 0)
def perf_gradient(score):
    """Blend red (score 0) into green (score 1) and return the RGB triple.

    Only referenced from the commented-out plot call below; presumably score
    is expected in [0, 1] (verify before re-enabling).
    """
    return np.array([1, 0, 0]) * (1-score) + np.array([0, 1, 0]) * score
# Five panels, one per label, filling the 2x3 grid row by row.
axs = [fig.add_subplot(spec[k // 3, k % 3]) for k in range(5)]
for i, ax in enumerate(axs):
    ax.set(xlabel='Time', ylabel='Count', title=(labels[i]))
    ax.axes.xaxis.set_ticklabels([])
    # Rows carrying label i; the same selection serves every session below.
    matched_label = nek21_trans_df[nek21_trans_df['labels'] == i]
    for session in range(1, n+1):
        df = matched_label[matched_label['session'] == (session)]
        # Number of label-i lines of this session in each block.
        dist = [len(df[df['block'] == b].index) for b in bs]
        # ax.plot(bs, dist, label=str(session), color=perf_gradient(perf_df['MTSPerf'].iloc[session-1]))
        ax.plot(bs, dist, label=str(session))
    labelLines(ax.get_lines(), yoffsets=0.00, align=False, zorder=2)

In [None]:

import matplotlib as mpl
from labellines import labelLine, labelLines
# fig, axs = plt.subplots(3,2)
fig = plt.figure()
fig.set_size_inches(24,10)
fig.suptitle("Chats")
spec = mpl.gridspec.GridSpec(ncols=3, nrows=2)
# Every block index from 0 to the largest one present, even if some index has no rows.
bs = range(0, int(nek21_chat_df['block'].max()) + 1 if len(nek21_chat_df.index) else 0)
def perf_gradient(score):
    """Blend red (score 0) into green (score 1) and return the RGB triple.

    Only referenced from the commented-out plot call below; presumably score
    is expected in [0, 1] (verify before re-enabling).
    """
    return np.array([1, 0, 0]) * (1-score) + np.array([0, 1, 0]) * score
# Five panels, one per label, filling the 2x3 grid row by row.
axs = [fig.add_subplot(spec[k // 3, k % 3]) for k in range(5)]
for i, ax in enumerate(axs):
    ax.set(xlabel='Time', ylabel='Count', title=(labels[i]))
    ax.axes.xaxis.set_ticklabels([])
    # Rows carrying label i; the same selection serves every session below.
    matched_label = nek21_chat_df[nek21_chat_df['labels'] == i]
    for session in range(1, n+1):
        df = matched_label[matched_label['session'] == (session)]
        # Number of label-i messages of this session in each block.
        dist = [len(df[df['block'] == b].index) for b in bs]
        # ax.plot(bs, dist, label=str(session), color=perf_gradient(perf_df['MTSPerf'].iloc[session-1]))
        ax.plot(bs, dist, label=str(session))
    labelLines(ax.get_lines(), yoffsets=0.00, align=False, zorder=2)

In [None]:

import matplotlib as mpl
from labellines import labelLine, labelLines
# fig, axs = plt.subplots(3,2)
# Grid layout: one row per label (5), one column per group of four sessions (4).
fig, axs = plt.subplots(5,4)
fig.set_size_inches(24,10)
fig.suptitle("Transcripts")
# Every block index from 0 to the largest one present, even if some index has no rows.
bs = range(0, int(nek21_trans_df['block'].max()) + 1 if len(nek21_trans_df.index) else 0)
def perf_gradient(score):
    """Blend red (score 0) into green (score 1) and return the RGB triple.

    Only referenced from the commented-out plot call below; presumably score
    is expected in [0, 1] (verify before re-enabling).
    """
    return np.array([1, 0, 0]) * (1-score) + np.array([0, 1, 0]) * score
for z in range(0,4):
    # NOTE(review): 4 columns x 4 sessions draws sessions 1-16 only; any
    # session above 16 is not shown - confirm this is intended.
    for i, row_axes in enumerate(axs):
        ax = row_axes[z]
        ax.set(xlabel='Time', ylabel='Count', title=(labels[i]))
        ax.axes.xaxis.set_ticklabels([])
        # Rows carrying label i; the same selection serves every session below.
        matched_label = nek21_trans_df[nek21_trans_df['labels'] == i]
        for session in range(1+4*z, 1+4*(z+1)):
            df = matched_label[matched_label['session'] == (session)]
            # Number of label-i lines of this session in each block.
            dist = [len(df[df['block'] == b].index) for b in bs]
            # ax.plot(bs, dist, label=str(session), color=perf_gradient(perf_df['MTSPerf'].iloc[session-1]))
            ax.plot(bs, dist, label=str(session))
        labelLines(ax.get_lines(), yoffsets=0.00, align=False, zorder=2)

In [None]:

import matplotlib as mpl
from labellines import labelLine, labelLines
# fig, axs = plt.subplots(3,2)
# Grid layout: one row per label (5), one column per group of four sessions (4).
fig, axs = plt.subplots(5,4)
fig.set_size_inches(24,10)
fig.suptitle("Chats")
# Every block index from 0 to the largest one present, even if some index has no rows.
bs = range(0, int(nek21_chat_df['block'].max()) + 1 if len(nek21_chat_df.index) else 0)
def perf_gradient(score):
    """Blend red (score 0) into green (score 1) and return the RGB triple.

    Only referenced from the commented-out plot call below; presumably score
    is expected in [0, 1] (verify before re-enabling).
    """
    return np.array([1, 0, 0]) * (1-score) + np.array([0, 1, 0]) * score
for z in range(0,4):
    # NOTE(review): 4 columns x 4 sessions draws sessions 1-16 only; any
    # session above 16 is not shown - confirm this is intended.
    for i, row_axes in enumerate(axs):
        ax = row_axes[z]
        ax.set(xlabel='Time', ylabel='Count', title=(labels[i]))
        ax.axes.xaxis.set_ticklabels([])
        # Rows carrying label i; the same selection serves every session below.
        matched_label = nek21_chat_df[nek21_chat_df['labels'] == i]
        for session in range(1+4*z, 1+4*(z+1)):
            df = matched_label[matched_label['session'] == (session)]
            # Number of label-i messages of this session in each block.
            dist = [len(df[df['block'] == b].index) for b in bs]
            # ax.plot(bs, dist, label=str(session), color=perf_gradient(perf_df['MTSPerf'].iloc[session-1]))
            ax.plot(bs, dist, label=str(session))
        labelLines(ax.get_lines(), yoffsets=0.00, align=False, zorder=2)

In [None]:
# Show the sixth transcript row whose predicted label id is 3 ("follow-me" in `labels`).
nek21_trans_df[nek21_trans_df['labels'] == (3)].iloc[5]

In [None]:
# export results to csv
# Make sure the output directory exists before writing into it.
os.makedirs('results', exist_ok=True)
export_df1 = nek21_trans_df[['speaker', 'content', 'block', 'session', 'labels', 'labels_h']]
export_df1.to_csv('results/nek21_trans_df.csv')
export_df2 = nek21_chat_df[['sender','recipient', 'content', 'block', 'session', 'labels', 'labels_h']]
export_df2.to_csv('results/nek21_chat_df.csv')
# The MAG frame only gains the prediction columns once it is rebuilt from the
# classified dataset (as is done for the NEK21 frames); do that here if needed.
if 'labels' not in mag21_trans_df.columns:
    mag21_trans_df = mag21_trans.to_pandas()
export_df3 = mag21_trans_df[['speaker', 'content', 'block', 'session', 'labels', 'labels_h']]
export_df3.to_csv('results/mag21_trans_df.csv')