In [1]:
from datasets import Dataset
import pandas as pd
import re
from dateutil import parser
from pathlib import Path
from string import punctuation
import math

In [2]:
# Number of chunks to seperate the data into based on time. 
# For example, if num_blocks = 3, seperate chat into beginning, middle and end
num_blocks = 3

In [3]:
# Parse NEK19 Chat
nek19_chat_df = pd.DataFrame()

for session in range(1,9):
    f_out = "data/chats/NEK19/NEK19_{}.txt".format(session)
    s_df = pd.read_json(f_out)
    s_df = s_df.drop(columns=['_id', 'timeZone'])

    # parse out time said and assign beginning, middle, end labels
    s_df['creationDateTime'] = s_df['creationDateTime'].apply(parser.parse)
    delt = s_df['creationDateTime'].iloc[-1]  - s_df['creationDateTime'].iloc[0]
    def aux(time):
        v = int(((time - s_df['creationDateTime'].iloc[0])/delt) * num_blocks)
        if(v == num_blocks): v = num_blocks-1
    s_df['block'] = s_df['creationDateTime'].map(aux)

    s_df['session'] = session
    nek19_chat_df = pd.concat([nek19_chat_df, s_df])
nek19_chat_df = nek19_chat_df.drop('creationDateTime',axis=1).reset_index(drop=True)

In [4]:
# Parse NEK21 Chat
nek21_chat_df = pd.DataFrame()
session_numbers = list(range(1,20))
session_numbers.remove(2)
for session in session_numbers:
    try:
        f_out = "data/chats/NEK21/NEK21_{}.txt".format(session)
        s_df = pd.read_csv(f_out)
        s_df = s_df.drop(columns=['_id', 'timeZone'])

        # parse out time said and assign beginning, middle, end labels
        s_df['creationDateTime'] = s_df['creationDateTime'].apply(parser.parse)
        delt = s_df['creationDateTime'].iloc[-1]  - s_df['creationDateTime'].iloc[0]
        def aux(time):
            v = int(((time - s_df['creationDateTime'].iloc[0])/delt) * num_blocks)
            if(v == num_blocks): v = num_blocks-1

            return v
        s_df['block'] = s_df['creationDateTime'].map(aux)
        s_df['session'] = session
        nek21_chat_df = pd.concat([nek21_chat_df, s_df])
    except:
        print("Session {} not found".format(session))

nek21_chat_df = nek21_chat_df.drop(['creationDateTime', 'Type_MessageDTO'],axis=1).reset_index(drop=True)

Session 19 not found


In [5]:
# Marges nek19 and 21 into one dataframe, saves it 
# mark nek19 sessions as negative to differenciate between NEK19 and 21
nek19_chat_df['session'] = 1900 + nek19_chat_df['session']
nek21_chat_df['session'] = 2100 + nek21_chat_df['session']
chat_df = pd.concat([nek19_chat_df, nek21_chat_df])
chat_df.to_csv("data/chat.csv")

In [6]:
def check_trans(word_list, messages):
    all_words = {}
    counter = {} # includes actual words from conversation
    dict_counter = {} # includes words from dictionary
    
    for message in messages:
        content = message.split(" ")
        for word in content:
            word = word.strip(punctuation).lower()
            if len(word)>1:
                if word in all_words:
                    all_words[word] += 1
                elif word.isalpha() == True:
                    all_words[word] = 1
                else:
                    if word[0].isnumeric() == False:
                        for symbol in punctuation:
                            if symbol in word:
                                split_word = word.split(symbol)
                                for section in split_word:
                                    if len(section) > 1:
                                        if word in all_words:
                                            all_words[word] += 1
                                        else:
                                            if word.isalpha() == True:
                                                all_words[word] = 1
                for check in word_list:
                    find = re.match(check, word)
                    if find != None:
                        if check[-1] != "*":
                            if len(word) > find.span()[1]:
                                continue
                        else:
                            if len(word) < len(check):
                                continue
                        if word not in counter:
                            counter[word] = 1
                        else:
                            counter[word] += 1
                        if check not in dict_counter:
                            dict_counter[check] = 1
                        else:
                            dict_counter[check] += 1
    return counter, dict_counter, all_words

In [7]:
# Helper functions to parse transcripts, adapted from megan's code
# Input: path to directory full of transcripts
# Output: a DataFrame of all the utterances in the transcript and their speaker
def parse_trans(path):

   tran_path = Path(path)
   tran_list = list(tran_path.glob('*.txt'))
   transcripts = {}
   time = 0
   for filepath in tran_list:
      name = Path(filepath).stem
      with open(filepath,'r',encoding='utf-8') as my_file:
         data = my_file.readlines()
         clean = []
         for line in data:
            if ":" in line and line[:1].isnumeric() == False:
               clean.append(line.strip() + "&&" +  str(time))
            elif ":" in line and line[:1].isnumeric() == True:
               time +=1
      transcripts[name] = clean


   last_time = 0
   trans_df = pd.DataFrame()
   for session in transcripts:
      s_df = pd.DataFrame.from_dict(transcripts[session])
      def trans_proccess(c): 

         global g_time
         x=c[0].strip()
         # print(x)
         try:
            speaker = re.search(r"^[^:]*:\s*", x).group()[:-2]
         except:
            speaker = ""
         try:
            content = re.search(r":(.*)&&", x).group()[1:-2]
         except:
            content = ""
         try:
            time = int(re.search(r"&&.*", x).group()[2:])
            g_time = time
         except:
            time = -1

         if(speaker == "" and content == ""):
            speaker = None
            content = None

         row = pd.Series(dtype='float64')

         row['speaker'] = speaker
         row['content'] = content
         row['block'] = time-last_time 
         row['session'] = session
         return row
      trans_df = pd.concat([trans_df, s_df.apply(lambda x: trans_proccess(x), axis=1)])
      last_time = g_time
   trans_df = trans_df.dropna()
   trans_df = trans_df[trans_df['block'] >= 0].reset_index(drop=True)
   trans_df['block'] = (trans_df['block'] / trans_df['block'].max() * num_blocks).apply(math.floor)
   trans_df['session'] = trans_df['session'].astype('int')
   return trans_df

In [8]:
# Parse all transcripts
trans19 = parse_trans("data/transcripts/NEK19")
trans21 = parse_trans("data/transcripts/NEK21")
trans21_mag = parse_trans("data/transcripts/NEK21_MAG")

In [9]:
# codding sessions, if the session is negative, it is NEK19, if it is 0-100, it is NEK21, if it is 100+, it is a mag session
trans19['session'] = 1900 + trans19['session']
trans21['session'] = 2100 + trans21['session']
trans21_mag['session'] = 12100 + trans21_mag['session']

# Merge into one master file
trans_df = pd.concat([trans19, trans21, trans21_mag])
trans_df = trans_df[trans_df['speaker'] != "Legen"]

In [11]:
# Save Data
trans_df.to_csv("data/trans.csv")

Unnamed: 0,speaker,content,block,session
0,Nastya,Ninety and slash or?,0,1903
1,Ryan to Zheny,And then just your number. So [UI],0,1903
2,Natash,No…,0,1903
3,Zheny,[UI],0,1903
4,Ali,You have a password.,0,1903
...,...,...,...,...
8990,Will,[SL] It’s alright.,0,12111
8991,Oleg,//They scattered away.,0,12111
8992,Vika,//Bye guys.,0,12111
8993,Ashley,Bye everyone. //It was a great session.,0,12111
