In [None]:
import os
import pkg_resources
import json
import itertools
import spacy
import numpy as np
import pandas as pd
from convokit import Corpus, download

In [None]:
# Fetch the "conversations-gone-awry-corpus" dataset through convokit's download helper
# and load it as a Corpus object used by the rest of the notebook.
corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
# Number of entries in corpus.utterances (nothing has been filtered out yet).
len(corpus.utterances)

In [None]:
# Print the corpus' own summary statistics as a quick sanity check of the load.
corpus.print_summary_stats()

In [None]:
# Build one row per real comment: its conversation, timestamp, the conversation's
# "awry" label and the id of the comment it replies to (its context).
comment_ids = []
convo_ids = []
timestamps = []
page_ids = []  # kept so the name stays defined; intentionally not populated
awry = []
context = []
for comment_id, comment in corpus.utterances.items():
    # section headers are included in the dataset for completeness, but for prediction we need to ignore
    # them as they are not utterances
    if comment.meta["is_section_header"]:
        continue
    comment_ids.append(comment_id)
    convo_ids.append(comment.conversation_id)
    timestamps.append(comment.timestamp)
    # The label was previously hard-coded to True for every row, which made the
    # "awry" column (and everything exported from it) carry no information.
    # NOTE(review): this reads the conversation-level metadata key used by the
    # published corpus; confirm it matches the installed ConvoKit/corpus version.
    conversation = corpus.get_conversation(comment.conversation_id)
    awry.append(conversation.meta["conversation_has_personal_attack"])
    context.append(comment.reply_to)
comment_df = pd.DataFrame(
    {"conversation_id": convo_ids, "timestamp": timestamps, "awry": awry, "context": context},
    index=comment_ids,
)

In [None]:
# Display the per-comment frame (indexed by comment id).
comment_df

In [None]:
# Comments without a parent. `reply_to` is stored as a missing value for such rows,
# which never equals the string "None"; match missing values explicitly and keep
# the literal-string comparison for data that stores the text "None".
comment_df[comment_df["context"].isna() | (comment_df["context"] == "None")]

In [None]:
# Keep the first comment row of every conversation and discard its timestamp.
comm_distinct_df = (
    comment_df
    .drop_duplicates(subset=['conversation_id'])
    .drop(columns=['timestamp'])
)
# One row per conversation: its id and its awry label, on a fresh default index.
conv_distinct_df = pd.DataFrame({
    'conversation_id': comm_distinct_df['conversation_id'].tolist(),
    'awry': comm_distinct_df['awry'].tolist(),
})

In [None]:
import progressbar

In [None]:
# Group comment ids by the conversation they belong to, preserving frame order.
conversations_dict = {}
for utt_id, convo_id in comment_df['conversation_id'].items():
    conversations_dict.setdefault(convo_id, []).append(utt_id)

# One record per conversation: id, its comment ids (copied) and its awry label.
conversations = [
    {
        'conversation_id': convo_id,
        'utterances': list(conversations_dict[convo_id]),
        'awry': is_awry,
    }
    for convo_id, is_awry in zip(conv_distinct_df['conversation_id'], conv_distinct_df['awry'])
]

conversations

In [None]:
# Count the per-utterance token lists across all conversations.
# 'tokens' is only added by a later cell, so on a fresh top-to-bottom run the key
# does not exist yet; `.get` makes this cell report 0 instead of raising KeyError.
len([t for i in conversations for t in i.get('tokens', [])])

In [None]:
# Display the corpus' utterance collection (keyed by utterance id, as used elsewhere in this notebook).
corpus.utterances

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Shared NLP helpers used by the tokenisation cell.
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Contraction fragments that the tokenizer splits off as separate tokens.
stop_words.update(["n't","'s","'m"])
# Same characters as before; the backslash is now escaped explicitly because
# "\/" is an invalid escape sequence that newer Python versions warn about.
punctuations="?:!.,;'[]{}\\/-|`<>*()_"

# Attach the raw text of every utterance to its conversation record.
# The list is rebuilt from scratch on each run so that re-executing this cell
# does not append the same texts a second time.
for i in conversations:
    i['texts'] = [corpus.utterances[j].text for j in i['utterances']]
conversations

In [None]:
import nltk
# Tokenise every utterance text, drop stop words / punctuation tokens and
# lemmatise the remaining words as verbs.
# The token list is rebuilt from scratch per conversation so that re-running
# this cell does not append duplicate entries to an existing 'tokens' list.
for i in conversations:
    tokens_per_utterance = []
    for j in i['texts']:
        # NOTE: `w not in punctuations` is a substring test against the
        # punctuation string, exactly as in the original filter.
        filtered_sentence = [
            w for w in nltk.word_tokenize(j)
            if (w not in stop_words) and (w not in punctuations)
        ]
        tokens_per_utterance.append(
            [wordnet_lemmatizer.lemmatize(word, pos="v") for word in filtered_sentence]
        )
    i['tokens'] = tokens_per_utterance
conversations

In [None]:
# Load emowordnet.csv (semicolon-separated) from the working directory; later cells
# read its 'Lemma' column and eight emotion score columns.
emowordnet = pd.read_csv("emowordnet.csv", sep=';')
# Show the loaded table.
emowordnet

In [None]:
# Emotion score columns read from the lexicon, in a fixed order.
emotion_columns = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD']

# Build the lemma -> scores lookup once instead of scanning the whole frame for
# every token. Only the first row per lemma is kept, matching the previous
# `result.iloc[0]` behaviour.
emo_first_rows = emowordnet.drop_duplicates(subset='Lemma', keep='first')
emo_lookup = dict(zip(emo_first_rows['Lemma'], emo_first_rows[emotion_columns].values.tolist()))

# For every utterance compute:
#   max - the highest single emotion score of any matched word
#   avg - the highest per-emotion total, divided by the number of matched words
for i in conversations:
    maxlist = []
    avglist = []
    for j in i["tokens"]:
        emostats = [0] * len(emotion_columns)  # running per-emotion totals
        emomax = 0
        length = len(j)                        # ends up as the number of matched words
        for word in j:
            emolist = emo_lookup.get(word.lower())
            if emolist is None:
                # Words missing from the lexicon do not count towards the average.
                length -= 1
                continue
            emostats = [x + y for x, y in zip(emostats, emolist)]
            emomax = max(emomax, max(emolist))
        maxlist.append(emomax)
        # Use the accumulated totals. The previous code divided max(emolist), i.e.
        # the scores of whichever word happened to match last (possibly in an
        # earlier utterance), and raised NameError if no word had matched yet.
        avglist.append(max(emostats) / max(length, 1))
    i['max'] = maxlist
    i['avg'] = avglist

conversations

In [None]:
# Number of conversation records.
len(conversations)

In [None]:
# Write the per-conversation records to a JSON file in the working directory.
with open('conversations_time_series.json', 'w') as file:
    json.dump(conversations, file)

In [None]:
def get_context(index, row, context_df):
    """Return the chain of ancestor comment ids for one comment.

    Starting from ``row["context"]`` (the id this comment replies to), follow the
    ``"context"`` column of ``context_df`` upwards for as long as the referenced
    id is present in ``context_df.index``.

    Parameters
    ----------
    index : hashable
        Id of the comment described by ``row``; used to stop on cyclic references.
    row : mapping / pandas.Series
        Row of the comment; only ``row["context"]`` is read.
    context_df : pandas.DataFrame
        Frame indexed by comment id with a ``"context"`` column.

    Returns
    -------
    list
        Ancestor ids ordered from the direct parent to the most distant ancestor;
        empty when the parent is missing or not part of ``context_df``.
    """
    # The previous recursive version had no `return` on the branch that found a
    # parent, so it returned None and the caller's `extend(None)` raised
    # TypeError for any chain longer than one step. It also printed every row
    # it visited and rebuilt a list of the whole index on each call.
    context = []
    seen = {index}
    reply_to = row["context"]
    while reply_to in context_df.index and reply_to not in seen:
        context.append(reply_to)
        seen.add(reply_to)
        reply_to = context_df.loc[reply_to, "context"]
    return context

In [None]:
# All comment ids, in frame order.
list(comment_df.index)

In [None]:
# Start every comment with an empty ancestor list.
conversations_context_dict = {utt_id: [] for utt_id in comment_df.index}

# Walk the comments in frame order. A comment whose parent is a known comment
# gets [parent's (possibly nested) ancestor list, parent id]; any other comment
# keeps an empty list. The nesting is flattened in a later cell.
for utt_id, parent_id in comment_df["context"].items():
    if parent_id in conversations_context_dict:
        conversations_context_dict[utt_id] = [conversations_context_dict[parent_id], parent_id]
    else:
        conversations_context_dict[utt_id] = []
conversations_context_dict

In [None]:
from collections.abc import Iterable

def flatten(l):
    """Lazily yield the leaf elements of an arbitrarily nested iterable.

    Strings and bytes are treated as leaves rather than being split into
    characters; every other iterable element is descended into, depth first.
    """
    for item in l:
        is_leaf = isinstance(item, (str, bytes)) or not isinstance(item, Iterable)
        if is_leaf:
            yield item
            continue
        for leaf in flatten(item):
            yield leaf

In [None]:
# Replace every nested ancestor structure with a flat list of ancestor ids.
for utt_id in list(conversations_context_dict):
    conversations_context_dict[utt_id] = list(flatten(conversations_context_dict[utt_id]))

In [None]:
# Display the comment id -> ancestor ids mapping.
conversations_context_dict

In [None]:
# Turn each mapping entry into one chain: the ancestor ids followed by the comment's own id.
conversations_context = [
    ancestors + [utt_id]
    for utt_id, ancestors in conversations_context_dict.items()
]
conversations_context

In [None]:
# Chains consisting of a single comment carry no context; set them aside.
one_element = [chain for chain in conversations_context if len(chain) == 1]
# Keep the remaining chains. Testing the length directly gives the same result as
# the former `item not in one_element` check (only one-element chains can equal a
# one-element chain) without comparing every chain against every removed chain.
conversations_context = [chain for chain in conversations_context if len(chain) != 1]
len(conversations_context)

In [None]:
# Display the remaining chains.
conversations_context

In [None]:
def subfinder(mylist, pattern):
    """Find every contiguous occurrence of ``pattern`` inside ``mylist``.

    Parameters
    ----------
    mylist : list
        Sequence to search in.
    pattern : list
        Sub-sequence to look for.

    Returns
    -------
    list
        One reference to ``pattern`` per starting position at which it occurs
        (overlapping occurrences included); empty when there is no match or
        when ``pattern`` is empty.
    """
    width = len(pattern)
    # An empty pattern used to raise IndexError on `pattern[0]`; treat it as "no match".
    if width == 0:
        return []
    matches = []
    # Positions where fewer than `width` items remain can never match, so stop early.
    for i in range(len(mylist) - width + 1):
        if mylist[i:i + width] == pattern:
            matches.append(pattern)
    return matches

In [None]:
# A chain that occurs as a contiguous run inside the chain right after it is
# redundant; collect those chains by comparing each neighbouring pair.
to_remove = [
    earlier
    for earlier, later in zip(conversations_context, conversations_context[1:])
    if subfinder(later, earlier)
]
# Drop every chain equal to one of the collected chains.
conversations_context = [chain for chain in conversations_context if chain not in to_remove]
conversations_context

In [None]:
# For every chain, find the conversation(s) containing its first comment and
# emit a record with the chain plus the max/avg emotion values of its members.
conversations_context_list = []
for chain in conversations_context:
    first_id = chain[0]
    for convo in conversations:
        utt_ids = convo['utterances']
        if first_id not in utt_ids:
            continue
        # Positions (in conversation order) of the utterances that belong to the
        # chain; an utterance is listed once per occurrence in the chain.
        positions = [
            utt_ids.index(utt_id)
            for utt_id in utt_ids
            for member in chain
            if utt_id == member
        ]
        conversations_context_list.append({
            "conversation_id": convo['conversation_id'],
            "awry": convo["awry"],
            "discussion": chain,
            "max": [convo['max'][pos] for pos in positions],
            "avg": [convo['avg'][pos] for pos in positions],
        })

In [None]:
# Display the per-chain records.
conversations_context_list

In [None]:
# Write the per-chain records to a JSON file in the working directory.
with open('conversations_context_time_series.json', 'w') as file:
    json.dump(conversations_context_list, file)