In [None]:
import json

In [None]:
# Conversation (thread) properties:
#
# participants: object array:
#        name: string
#
# messages: object array
#        sender_name: string - coincides with one of the participants
#        timestamp_ms: integer - unix time in ms
#        content: string
#        type: string - we'll focus on Generic (basic text)
#
# title: string
#
# is_still_participant: boolean
#
# thread_type: string
#
# thread_path: string

In [None]:
# Path of the exported conversation JSON that the loading cell opens.
file = 'messages_1.json'

In [None]:
# Load the exported thread. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform default.
with open(file, encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

# The file handle is only needed for parsing; everything else works on `data`.
participants = [p['name'] for p in data['participants']]
messages_all = data['messages']

# Keep only plain-text messages: type 'Generic' and carrying a 'content' field.
# .get() avoids a KeyError on entries that have no 'type' key at all.
messages_generic = [
    message for message in messages_all
    if message.get('type') == 'Generic' and 'content' in message
]
messages = messages_generic

print(participants)
# Preview the first kept message; filtering may have left nothing to show.
if messages:
    print(messages[0])

In [None]:
def wordListToFreqDict(wordlist):
    """Count how often each item occurs in ``wordlist``.

    Args:
        wordlist: iterable of hashable items (the callers pass a list of words).

    Returns:
        dict mapping every distinct item to its number of occurrences, with
        keys in order of first appearance.
    """
    # One pass over the input; calling list.count() for every element is
    # quadratic in the number of words.
    wordfreq = {}
    for word in wordlist:
        wordfreq[word] = wordfreq.get(word, 0) + 1
    return wordfreq

def sortFreqDict(freqdict):
    """Return ``(count, key)`` tuples from ``freqdict``, largest first.

    Tuples are ordered by count and then by key, both descending.
    """
    pairs = [(count, word) for word, count in freqdict.items()]
    # Ascending sort followed by a full reversal.
    return sorted(pairs)[::-1]

# get word frequency for given participant
def word_freq(participant, messages):
    """Print ``(count, word)`` tuples for the words sent by ``participant``.

    Every message whose 'sender_name' equals ``participant`` has its
    'content' split on whitespace and lower-cased; the resulting words are
    counted and printed one tuple per line, most frequent first.
    """
    words = [
        word
        for message in messages
        if message['sender_name'] == participant
        for word in map(str.lower, message['content'].split())
    ]

    ranked = sortFreqDict(wordListToFreqDict(words))
    for entry in ranked:
        print(entry)

In [None]:
# Print the word-frequency listing for the first participant in the list.
first_participant = participants[0]
word_freq(first_participant, messages)

In [None]:
# questions for a participant
def question_count(participant, messages):
    """Return how many of ``participant``'s messages contain a '?'.

    A message counts once no matter how many question marks it holds.
    """
    own_texts = [m['content'] for m in messages if m['sender_name'] == participant]
    return sum(1 for text in own_texts if '?' in text)

In [None]:
# Report, for each participant, the number of messages containing a '?'.
for p in participants:
    questions = question_count(p, messages)
    print(p, 'question count:', questions)

In [None]:
# conversation starter
def conversation_starter(participants, messages, interval=3600):
    """Count how many conversations each sender started.

    The list is walked from its last element to its first and each message's
    'timestamp_ms' is compared with the one at the next index, so ``messages``
    is expected to be ordered newest-first. The sender of the last element is
    credited with the first conversation. After that, a message opens a new
    conversation when it arrives more than ``interval`` seconds after the
    previous message and comes from a different sender.

    Args:
        participants: names that always appear in the result (starting at 0).
        messages: list of dicts with 'sender_name' and 'timestamp_ms' keys.
        interval: silence, in seconds, that separates two conversations.

    Returns:
        dict mapping sender name to the number of conversations started.
        Senders that are not in ``participants`` are added on demand instead
        of raising KeyError. An empty ``messages`` yields all zeros.
    """
    interval_ms = interval * 1000  # timestamps are in milliseconds
    count = {p: 0 for p in participants}

    # Nothing to credit when there are no messages at all.
    if not messages:
        return count

    # The oldest message (last element) opens the first conversation.
    first_sender = messages[-1]['sender_name']
    count[first_sender] = count.get(first_sender, 0) + 1

    for i in range(len(messages) - 2, -1, -1):
        current, previous = messages[i], messages[i + 1]
        gap = current['timestamp_ms'] - previous['timestamp_ms']
        if gap > interval_ms and current['sender_name'] != previous['sender_name']:
            sender = current['sender_name']
            count[sender] = count.get(sender, 0) + 1

    return count

In [None]:
# Conversation starters with a one-hour threshold (interval is in seconds).
interval = 60 * 60
convo = conversation_starter(participants, messages, interval)
print('Conversations started (1 hour for a new convo):')
for p, started in convo.items():
    print(p, started)

In [None]:
# Conversation starters with a one-day threshold (interval is in seconds).
interval = 24 * 60 * 60
convo = conversation_starter(participants, messages, interval)
print('Conversations started (1 day for a new convo):')
for p, started in convo.items():
    print(p, started)

In [None]:
# Conversation starters with a two-day threshold (interval is in seconds).
interval = 2 * 24 * 60 * 60
convo = conversation_starter(participants, messages, interval)
print('Conversations started (2 days for a new convo):')
for p, started in convo.items():
    print(p, started)

In [None]:
def avg_response_time(participants, messages, interval=3600):
    """Compute each sender's average response time in seconds.

    Adjacent messages are compared from the end of the list towards the
    start (``messages`` is expected newest-first). A message is a response
    when it comes from a different sender than the message at the next index
    and follows it by less than ``interval`` seconds.

    Args:
        participants: names that always appear in the result.
        messages: list of dicts with 'sender_name' and 'timestamp_ms' keys.
        interval: upper bound, in seconds, for a gap to count as a response.

    Returns:
        dict mapping sender name to a dict with:
            'ttime'     - summed response gaps in milliseconds,
            'responses' - number of responses,
            'avg_time'  - mean gap in seconds rounded to 3 decimals, or 0.0
                          when the sender has no responses (avoids a
                          ZeroDivisionError).
        Senders not listed in ``participants`` are added on demand instead of
        raising KeyError.
    """
    interval_ms = interval * 1000  # timestamps are in milliseconds
    count = {p: {'ttime': 0, 'responses': 0} for p in participants}

    for i in range(len(messages) - 2, -1, -1):
        current, previous = messages[i], messages[i + 1]
        gap = current['timestamp_ms'] - previous['timestamp_ms']
        if gap < interval_ms and current['sender_name'] != previous['sender_name']:
            stats = count.setdefault(current['sender_name'], {'ttime': 0, 'responses': 0})
            stats['responses'] += 1
            stats['ttime'] += gap

    for stats in count.values():
        if stats['responses']:
            # ms -> s conversion folded into the divisor.
            stats['avg_time'] = round(stats['ttime'] / (stats['responses'] * 1000), 3)
        else:
            stats['avg_time'] = 0.0

    return count

In [None]:
# Average response time, counting only replies that came within five minutes.
interval = 5 * 60
avgtime = avg_response_time(participants, messages, interval)
print('Average response time (with response time no higher than 5 mins):')
for p, stats in avgtime.items():
    print(p, stats['avg_time'], 'sec')

In [None]:
def message_percentage(participants, messages):
    """Compute each sender's share of the messages.

    Args:
        participants: names that always appear in the result.
        messages: list of dicts with a 'sender_name' key.

    Returns:
        dict mapping sender name to a dict with:
            'number'     - messages sent,
            'percentage' - share of all messages in percent, rounded to 2
                           decimals; 0.0 when ``messages`` is empty (avoids
                           a ZeroDivisionError).
        Senders not listed in ``participants`` are added on demand instead of
        raising KeyError.
    """
    count = {p: {'number': 0} for p in participants}

    for m in messages:
        count.setdefault(m['sender_name'], {'number': 0})['number'] += 1

    total = len(messages)

    for stats in count.values():
        stats['percentage'] = round(stats['number'] / total * 100, 2) if total else 0.0

    return count
    

In [None]:
# Show each participant's share of the messages.
percentages = message_percentage(participants, messages)
for p, stats in percentages.items():
    print(p, stats['percentage'], '% of messages')

In [None]:
def avg_message_len(participants, messages):
    """Compute each sender's average message length in characters.

    Args:
        participants: names that always appear in the result.
        messages: list of dicts with 'sender_name' and 'content' keys.

    Returns:
        dict mapping sender name to a dict with:
            'number'  - messages sent,
            'tlength' - summed ``len(content)`` of those messages,
            'avglen'  - tlength / number rounded to 2 decimals, or 0.0 for a
                        sender with no messages (avoids a ZeroDivisionError).
        Senders not listed in ``participants`` are added on demand instead of
        raising KeyError.
    """
    count = {p: {'number': 0, 'tlength': 0} for p in participants}

    for m in messages:
        stats = count.setdefault(m['sender_name'], {'number': 0, 'tlength': 0})
        stats['number'] += 1
        stats['tlength'] += len(m['content'])

    for stats in count.values():
        if stats['number']:
            stats['avglen'] = round(stats['tlength'] / stats['number'], 2)
        else:
            stats['avglen'] = 0.0

    return count

In [None]:
# Show each participant's average message length.
avglen = avg_message_len(participants, messages)
print('Average message length:')
for p, stats in avglen.items():
    print(p, stats['avglen'], 'characters')

In [None]:
def average_convo(messages, interval=3600):
    """Return the average number of messages per conversation.

    Adjacent messages are compared from the end of the list towards the
    start (``messages`` is expected newest-first). Every gap longer than
    ``interval`` seconds separates two conversations.

    Args:
        messages: list of dicts with a 'timestamp_ms' key.
        interval: silence, in seconds, that separates two conversations.

    Returns:
        ``len(messages) / conversations`` rounded to 2 decimals, or 0.0 for
        an empty ``messages``.
    """
    if not messages:
        return 0.0

    interval_ms = interval * 1000  # timestamps are in milliseconds

    # k gaps delimit k + 1 conversations; starting at 1 counts the first
    # conversation and also avoids dividing by zero when there is no gap.
    convos = 1
    for i in range(len(messages) - 2, -1, -1):
        gap = messages[i]['timestamp_ms'] - messages[i + 1]['timestamp_ms']
        if gap > interval_ms:
            convos += 1

    return round(len(messages) / convos, 2)

In [None]:
# Average conversation size using the default one-hour interval.
avg_convo_len = average_convo(messages)
print('Average conversation length (a convo delimited by 1 hour pauses):', avg_convo_len, 'messages')