In [2]:
#!/usr/bin/env python

import json, os, glob
from pprint import pprint
import re

from collections import defaultdict
import nltk

In [3]:
# Shared lookup tables filled in by the transcript-reading cell.

# user id -> list of that user's sentences (a missing id reads as an empty list)
doc_sentences = defaultdict(list)

# user id -> user name (a missing id reads as the empty string)
id_to_name = defaultdict(str)

In [4]:
######## read_sentences.ipynb ########

# Load the chat transcript and group every message's sentences by author.
# Fills the module-level `doc_sentences` (user id -> list of sentences) and
# `id_to_name` (user id -> name from the first message seen for that id).

# read data from old transcript
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('transcript-15528094.json', encoding='utf-8') as f:
    data = json.load(f)

# Splits on the spaces that follow '.', '!' or '?', keeping the punctuation
# attached to the sentence before it. Compiled once, outside the loop.
# regex from https://stackoverflow.com/questions/14622835/split-string-on-or-keeping-the-punctuation-mark
sentence_boundary = re.compile(r'(?<=[.!?]) +')

for item in data:
    message = item['text']
    if message is None:
        # entries without text carry nothing to analyse
        continue

    user_id = item['user_id']

    # ignore system and calendar messages
    if user_id == 'system' or user_id == 'calendar':
        continue

    # A message may span several lines; split on newlines first, then split
    # each line into sentences. The inner loop variable is deliberately not
    # called `item`, so the transcript entry is never shadowed.
    for line in message.split('\n'):
        for sentence in sentence_boundary.split(line):
            if sentence:
                # empty fragments are skipped
                doc_sentences[user_id].append(sentence)

    # remember the name the first time this user id is seen
    if user_id not in id_to_name:
        id_to_name[user_id] = item['name']

In [5]:
# Write each user's sentences to '<user id>_sentences.txt' in the current
# working directory, one sentence per line.

for user, sentences in doc_sentences.items():
    filename = str(user) + '_sentences.txt'
    # The context manager closes the file even if a write fails part-way,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filename, "w") as outfile:
        outfile.writelines(sentence + "\n" for sentence in sentences)

In [6]:
####### parse_sentences.ipynb #########

# now do sentence complexity analysis

# set parser environment vars
# A literal '~' stored in an environment variable is not expanded for whoever
# reads it later, so resolve the home directory here.
stanford_dir = os.path.expanduser('~/Downloads/stanford-parser-full-2018-10-17')
os.environ['STANFORD_PARSER'] = stanford_dir
os.environ['STANFORD_MODELS'] = stanford_dir
os.environ['CLASSPATH'] = stanford_dir + '/*'

# Build a parser client pointed at a CoreNLP server on localhost:9000.
# NOTE(review): this line only constructs the client; the server presumably
# has to be started separately before the parsing cell is run -- confirm.
parser = nltk.parse.corenlp.CoreNLPParser(url='http://localhost:9000')

In [7]:
# Accumulators for the parse-tree statistics.

# user id -> average tree height (a missing id reads as 0.0)
average_depth = defaultdict(float)

# user id -> two-element list [sum of tree heights, number of trees];
# a list rather than a tuple so both slots can be updated in place
counts = defaultdict(lambda: [0, 0])

In [None]:
# Parse every '<user id>_sentences.txt' file in the current working directory
# and store the mean parse-tree height of its sentences in `average_depth`.
sentence_file_suffix = '_sentences.txt'
data_path = glob.glob('*' + sentence_file_suffix)

for filename in data_path:

    # Strip the known suffix instead of splitting on the first "_", so a user
    # id that itself contains an underscore is kept intact.
    user = filename[:-len(sentence_file_suffix)]

    # Start this user's totals from zero so that re-running the cell (for
    # example after an interrupted run) does not count sentences twice.
    counts[user] = [0, 0]

    # The context manager closes the file even if parsing raises.
    with open(filename, "r") as curr_file:
        for line in curr_file:

            # skip lines that contain non-ASCII characters
            if any(ord(c) >= 128 for c in line):
                continue

            tokens = line.split()
            if not tokens:
                # whitespace-only line: there is nothing to parse
                continue

            # TODO: handle errors raised by the request to the parser
            parsed = parser.parse(tokens)
            # from answer by alvas on https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk
            # also here: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK

            # Use the first tree produced; if the parser yields nothing, skip
            # the sentence instead of letting StopIteration abort the cell.
            tree = next(parsed, None)
            if tree is None:
                continue
            # TODO: output parsed sentences into a file
            height = tree.height()

            # update height total
            counts[user][0] += height
            # update count
            counts[user][1] += 1

    total_height, sentence_count = counts[user]
    if sentence_count:
        average_depth[user] = total_height / sentence_count
    else:
        # No sentence was parsed for this user: dividing would raise
        # ZeroDivisionError, so leave the user out of the results entirely.
        average_depth.pop(user, None)

In [9]:
# Lookup table: user id (as a string) -> stand-in name for that user.
fake_name = {
    '10820250': 'Jack',
    '13207027': 'John',
    '14137183': 'Peter',
    '18559389': 'Anna',
    '18559680': 'Ethan',
    '19458971': 'Alex',
    '20963086': 'Kylie',
    '21235741': 'Jim',
    '21235813': 'Nathan',
    '21241679': 'Aaron',
    '21349729': 'Sarah',
    '21755924': 'Bob',
    '22207515': 'Claire',
    '22276941': 'Sally',
    '23354087': 'Mike',
    '24927292': 'Tim',
    '26498711': 'Nina',
    '26508082': 'Jacob',
    '26514517': 'Lisa',
    '26601370': 'Mitch',
    '270093': 'Alan',
    '270280': 'Tom',
    '27380802': 'Jill',
    '27546073': 'Walter',
    '28068527': 'Simon',
    '28222063': 'Annie',
    '28880161': 'Chris',
    '29652195': 'Nancy',
    '29666163': 'Michael',
    '29747722': 'Jason',
    '29750591': 'Trey',
    '29775461': 'Nicole',
    '29775466': 'Liza',
    '29818898': 'Thomas',
    '29846726': 'Isaac',
    '45033570': 'Patrick',
    '46185459': 'Zo',
    '51427894': 'Natalie',
}

In [10]:
######## results.ipynb ###########

# Print one LaTeX table row ("<name> & <average> \\") per user, ordered from
# the highest average tree height to the lowest, followed by the user count.
ranked = sorted(average_depth.items(), key=lambda k_v: k_v[1], reverse=True)

for user, average in ranked:
    # Fall back to the raw id when a user has no entry in fake_name, so one
    # unmapped id does not abort the whole table with a KeyError.
    display_name = fake_name.get(user, user)
    print("{} & {:.3f} \\\\".format(display_name, average))

print("\nlength: " + str(len(average_depth)))

Alan & 6.833 \\

length: 1
