In [111]:
import os
import nltk
from nltk.util import ngrams
import pickle
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain
from scipy.stats import ttest_rel
import pandas as pd

from tools.corpus_reader import CHILDESCorpusReader
from tools.functions import nansem, turns, turn_pairs
from tools.measures import dlg_mul, dlg_mwl, shared_ngrams, diff_matrix, \
conv_matrix, global_rate, local_rate

np.random.seed(42)

# read Brown corpus
nltk.data.path.append(os.getcwd() + '/data/nltk_data/')
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA')
brown = CHILDESCorpusReader(corpus_root, 'Brown/.*.xml')

# dialogue files for Adam and Sarah
adam_dlgs = [fileID for fileID in brown.fileids() if 'Adam' in fileID]
sarah_dlgs = [fileID for fileID in brown.fileids() if 'Sarah' in fileID]

In [331]:
adam_measures = pickle.load(open('data/pickles/adam-all-measures-v3.pkl', 'rb'))
adam_sem_RR = pickle.load(open('adam_RRs.pkl', 'rb'))
adam_sem_RR_shf = pickle.load(open('adam_RRs_shf.pkl', 'rb'))
sarah_measures = pickle.load(open('data/pickles/sarah-all-measures-v3.pkl', 'rb'))
sarah_sem_RR = pickle.load(open('sarah_RRs.pkl', 'rb'))
sarah_sem_RR_shf = pickle.load(open('sarah_RRs_shf.pkl', 'rb'))

In [113]:
adam_measures['PosBi'].keys()

dict_keys(['shuffled', 'RR2pos', 'shLocRR', 'LocRR', 'GlobRR', 'RR2neg', 'matrix'])

In [114]:
adam_measures.keys()

dict_keys(['PosBi', 'LexUni', 'LexBi', 'MWL', 'MUL'])

In [211]:
adam_sem_RR

{27.13: {0: 0.5574871449413912,
  1: 0.5103275867651648,
  2: 0.48444471173322523,
  3: 0.47332230226632277,
  4: 0.46563747188672344,
  5: 0.45926327921171073,
  6: 0.4549225107785367,
  7: 0.45170511955479115,
  8: 0.44874022622009857,
  9: 0.4468193479438182,
  10: 0.44513316606946013},
 27.59: {0: 0.5612635340196667,
  1: 0.525032156748566,
  2: 0.49713821414306364,
  3: 0.4822173615441892,
  4: 0.4731964041790174,
  5: 0.46632319496721564,
  6: 0.46138285395115275,
  7: 0.4574116053748174,
  8: 0.45366062116138917,
  9: 0.4505517373755739,
  10: 0.44794798153502335},
 28.1: {0: 0.5434047200404812,
  1: 0.5187131876125932,
  2: 0.48979703537946273,
  3: 0.4734248051052842,
  4: 0.46103339567446283,
  5: 0.4525675160069988,
  6: 0.4471871478844629,
  7: 0.44184688635939756,
  8: 0.43639626922553054,
  9: 0.4326880437336484,
  10: 0.4293996376419046},
 28.49: {0: 0.5658634455084508,
  1: 0.5325034676271703,
  2: 0.50573718638709,
  3: 0.48974305933309464,
  4: 0.47980888727980714,
  

In [263]:
sarah_sem_RR[27.16][2] - sarah_sem_RR_shf[27.16][2]

0.018399089650454226

In [344]:
adam_measures.keys()

dict_keys(['PosBi', 'LexUni', 'LexBi', 'MWL', 'MUL', 'MWT', 'MCT', 'MTL', 'MUW'])

In [349]:
from scipy import stats

def dict2csv(child, child_dict, sem_dict, sem_dict_shf, d):
    result = {}
    for measure in child_dict.keys():
        if measure != 'ages':
            if measure in ['MWL', 'MCT', 'MTL']:
                continue
                
            if measure in ['MWT', 'MUW', 'MUL']:
                
                print(measure)
            
            # Complexity metrics
#             if measure in ['MWL', 'MUL', 'MWT', 'MCT', 'MTL', 'MUW']:
            
                
                # Child complexity
                measure_dict = child_dict[measure]['CHI']
                complexity_measure = np.asarray([x[0] for x in measure_dict])
                complexity_measure = stats.zscore(complexity_measure)
#                 measure_name = measure + '_CHI'
                if 'CHI_CP' in result.keys():
                    result['CHI_CP'] += complexity_measure
                else:
                    result['CHI_CP'] = complexity_measure
                
                # Adult complexity
                measure_dict = child_dict[measure]['ADT']
                complexity_measure = np.asarray([x[0] for x in measure_dict])
                complexity_measure = stats.zscore(complexity_measure)
#                 measure_name = measure + '_ADT'
                if 'ADT_CP' in result.keys():
                    result['ADT_CP'] += complexity_measure
                else:
                    result['ADT_CP'] = complexity_measure

            # Recurrence Rates
            else:
                measure_dict = child_dict[measure]
                LocRR = measure_dict['LocRR'][d]
                shLocRR = measure_dict['shLocRR'][d]
                difference = np.asarray([x-y for x,y in zip(LocRR, shLocRR)])
                difference = stats.zscore(difference)
                result[measure] = difference
    
    # Semantic Recurrence
    LocRR = []
    shLocRR = []
    ages = []
    for age in sorted(sem_dict.keys()):
        ages.append(age)
        LocRR.append(sem_dict[age][d])
    for age in sorted(sem_dict_shf.keys()):
        shLocRR.append(sem_dict_shf[age][d])
    difference = np.asarray([x-y for x,y in zip(LocRR, shLocRR)])
#     difference = np.asarray([x for x in shLocRR])
    difference = stats.zscore(difference)
    result['Semantic'] = difference
    result['Age'] = ages
    
    # Save as .csv
    df = pd.DataFrame.from_dict(result)
    filename = 'data/pickles/{}-measures-with-d-{}.csv'.format(child, d)
    df.to_csv(path_or_buf=filename,index=False)
    

# Generate .csv files for different values of d
for d in range(0,10):
    adam_csv = dict2csv(child='adam', child_dict=adam_measures, sem_dict=adam_sem_RR, sem_dict_shf=adam_sem_RR_shf, d=d)
    sarah_csv = dict2csv(child='sarah', child_dict=sarah_measures, sem_dict=sarah_sem_RR, sem_dict_shf=sarah_sem_RR_shf, d=d)


MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW
MUL
MWT
MUW


In [348]:
df = pd.read_csv(filepath_or_buffer='data/pickles/adam-measures-with-d-2.csv')
# df = df[['Age', 'PosBi', 'LexUni', 'LexBi', 'MWL', 'MUL', 'Semantic',]]
# df = df [['Age', 'Semantic', 'PosBi', 'LexUni', 'LexBi', 'ADT_CP', 'CHI_CP']]
df = df[['PosBi', 'LexUni', 'LexBi', 'CHI_CP', 'ADT_CP', 'Semantic', 'Age']]
df

Unnamed: 0,PosBi,LexUni,LexBi,CHI_CP,ADT_CP,Semantic,Age
0,-0.453835,0.996836,0.073726,-5.353068,-2.753983,0.348852,27.13
1,0.435674,2.961933,-1.455398,-4.966839,-2.669624,2.025746,27.59
2,1.643331,2.935712,0.732876,-3.973941,3.511968,2.758193,28.1
3,-1.476153,0.29021,-1.643198,-5.708526,-2.266379,1.215631,28.49
4,1.146419,1.519873,-0.853038,-4.529027,-1.328198,2.322423,28.99
5,0.79735,1.137541,1.339858,-4.946786,-2.997372,0.813892,29.39
6,1.273368,1.59367,2.470486,-3.385214,-4.829692,2.101941,30.1
7,-0.12159,0.942183,0.354429,-2.206566,1.080973,1.012735,30.56
8,1.34946,1.448896,0.925696,-3.505647,-3.603724,1.804688,31.03
9,0.194892,0.353434,0.775587,-3.587433,-3.549666,0.208278,31.46


In [163]:
adam_all_measures['MTL']['CHI']

[(3.536312849162011, 0.1995013432767657),
 (2.3740740740740742, 0.09476049219728296),
 (2.469914040114613, 0.11385617112277271),
 (2.193820224719101, 0.10357615791013726),
 (2.2698412698412698, 0.10315683575398157),
 (2.7035830618892507, 0.16622038637664693),
 (2.116504854368932, 0.08268145250342135),
 (2.203333333333333, 0.1005869809704056),
 (2.0658823529411765, 0.08458014504770886),
 (2.2900943396226414, 0.11884823128427749),
 (2.2004950495049505, 0.10165886788785948),
 (1.8525798525798525, 0.06928375716068826),
 (1.6201780415430267, 0.062482660685418),
 (1.7597597597597598, 0.08084610489248646),
 (2.2916666666666665, 0.13207770898702886),
 (1.9819277108433735, 0.08645635961626202),
 (1.799163179916318, 0.07127476180688973),
 (1.9384920634920635, 0.06487498315971485),
 (1.8480565371024735, 0.06203735640304152),
 (2.271948608137045, 0.12416861757099265),
 (3.2244897959183674, 0.2207266446793476),
 (3.1167192429022084, 0.2636039118466933),
 (1.859922178988327, 0.06522340961240716),
 (

In [131]:
adam_all_measures = pickle.load(open('data/pickles/adam-all-measures.pkl', 'rb'))

In [137]:
adam_all_measures['PosBi'].keys()

dict_keys(['shuffled', 'RR2pos', 'shLocRR', 'LocRR', 'GlobRR', 'RR2neg', 'matrix'])

In [33]:
b = [1,2,3,4,5]
c = [3,4,5,6,7]
b - c

TypeError: unsupported operand type(s) for -: 'list' and 'list'