# Comparing Vocabulary used in Control vs. Activity Video Conditions


In [1]:
import spacy
import os
import string
import pandas as pd
import numpy as np
from lexical_diversity import mtld, hdd

# Load the spaCy `en_core_web_md` pipeline once; it is reused by doc_to_df and
# passed to the scattertext corpus builders further down.
nlp = spacy.load("en_core_web_md")

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import tokenize
from statistics import mean
# NLTK WordNet lemmatizer. In the code shown here it is only referenced by the
# commented-out lemmatization step inside type_token_count.
lemmatizer = WordNetLemmatizer()

In [3]:
# Condition tables for each experiment (presumably one row per participant — confirm).
# Later cells rely on at least the 'sid' column (matched against transcript file
# names by find_id) and the 'condition' column (scattertext category).
e1cond = pd.read_csv("exp1_lexical_diversity.csv")
e2cond = pd.read_csv("exp2_lexical_diversity.csv")

In [4]:

# a function to clean text in brackets
def clean(string):
    """Return *string* with every square-bracketed span removed.

    Brackets may nest: text is dropped from an opening '[' up to and
    including its matching ']'. A ']' that closes nothing is kept as
    ordinary text, and an unclosed '[' swallows the rest of the input.
    """
    kept = []
    depth = 0  # how many '[' are currently open
    for ch in string:
        if ch == '[':
            depth += 1
            continue
        if depth == 0:
            # Outside any bracket: keep the character (this includes a stray ']').
            kept.append(ch)
        elif ch == ']':
            depth -= 1
    return ''.join(kept)

def find_id(conds, fname):
    """Return the first subject ID from ``conds["sid"]`` that occurs in *fname*.

    Parameters
    ----------
    conds : mapping with a "sid" entry (e.g. a pandas DataFrame)
        Table whose "sid" values are the candidate subject IDs.
    fname : str
        Transcript file name to search.

    Returns
    -------
    The matching sid, or the string "NA" when no sid is contained in *fname*.

    Notes
    -----
    Matching is a plain substring test, so a short ID such as "s1" also
    matches "s10.txt"; the first match in table order wins.
    """
    # Iterate over the values themselves rather than conds["sid"][i]: the
    # latter is a label lookup and fails when the index is not 0..n-1
    # (e.g. after filtering rows).
    for sid in conds["sid"]:
        # Non-string entries (e.g. NaN from a blank CSV cell) cannot match.
        if isinstance(sid, str) and sid in fname:
            return sid
    return "NA"

def type_token_count(text):
    """Count word types and word tokens in *text* after removing punctuation.

    Every character listed in ``string.punctuation`` is deleted first.
    Tokens are then counted by splitting on whitespace, while types are the
    distinct items produced by ``nltk.word_tokenize`` on the same cleaned
    text, so the two counts come from different tokenizers. Types are
    counted as-is: no lowercasing and no lemmatization is applied.

    Returns
    -------
    (type_value, token_value) : tuple of int
    """
    drop_punct = str.maketrans('', '', string.punctuation)
    # Clean element by element and glue the pieces back together; for a str
    # this is simply the text with all punctuation characters deleted.
    cleaned = ''.join(piece.translate(drop_punct) for piece in text)
    token_value = len(cleaned.split())
    type_value = len(set(nltk.word_tokenize(cleaned)))
    return type_value, token_value

# Mother utterances were formatted by hand (each utterance should sit within a single paragraph).
# "Female speaker", "*Other Child", "*OTH", "*BOTH" and "*ADF,ADM" refer to speakers other than the parent/child.
# All occurrences of "CHI 2:" should be replaced with "CHI:".
def _parent_speech(line):
    """Return the text after the first parent speaker tag in *line*, or None."""
    # Starred tags are tried before the bare ones, mother before father for
    # the starred form and father before mother for the bare form.
    for tag in ('*MOT:', '*FAT:', 'FAT:', 'MOT:'):
        pieces = line.split(tag)
        if len(pieces) > 1:
            return pieces[1]
    return None


def load_transcripts(folder, conds):
    """Compute lexical-diversity measures for every transcript in *folder*.

    For each file the parent (mother/father) lines are collected, bracketed
    annotations are stripped with ``clean`` and the result is scored. The
    scores are written into the row of *conds* whose 'sid' is found in the
    file name (see ``find_id``).

    Parameters
    ----------
    folder : str
        Directory holding the transcript files.
    conds : pandas.DataFrame
        Table with a 'sid' column. It is modified in place: the columns
        'types2', 'tokens2', 'MTLD2' and 'TTR2' are reset to NaN and then
        filled, and a 'text' column receives the cleaned parent speech.

    Returns
    -------
    pandas.DataFrame
        The same *conds* object that was passed in.
    """
    for column in ("types2", "tokens2", "MTLD2", "TTR2"):
        conds[column] = np.nan

    # '.DS_Store' is a directory-metadata file, not a transcript.
    files = [entry for entry in os.listdir(folder) if entry != '.DS_Store']

    for fname in files:
        # os.path.join works whether or not `folder` ends with a separator.
        path = os.path.join(folder, fname)
        # The context manager guarantees the handle is closed after reading.
        with open(path, 'r') as fr:
            lines = [speech for speech in map(_parent_speech, fr)
                     if speech is not None]
        lines_str = " ".join(lines).replace('\n', '').replace('\t', '')
        text_ready = clean(lines_str)

        type_value, token_value = type_token_count(text_ready)
        mtld_value = mtld(text_ready.split())

        sid = find_id(conds, fname)
        if sid == "NA":
            # Make the otherwise silent mismatch visible; nothing is stored
            # for this file because no row of conds is selected below.
            print("warning: no sid in conds matches transcript file", fname)
        row = conds['sid'] == sid
        conds.loc[row, 'text'] = text_ready
        conds.loc[row, 'types2'] = type_value
        conds.loc[row, 'tokens2'] = token_value
        conds.loc[row, 'MTLD2'] = mtld_value
        # A transcript without any parent speech has zero tokens; leave the
        # ratio undefined instead of raising ZeroDivisionError.
        conds.loc[row, 'TTR2'] = (type_value / token_value
                                  if token_value else np.nan)
    return conds

#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.is_stop)

In [5]:
# load_transcripts adds its columns to the DataFrame it is given and returns that
# same object, so e1docs / e2docs are the (now augmented) e1cond / e2cond tables.
e1docs = load_transcripts("exp1_trans_out/", e1cond)
e2docs = load_transcripts("exp2_trans_out/", e2cond)

NameError: name 'lemmatized_output' is not defined

In [29]:
# Export Experiment 1: the recomputed '*2' measures sit next to the columns of the
# same name without the suffix (presumably the earlier values read from the input CSV).
e1docs[['sid','age','AA','EL','RR','video','gender','parent_ed','condition','types','types2','tokens','tokens2','TTR','TTR2','MTLD','MTLD2']].to_csv('exp1_conditions2.csv')

In [30]:
# Export Experiment 2: the recomputed '*2' measures sit next to the columns of the
# same name without the suffix (presumably the earlier values read from the input CSV).
e2docs[['sid','age','video','gender','parent_ed','condition','types','types2','tokens','tokens2','TTR','TTR2','MTLD','MTLD2']].to_csv('exp2_conditions2.csv')

In [8]:
# for fname in files

def doc_to_df(doc):
    """Parse *doc* with the module-level ``nlp`` pipeline and tabulate its tokens.

    Returns a DataFrame with one row per token and the columns
    "text", "lemma", "POS", "explain" (``spacy.explain`` of the POS tag)
    and "stopword".
    """
    cols = ("text", "lemma", "POS", "explain", "stopword")
    rows = [
        [tok.text, tok.lemma_, tok.pos_, spacy.explain(tok.pos_), tok.is_stop]
        for tok in nlp(doc)
    ]
    return pd.DataFrame(rows, columns=cols)

In [9]:
def utterance_count_and_length(doc, verbose=True):
    """Return the number of sentences in *doc* and their mean length.

    Parameters
    ----------
    doc : object with a ``sents`` iterable and a length (e.g. a spaCy Doc)
        The parsed document.
    verbose : bool
        When true, print every sentence prefixed with ">".

    Returns
    -------
    (n_sentences, mean_length) : tuple
        ``mean_length`` is ``len(doc) / n_sentences``, or 0.0 when the
        document contains no sentences.
    """
    # `sents` may be a one-shot generator, so materialise it once and reuse
    # the list for both printing and counting.
    sentences = list(doc.sents)
    if verbose:
        for sent in sentences:
            print(">", sent)
    n_sentences = len(sentences)
    mean_length = len(doc) / n_sentences if n_sentences else 0.0
    return (n_sentences, mean_length)

In [10]:
#print(token.text, token.lemma_, token.pos_)

In [11]:
#for chunk in doc.noun_chunks:
#    print(chunk.text)

In [12]:
# named entities
#for ent in docs[fname].ents:
#    print(ent.text, ent.label_)

In [13]:
from spacy import displacy
#displacy.render(docs[fname], style="ent") # display dependencies

from collections import Counter

# iterate over docs[fname] and concatenate all

def count_tokens_nouns_verbs(doc):
    """Tally content words in *doc*, overall and for nouns and verbs.

    Stop words and punctuation tokens are ignored. Returns a tuple of
    three Counters keyed by token text: (all words, nouns, verbs), where
    nouns/verbs are the tokens whose ``pos_`` is "NOUN"/"VERB".
    """
    words, nouns, verbs = Counter(), Counter(), Counter()
    by_pos = {"NOUN": nouns, "VERB": verbs}
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        words[token.text] += 1
        tally = by_pos.get(token.pos_)
        if tally is not None:
            tally[token.text] += 1
    return (words, nouns, verbs)
    
#word_freq, noun_freq, verb_freq = count_tokens_nouns_verbs(combined_)
#noun_freq.most_common(5)

In [14]:
import scattertext as st

# should I do these merges or not? (maybe not for finding single tokens..)
#if "merge_entities" not in nlp.pipe_names:
#    nlp.add_pipe(nlp.create_pipe("merge_entities"))

#if "merge_noun_chunks" not in nlp.pipe_names:
#    nlp.add_pipe(nlp.create_pipe("merge_noun_chunks"))


# Build one scattertext corpus per experiment: documents come from the 'text'
# column filled in by load_transcripts, categories from the 'condition' column.
# NOTE(review): rows whose sid matched no transcript file never get 'text'
# assigned — confirm CorpusFromPandas copes with the resulting missing values.
e1corpus = st.CorpusFromPandas(e1docs, category_col="condition", text_col="text", nlp=nlp).build()

e2corpus = st.CorpusFromPandas(e2docs, category_col="condition", text_col="text", nlp=nlp).build()


In [15]:
from pprint import pprint

def print_indicative_terms(corpus):
    """Pretty-print the ten highest-scoring terms for each condition.

    The corpus' term-frequency table gets a scaled F-score column for the
    'exp' category and then one for the 'con' category; after each column
    is added, the ten terms with the largest value in it are printed
    (activity-video list first, control list second).
    """
    scores = corpus.get_term_freq_df()
    score_columns = (
        ('Activity Video Score', 'exp'),
        ('Control Score', 'con'),
    )
    for column, category in score_columns:
        scores[column] = corpus.get_scaled_f_scores(category)
        ranked = scores.sort_values(by=column, ascending=False)
        pprint(list(ranked.index[:10]))

In [16]:
# For each experiment, print the top activity-video ('exp') terms first and the
# top control ('con') terms second.
print("Experiment 1 indicative words for activity video vs. control conditions:")
print_indicative_terms(e1corpus)
print("Experiment 2 indicative words for activity video vs. control conditions:")
print_indicative_terms(e2corpus)

Experiment 1 indicative words for activity video vs. control conditions:
['give me',
 'me the',
 'give',
 'ribbit ribbit',
 'big',
 'small',
 'ribbit',
 'school bus',
 'the big',
 'put the']
['vroom',
 'shake shake',
 'shake',
 'what ’s',
 'ready',
 '’s this',
 'it ’s',
 'are you',
 'oh',
 'what is']
Experiment 2 indicative words for activity video vs. control conditions:
['you give',
 'big car',
 'small',
 'the little',
 'give',
 'the big',
 'you put',
 'big',
 'cow',
 'put the']
['beep',
 'tap',
 'neigh',
 'is that',
 'like',
 'for',
 'uhoh',
 'say',
 'you like',
 'does']


In [17]:
# Render the Experiment 1 scattertext explorer as an HTML string, contrasting the
# 'exp' category (labelled "Activity Video") with everything else ("No Video").
e1html = st.produce_scattertext_explorer(
    e1corpus,
    category="exp",
    category_name="Activity Video",
    not_category_name="No Video",
    width_in_pixels=1000,
    metadata=e1docs["sid"] # open question: is sid the right per-document metadata, or another variable (age?)
)

In [18]:
# Render the Experiment 2 scattertext explorer as an HTML string, contrasting the
# 'exp' category (labelled "Activity Video") with everything else ("Science Video").
e2html = st.produce_scattertext_explorer(
    e2corpus,
    category="exp",
    category_name="Activity Video",
    not_category_name="Science Video",
    width_in_pixels=1000,
    metadata=e2docs["sid"]
) 

In [19]:
from IPython.display import IFrame

# Save the Experiment 1 explorer to disk, then embed the saved page in the notebook.
file_name = "Exp1_relative_frequency.html"

# Written as UTF-8 bytes so the page encoding does not depend on the platform default.
with open(file_name, "wb") as f:
    f.write(e1html.encode("utf-8"))

# Must stay the last expression of the cell so the notebook displays it.
IFrame(src=file_name, width = 1200, height=700)

In [20]:
from IPython.display import IFrame

# Save the Experiment 2 explorer to disk, then embed the saved page in the notebook.
file_name = "Exp2_relative_frequency.html"

# Written as UTF-8 bytes so the page encoding does not depend on the platform default.
with open(file_name, "wb") as f:
    f.write(e2html.encode("utf-8"))

# Must stay the last expression of the cell so the notebook displays it.
IFrame(src=file_name, width = 1200, height=700)

In [21]:
# Characteristic-terms chart for Experiment 1, built from a unigram-only copy of
# the corpus that has been compacted with ClassPercentageCompactor
# (term_count=2, ranked by once-per-document frequency).
corpus = (e1corpus
          .get_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=2,
                                               term_ranker=st.OncePerDocFrequencyRanker)))
html = st.produce_characteristic_explorer(
    corpus,
    category='exp',
    category_name='Activity Video',
    not_category_name='No Video',
    metadata=corpus.get_df()['sid']
)
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of the anonymous file object.
with open('Exp1_characteristic_chart.html', 'wb') as f:
    f.write(html.encode('utf-8'))

450900

In [22]:
# Characteristic-terms chart for Experiment 2, built from a unigram-only copy of
# the corpus that has been compacted with ClassPercentageCompactor
# (term_count=2, ranked by once-per-document frequency).
corpus = (e2corpus
          .get_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=2,
                                               term_ranker=st.OncePerDocFrequencyRanker)))
html = st.produce_characteristic_explorer(
    corpus,
    category='exp',
    category_name='Activity Video',
    not_category_name='Science Video',
    metadata=corpus.get_df()['sid']
)
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of the anonymous file object.
with open('Exp2_characteristic_chart.html', 'wb') as f:
    f.write(html.encode('utf-8'))

514971