In [8]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from nltk.tokenize import RegexpTokenizer
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the concreteness / imagery lexicon.
# Each non-blank line is parsed as "<word>\t<concreteness>\t<imagery>".
data_path = 'data/megahr_ci_values.txt'
ci_table = pd.DataFrame()
words = []
concrete_vals = []
img_vals = []
with open(data_path) as f:
    lines = f.readlines()

for line in lines:
    # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
    if not line.strip():
        continue
    line_data = line.split("\t")
    words.append(line_data[0])
    concrete_vals.append(line_data[1])
    # Strip surrounding whitespace rather than slicing off a fixed two
    # characters: in text mode the line ending is a single "\n", so a fixed
    # [:-2] slice also removes the last digit of the value (and two digits on
    # a final line that has no trailing newline).
    img_vals.append(line_data[2].strip())

In [5]:
# Fill the lexicon table from the parsed lists; the two score columns are
# converted from their string form to floats as they are stored.
ci_table['word'] = words
ci_table['concreteness'] = [float(raw_value) for raw_value in concrete_vals]
ci_table['imagery'] = [float(raw_value) for raw_value in img_vals]
ci_table

Unnamed: 0,word,concreteness,imagery
0,the,3.031515,3.535658
1,of,2.571804,3.234478
2,in,3.426647,3.808854
3,and,3.163961,3.638683
4,to,3.087257,3.622661
...,...,...,...
99995,arilang,3.606587,3.878786
99996,suzerain,3.010527,3.510428
99997,wickramasinghe,3.587131,3.870119
99998,westerburg,4.032678,4.412366


In [7]:
# Value ranges of the lexicon, displayed as
# (max concreteness, min concreteness, max imagery, min imagery).
tuple(
    extreme(ci_table[column])
    for column in ('concreteness', 'imagery')
    for extreme in (max, min)
)


(5.35358372947499, 0.8711022232441197, 5.261935844109423, 1.779995109692767)

In [23]:
def _clean_text(text):
        encoded_text = text.encode("ascii", "ignore")
        text = encoded_text.decode()
        text = text.replace('…', ' ')
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
        text = re.sub(' +', ' ', text)
        text = re.sub('’', '', text)
        text = re.sub(r'\n+', '\n', text).strip()
        text = text.lower()
        return text

# Shared tokenizer: every maximal run of word characters (\w+) is one token.
tokenizer = RegexpTokenizer(r'\w+')

def find_words(text):
    """Split ``text`` into its word tokens using the module-level tokenizer."""
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_stu_tea(text):
    """Delete every 'teacher: ' and 'student: ' speaker label from ``text``.

    The match is case-sensitive and includes the trailing space of the label.
    """
    for speaker_label in ('teacher: ', 'student: '):
        text = text.replace(speaker_label, '')
    return text

def get_avg_value(word_list, concreteness_imagery, lookup=None):
    """Return the mean lexicon score of the words in ``word_list``.

    Every token counts towards the denominator. Tokens that are not in the
    lexicon, and the tokens 'teacher' and 'student', contribute a score of 0,
    so they pull the mean down rather than being ignored.

    Args:
        word_list: sequence of word tokens.
        concreteness_imagery: name of the ``ci_table`` column to average
            (the notebook uses 'concreteness' and 'imagery').
        lookup: optional mapping ``word -> score``. When omitted, the mapping
            is built from the global ``ci_table`` for this call. Passing a
            prebuilt mapping avoids rebuilding it on every call.

    Returns:
        ``sum(scores) / len(word_list)``, or ``0`` when ``word_list`` is empty.
    """
    if len(word_list) < 1:
        return 0

    if lookup is None:
        # One pass over the table gives O(1) lookups per word, instead of
        # scanning the whole table twice for every word. Keeping the first
        # row per word matches the earlier `.values[0]` selection.
        first_rows = ci_table.drop_duplicates(subset='word')
        lookup = dict(zip(first_rows['word'], first_rows[concreteness_imagery]))

    avg_sum = 0
    for word in word_list:
        # Speaker placeholders are always scored 0, even if the lexicon has them.
        if word in ('teacher', 'student'):
            continue
        avg_sum += lookup.get(word, 0)

    return avg_sum / len(word_list)


In [21]:
# Load the conversation turns and derive the token columns in one chain:
#   proc_text      - speaker labels removed, then cleaned
#   conv_words     - tokens of the cleaned text
#   tot_conv_words - tokens of the cleaned text after a second cleaning pass
#   conv_len       - number of tokens in conv_words
conv_data = (
    pd.read_excel('data/conversations_for_llama_6656.xlsx')
    .assign(proc_text=lambda frame: frame['proc_text'].apply(remove_stu_tea).apply(_clean_text))
    .assign(
        conv_words=lambda frame: frame['proc_text'].apply(find_words),
        tot_conv_words=lambda frame: frame['proc_text'].apply(_clean_text).apply(find_words),
    )
    .assign(conv_len=lambda frame: frame['conv_words'].apply(len))
)
conv_data

Unnamed: 0,proc_text,conversation_id,id,int_mean,int_var,conv_words,tot_conv_words,conv_len
0,"hi <teacher>, how are you? yeah i'm good thank...",7,53285,3.000000,1.000000,"[hi, teacher, how, are, you, yeah, i, m, good,...","[hi, teacher, how, are, you, yeah, i, m, good,...",27
1,yeah well my daughter started doing it and has...,7,45947,1.666667,0.333333,"[yeah, well, my, daughter, started, doing, it,...","[yeah, well, my, daughter, started, doing, it,...",49
2,yeah? in the uk too i'd say....there are lots ...,7,45955,1.666667,0.333333,"[yeah, in, the, uk, too, i, d, say, there, are...","[yeah, in, the, uk, too, i, d, say, there, are...",59
3,yeah? what did you do? i'll come back on the h...,7,45941,2.000000,0.000000,"[yeah, what, did, you, do, i, ll, come, back, ...","[yeah, what, did, you, do, i, ll, come, back, ...",114
4,great but not 'as a team' just for yourself! h...,7,53297,2.000000,4.000000,"[great, but, not, as, a, team, just, for, your...","[great, but, not, as, a, team, just, for, your...",21
...,...,...,...,...,...,...,...,...
6651,"but then, if you have some exercises or intere...",258,38329,2.333333,1.333333,"[but, then, if, you, have, some, exercises, or...","[but, then, if, you, have, some, exercises, or...",101
6652,my guess is that in italian 'borrow' and 'lend...,258,56381,1.666667,1.333333,"[my, guess, is, that, in, italian, borrow, and...","[my, guess, is, that, in, italian, borrow, and...",67
6653,nearly! had ___ up in another time they are in...,258,56382,1.666667,1.333333,"[nearly, had, ___, up, in, another, time, they...","[nearly, had, ___, up, in, another, time, they...",34
6654,maybe i should start by writing a topic senten...,258,38323,2.333333,2.333333,"[maybe, i, should, start, by, writing, a, topi...","[maybe, i, should, start, by, writing, a, topi...",72


In [27]:
# Keep one row per distinct cleaned text so each text is scored only once.
# The explicit .copy() makes this an independent frame: later cells add
# columns to it, and assigning into the raw result of drop_duplicates()
# triggers pandas' SettingWithCopyWarning.
conv_data_nodup = conv_data.drop_duplicates(subset='proc_text').copy()
conv_data_nodup

Unnamed: 0,proc_text,conversation_id,id,int_mean,int_var,conv_words,tot_conv_words,conv_len
0,"hi <teacher>, how are you? yeah i'm good thank...",7,53285,3.000000,1.000000,"[hi, teacher, how, are, you, yeah, i, m, good,...","[hi, teacher, how, are, you, yeah, i, m, good,...",27
1,yeah well my daughter started doing it and has...,7,45947,1.666667,0.333333,"[yeah, well, my, daughter, started, doing, it,...","[yeah, well, my, daughter, started, doing, it,...",49
2,yeah? in the uk too i'd say....there are lots ...,7,45955,1.666667,0.333333,"[yeah, in, the, uk, too, i, d, say, there, are...","[yeah, in, the, uk, too, i, d, say, there, are...",59
3,yeah? what did you do? i'll come back on the h...,7,45941,2.000000,0.000000,"[yeah, what, did, you, do, i, ll, come, back, ...","[yeah, what, did, you, do, i, ll, come, back, ...",114
4,great but not 'as a team' just for yourself! h...,7,53297,2.000000,4.000000,"[great, but, not, as, a, team, just, for, your...","[great, but, not, as, a, team, just, for, your...",21
...,...,...,...,...,...,...,...,...
6651,"but then, if you have some exercises or intere...",258,38329,2.333333,1.333333,"[but, then, if, you, have, some, exercises, or...","[but, then, if, you, have, some, exercises, or...",101
6652,my guess is that in italian 'borrow' and 'lend...,258,56381,1.666667,1.333333,"[my, guess, is, that, in, italian, borrow, and...","[my, guess, is, that, in, italian, borrow, and...",67
6653,nearly! had ___ up in another time they are in...,258,56382,1.666667,1.333333,"[nearly, had, ___, up, in, another, time, they...","[nearly, had, ___, up, in, another, time, they...",34
6654,maybe i should start by writing a topic senten...,258,38323,2.333333,2.333333,"[maybe, i, should, start, by, writing, a, topi...","[maybe, i, should, start, by, writing, a, topi...",72


In [29]:
# Mean concreteness score of each de-duplicated turn's tokens.
conv_data_nodup["concreteness_tot_conv"] = conv_data_nodup['tot_conv_words'].apply(
    get_avg_value, args=('concreteness',)
)

In [30]:
# Mean imagery score of each de-duplicated turn's tokens.
conv_data_nodup["imagery_tot_conv"] = conv_data_nodup['tot_conv_words'].apply(
    get_avg_value, args=('imagery',)
)

In [35]:
# Attach the scores computed on the de-duplicated frame to every row.
# The join key is 'proc_text' because that is the column the frame was
# de-duplicated on: joining on ('id', 'conversation_id') leaves NaN scores on
# any row whose text duplicates another row that has a different id.
# Existing score columns are dropped first so re-running this cell does not
# produce '_x' / '_y' suffixed duplicates.
conv_data = conv_data.drop(
    columns=['concreteness_tot_conv', 'imagery_tot_conv'], errors='ignore'
).merge(
    conv_data_nodup[['proc_text', 'concreteness_tot_conv', 'imagery_tot_conv']],
    on='proc_text',
    how='left',
    validate='many_to_one',  # each text must map to exactly one score row
)
conv_data

Unnamed: 0,proc_text,conversation_id,id,int_mean,int_var,conv_words,tot_conv_words,conv_len,concreteness_tot_conv,imagery_tot_conv
0,"hi <teacher>, how are you? yeah i'm good thank...",7,53285,3.000000,1.000000,"[hi, teacher, how, are, you, yeah, i, m, good,...","[hi, teacher, how, are, you, yeah, i, m, good,...",27,2.221969,2.572610
1,yeah well my daughter started doing it and has...,7,45947,1.666667,0.333333,"[yeah, well, my, daughter, started, doing, it,...","[yeah, well, my, daughter, started, doing, it,...",49,2.688817,3.050364
2,yeah? in the uk too i'd say....there are lots ...,7,45955,1.666667,0.333333,"[yeah, in, the, uk, too, i, d, say, there, are...","[yeah, in, the, uk, too, i, d, say, there, are...",59,2.708010,3.030274
3,yeah? what did you do? i'll come back on the h...,7,45941,2.000000,0.000000,"[yeah, what, did, you, do, i, ll, come, back, ...","[yeah, what, did, you, do, i, ll, come, back, ...",114,2.585220,2.953317
4,great but not 'as a team' just for yourself! h...,7,53297,2.000000,4.000000,"[great, but, not, as, a, team, just, for, your...","[great, but, not, as, a, team, just, for, your...",21,2.627355,3.090339
...,...,...,...,...,...,...,...,...,...,...
6651,"but then, if you have some exercises or intere...",258,38329,2.333333,1.333333,"[but, then, if, you, have, some, exercises, or...","[but, then, if, you, have, some, exercises, or...",101,2.775614,3.159811
6652,my guess is that in italian 'borrow' and 'lend...,258,56381,1.666667,1.333333,"[my, guess, is, that, in, italian, borrow, and...","[my, guess, is, that, in, italian, borrow, and...",67,3.106379,3.504408
6653,nearly! had ___ up in another time they are in...,258,56382,1.666667,1.333333,"[nearly, had, ___, up, in, another, time, they...","[nearly, had, ___, up, in, another, time, they...",34,2.855302,3.253492
6654,maybe i should start by writing a topic senten...,258,38323,2.333333,2.333333,"[maybe, i, should, start, by, writing, a, topi...","[maybe, i, should, start, by, writing, a, topi...",72,2.445217,2.788987


In [37]:
# Write the scored conversations to disk, without the DataFrame index column.
conv_data.to_excel(
    excel_writer='data/concreteness_imagery_computed.xlsx',
    index=False,
)