In [1]:
import numpy as np
import pandas as pd
import json
import string

In [2]:
london_letter = '''Our  London  business  is  good,  but  Vienna  and  Berlin  are  quiet.   Mr.   D.  Lloyd has gone to Switzerland and I hope for good news.  He will be there for a week at 1496 Zermott Street and then goes to Turin and Rome and will join Colonel Parry and arrive at Athens, Greece, November 27 or December 2.  Letters there should be addressed King James Blvd.  3580.  We expect Charles E. Fuller Tuesday.  Dr.  L. McQuaid and Robert Unger, Esq., left on the ’Y. X.’ Express tonight.'''
london_letter = london_letter.replace('  ',' ')

woz = '''Within a short time she was walking briskly toward the Emerald City, her silver shoes tinkling merrily on the hard, yellow roadbed. The sun shone bright and the birds sang sweet and Dorothy did not feel nearly as bad as you might think a little girl would who had been suddenly whisked away from her own country and set downin the midst of a strange land.'''
woz = woz.replace('  ',' ')

phrase = '''The early bird may get the worm, but the second mouse gets the cheese.'''

In [3]:
def create_word_count_df(text: str):
    clean_words = [i.lower().strip(string.punctuation+'’') for i in text.split(' ')]
    word_df = pd.DataFrame({'word':np.unique(clean_words)})
    word_df['count'] = word_df.word.apply(clean_words.count)
    return word_df

ll_df = create_word_count_df(london_letter)
woz_df = create_word_count_df(woz)
phr_df = create_word_count_df(phrase)
phr_df

Unnamed: 0,word,count
0,bird,1
1,but,1
2,cheese,1
3,early,1
4,get,1
5,gets,1
6,may,1
7,mouse,1
8,second,1
9,the,4


In [4]:
# Defining function to calculate our accuracy score.

def compare_words(results,ref='london'):
    global ll_df, woz_df, phr_df
    valid_refs = {'london','woz','phrase'}
    if ref not in valid_refs:
        raise ValueError(f"results: status must be one of {valid_refs}")
    
    df_dict = {'london':ll_df,'woz':woz_df,'phrase':phr_df}
    
    df = pd.DataFrame({'word':np.unique(results)})
    df['count'] = df.word.apply(results.count)
    df = pd.merge(df_dict[ref],df,on='word',how='left')
    df = df.fillna(0)
    df['difference'] = abs(df.count_x - df.count_y)
    pct_correct = 1-df.difference.sum()/df.count_x.sum()
    return pct_correct

# Defining a helper function to allow us to pull which text is to be analyzed from file names.

def match_texts(name):
    name = name.split('_')[2][1:]
    texts = ['london','woz','phrase']
    if name=='LND':
        return texts[0]
    elif name=='WOZ':
        return texts[1]
    elif name=='PHR':
        return texts[2]

In [5]:
with open('responses1.json') as file1:
    d1 = json.load(file1)

with open('responses2.json') as file2:
    d2 = json.load(file2)

In [6]:
# Function to convert our JSON of responses to a usable dataframe.

def responses_to_df(resp: dict):
    df = pd.DataFrame({'id':[],'score':[],'avg_conf':[],'text':[]})
    for i,j in resp.items():
        confs = [k['Confidence'] for k in j.get('Blocks') if k['BlockType']=='WORD']
        words = [k['Text'].lower().strip(string.punctuation+'’') for k in j.get('Blocks') if k['BlockType']=='WORD']
        next_row = pd.DataFrame({'id':[i],'score':[compare_words(words,match_texts(i))],'avg_conf':[np.mean(confs)],'text':[match_texts(i)]})
        df = df.append(next_row)
    df['wid'] = df.id.apply(lambda x: int(x.split('_')[0][1:]))
    return df

In [7]:
df1 = responses_to_df(d1)
df2 = responses_to_df(d2)
df2.head()

Unnamed: 0,id,score,avg_conf,text,wid
0,w0129_s02_pLND_r02,0.806818,93.23022,london,129
0,w0129_s02_pLND_r03,0.875,94.294199,london,129
0,w0129_s02_pPHR_r01,1.0,97.931404,phrase,129
0,w0129_s02_pPHR_r02,1.0,93.623837,phrase,129
0,w0129_s02_pPHR_r03,1.0,98.466582,phrase,129


In [8]:
# Merge two response dataframes into one.
resp_df = df1.append(df2)
resp_df.head()

Unnamed: 0,id,score,avg_conf,text,wid
0,w0001_s01_pLND_r01,0.875,93.336228,london,1
0,w0001_s01_pLND_r02,0.886364,92.341743,london,1
0,w0001_s01_pLND_r03,0.863636,92.309366,london,1
0,w0001_s01_pPHR_r01,0.928571,97.852685,phrase,1
0,w0001_s01_pPHR_r02,0.928571,94.600888,phrase,1


In [9]:
# Importing participant dataset, given by CSAFE database.
participant_df = pd.read_csv('participant_data.csv')

In [10]:
# Further cleaning measures on participant data for usability.
participant_df['gender'] = participant_df.gender.apply(lambda x: x.lower())
participant_df['hand'] = participant_df.hand.apply(lambda x: x.lower())
participant_df = participant_df[participant_df.s1_time.notna()]
participant_df = participant_df[participant_df.s2_time.notna()]
participant_df = participant_df[participant_df.s3_time.notna()]
participant_df['s1_time'] = participant_df.s1_time.apply(lambda x: x.replace(' ','').lower())
participant_df['s2_time'] = participant_df.s2_time.apply(lambda x: x.replace(' ','').lower())
participant_df['s3_time'] = participant_df.s3_time.apply(lambda x: x.replace(' ','').lower())
participant_df.head()

Unnamed: 0.1,Unnamed: 0,wid,agegroup,gender,hand,thirdgrade_usa,thirdgrade_usa_region,s1_dae,s1_time,s2_dae,s2_time,s3_dae,s3_time
0,0,5,18-24,male,right,True,midwest,0,latemorning,21,earlyafternoon,42,latemorning
1,1,133,25-40,female,right,True,west,0,latemorning,31,earlyafternoon,77,earlymorning
2,2,62,41-60,female,right,True,midwest,0,lateevening,54,latemorning,98,earlyevening
3,3,87,41-60,female,right,True,midwest,0,earlyafternoon,27,latemorning,50,earlymorning
4,4,261,25-40,male,right,True,Midwest,0,earlymorning,27,earlyevening,62,earlyafternoon


In [11]:
# Merging our textract response data with the participant data.
out_df = pd.merge(resp_df,participant_df,on='wid',how='left')
out_df.head()

Unnamed: 0.1,id,score,avg_conf,text,wid,Unnamed: 0,agegroup,gender,hand,thirdgrade_usa,thirdgrade_usa_region,s1_dae,s1_time,s2_dae,s2_time,s3_dae,s3_time
0,w0001_s01_pLND_r01,0.875,93.336228,london,1,67.0,25-40,female,right,True,midwest,0.0,earlyafternoon,26.0,latemorning,47.0,earlyafternoon
1,w0001_s01_pLND_r02,0.886364,92.341743,london,1,67.0,25-40,female,right,True,midwest,0.0,earlyafternoon,26.0,latemorning,47.0,earlyafternoon
2,w0001_s01_pLND_r03,0.863636,92.309366,london,1,67.0,25-40,female,right,True,midwest,0.0,earlyafternoon,26.0,latemorning,47.0,earlyafternoon
3,w0001_s01_pPHR_r01,0.928571,97.852685,phrase,1,67.0,25-40,female,right,True,midwest,0.0,earlyafternoon,26.0,latemorning,47.0,earlyafternoon
4,w0001_s01_pPHR_r02,0.928571,94.600888,phrase,1,67.0,25-40,female,right,True,midwest,0.0,earlyafternoon,26.0,latemorning,47.0,earlyafternoon


In [12]:
# Removing a mislabeled and unusable participant entry and a useless column.
out_df = out_df[out_df.wid!=203]
out_df = out_df.drop(['Unnamed: 0'],axis=1)

out_df.to_csv('textract_dataframe.csv')