In [0]:
import pandas as pd
import numpy as np
# NOTE: Quality control and aggregation are done concurrently in the code. Quality control and aggregation sections will be labeled with comments.

In [0]:
# Quality Control and Aggregation Part 1 Spanish
def spanish_urls_1(spanish_pass_1, spanish_pass_1_qual):
    """Split pass-1 recordings by whether the worker aced the qualification test.

    Parameters
    ----------
    spanish_pass_1 : pandas.DataFrame
        Pass-1 results. Each row is read for ``WorkerId`` plus ten
        ``Input.text<i>`` / ``Answer.audioRecording<i>`` column pairs (i = 1..10).
    spanish_pass_1_qual : pandas.DataFrame
        Qualification results. Each row is read for ``WorkerId``, the four
        graded answer columns and ``Answer.question1`` .. ``Answer.question4``.

    Returns
    -------
    list
        ``[good_df, bad_df]``. Both frames have one row per (worker, word) and the
        columns WorkerId, Language, Word, URL, Gender, Native Language,
        Years Spoken, Number of Languages Spoken, Pronunciations. ``good_df``
        holds workers who answered all four graded questions as expected,
        ``bad_df`` holds the other qualified workers. Workers missing from the
        qualification results are left out of both frames.
    """
    # Graded qualification questions paired with the expected answer value.
    answer_key = (
        ('Answer.1. Era probable que él lo __________.', 4),
        ('Answer.2. Todas mis amigas __________ a la fiesta por Marcos.', 1),
        ('Answer.3. Es una broma: yo te estoy tomando ___________.', 1),
        ('Answer.4. No entiendo la tarea así que le _________ una pregunta al profesor.', 3),
    )
    demographic_fields = ('Answer.question1', 'Answer.question2', 'Answer.question3', 'Answer.question4')
    output_columns = ('WorkerId', 'Language', 'Word', 'URL', 'Gender', 'Native Language', 'Years Spoken', 'Number of Languages Spoken', 'Pronunciations')

    # Quality control: file every qualification row under "passed" or "failed".
    # A later row for the same worker overwrites the earlier entry in that bucket.
    passed, failed = {}, {}
    for _, qual_row in spanish_pass_1_qual.iterrows():
        all_correct = all(qual_row[question] == expected for question, expected in answer_key)
        bucket = passed if all_correct else failed
        bucket[qual_row['WorkerId']] = [qual_row[field] for field in demographic_fields]

    # Aggregation: one record per recorded word, tagged with the worker's
    # demographics and an empty pronunciations tuple that pass 3 fills in.
    passed_records, failed_records = [], []
    for _, hit in spanish_pass_1.iterrows():
        worker = hit['WorkerId']
        if worker in passed:
            info, sink = passed.get(worker), passed_records
        elif worker in failed:
            info, sink = failed.get(worker), failed_records
        else:
            # Worker never took the qualification test: skip the whole HIT.
            continue
        for n in range(1, 11):
            sink.append((worker, "Spanish", hit['Input.text' + str(n)], hit['Answer.audioRecording' + str(n)], info[0], info[1], info[2], info[3], ()))

    return [
        pd.DataFrame(passed_records, columns=output_columns),
        pd.DataFrame(failed_records, columns=output_columns),
    ]
  

In [35]:
# Load the raw pass-1 results and the qualification-test results.
spanish_pass_1 = pd.read_csv('Pass_1_Span Results.csv')
spanish_pass_1_qual = pd.read_csv('Span Qual Results.csv')

# Quality control + aggregation, part 1: split recordings by qualification outcome.
dfs = spanish_urls_1(spanish_pass_1, spanish_pass_1_qual)

# Discard every row that has a missing value in any column.
good_spanish = dfs[0].dropna(axis=0, how='any')
bad_spanish = dfs[1].dropna(axis=0, how='any')

print(bad_spanish)

           WorkerId Language           Word  \
0     AAFAR49RN5TS7  Spanish             la   
1     AAFAR49RN5TS7  Spanish            dos   
2     AAFAR49RN5TS7  Spanish            esa   
3     AAFAR49RN5TS7  Spanish           nada   
4     AAFAR49RN5TS7  Spanish           unos   
5     AAFAR49RN5TS7  Spanish  internacional   
6     AAFAR49RN5TS7  Spanish        pública   
7     AAFAR49RN5TS7  Spanish        segunda   
8     AAFAR49RN5TS7  Spanish      encuentro   
9     AAFAR49RN5TS7  Spanish         acceso   
10    AAFAR49RN5TS7  Spanish              y   
11    AAFAR49RN5TS7  Spanish          parte   
12    AAFAR49RN5TS7  Spanish          mundo   
13    AAFAR49RN5TS7  Spanish         pueden   
14    AAFAR49RN5TS7  Spanish          estar   
15    AAFAR49RN5TS7  Spanish          niños   
16    AAFAR49RN5TS7  Spanish       atención   
17    AAFAR49RN5TS7  Spanish           paso   
18    AAFAR49RN5TS7  Spanish         actual   
19    AAFAR49RN5TS7  Spanish           alto   
20    AAFAR49

In [0]:
# Quality Control and Aggregation Part 2 Spanish
def _score_urls(pass2, urls):
    """Return ``{url: score}`` for ``urls`` from the pass-2 votes.

    Every URL starts at 0. For each pass-2 row whose ``Answer.question0.1``
    is truthy, each of the ten ``Input.url<i>`` values that is a known URL
    gains 1 when ``Answer.question<i>.1`` is truthy and loses 1 otherwise.
    """
    scores = dict.fromkeys(urls, 0)
    for _, row in pass2.iterrows():
        # Rows whose question-0 answer is falsy are ignored entirely.
        # NOTE(review): this relies on plain truthiness; a missing (NaN) answer
        # would count as truthy -- confirm these columns are parsed as booleans.
        if not row['Answer.question0.1']:
            continue
        for i in range(1, 11):
            url = row['Input.url' + str(i)]
            if url in scores:
                scores[url] += 1 if row['Answer.question' + str(i) + '.1'] else -1
    return scores


def _drop_negative(frame, scores):
    """Return a copy of ``frame`` without the rows whose URL score is below 0."""
    negative = frame['URL'].map(scores) < 0
    # .copy() hands back an independent frame that later cells can modify
    # without touching (or being flagged as a slice of) the input frame.
    return frame[~negative].copy()


def spanish_urls_2(pass2, good_spanish, bad_spanish):
    """Filter pass-1 recordings using the pass-2 quality votes.

    Quality control: each URL gets a score that is incremented when a pass-2
    worker labels it as good quality and decremented when it is labelled bad.
    Aggregation: rows whose URL ends up with a negative score are removed.

    Parameters
    ----------
    pass2 : pandas.DataFrame
        Pass-2 results with ``Answer.question0.1`` and, for i = 1..10,
        ``Input.url<i>`` and ``Answer.question<i>.1``.
    good_spanish, bad_spanish : pandas.DataFrame
        Frames with a ``URL`` column. They are scored independently and are
        not modified.

    Returns
    -------
    list
        ``[good_spanish, bad_spanish]`` as filtered copies; the original row
        index labels are preserved.
    """
    good_scores = _score_urls(pass2, good_spanish['URL'].tolist())
    bad_scores = _score_urls(pass2, bad_spanish['URL'].tolist())

    # One boolean-mask filter per frame replaces the original row-by-row
    # DataFrame.drop calls, which copied the whole frame for every dropped row.
    return [
        _drop_negative(good_spanish, good_scores),
        _drop_negative(bad_spanish, bad_scores),
    ]

In [0]:
# Quality control + aggregation, part 2: apply the pass-2 quality votes.
pass2 = pd.read_csv('Pass_2_Span Results.csv')
dfs_2 = spanish_urls_2(pass2, good_spanish, bad_spanish)
good_spanish2, bad_spanish2 = dfs_2[0], dfs_2[1]

# Persist the filtered frames so this stage can be inspected on its own.
good_spanish2.to_csv('good_spanish_results2.csv')
bad_spanish2.to_csv('bad_spanish_results2.csv')

In [0]:
# Quality Control and Aggregation Part 3 Spanish
def spanish_pronunciations(pass3, good_df=None, bad_df=None):
    """Attach the pass-3 transcriptions to the recordings they describe.

    Quality control: a pass-3 row is used only when its control transcription
    (``Answer.transcription0``) equals 'gracias', ignoring case.
    Aggregation: for i = 1..10 the lower-cased ``Answer.transcription<i>`` is
    appended to the ``Pronunciations`` tuple of every row whose ``URL`` equals
    ``Input.url<i>``. A URL present in both frames is recorded on the good
    frame only.

    Parameters
    ----------
    pass3 : pandas.DataFrame
        Pass-3 results with the columns named above.
    good_df, bad_df : pandas.DataFrame, optional
        Frames with ``URL`` and ``Pronunciations`` columns. When omitted, the
        notebook-level ``good_spanish2`` / ``bad_spanish2`` frames are used,
        which is what the original single-argument call did.

    Returns
    -------
    list
        ``[good_df, bad_df]`` -- the same frame objects, updated in place.
    """
    if good_df is None:
        good_df = good_spanish2
    if bad_df is None:
        bad_df = bad_spanish2

    good_urls = set(good_df['URL'])
    bad_urls = set(bad_df['URL'])

    # Collect the transcriptions per URL first, in pass-3 row order.
    heard_by_url = {}
    for _, row in pass3.iterrows():
        # str() keeps a missing control answer (NaN) from raising on .lower();
        # such a row simply fails the check.
        if str(row['Answer.transcription0']).lower() != 'gracias':
            continue
        for i in range(1, 11):
            url = row['Input.url' + str(i)]
            if url in good_urls or url in bad_urls:
                heard = str(row['Answer.transcription' + str(i)]).lower()
                heard_by_url.setdefault(url, []).append(heard)

    # Extend each row's tuple individually. Adding a tuple to the Series that
    # .loc returns is element-wise pandas arithmetic, not tuple concatenation,
    # so the update has to be done cell by cell.
    for frame, claimed_elsewhere in ((good_df, set()), (bad_df, good_urls)):
        updated = np.empty(len(frame), dtype=object)
        for pos, (url, existing) in enumerate(zip(frame['URL'], frame['Pronunciations'])):
            extra = () if url in claimed_elsewhere else tuple(heard_by_url.get(url, ()))
            updated[pos] = existing + extra if extra else existing
        frame['Pronunciations'] = updated

    return [good_df, bad_df]


In [0]:
# Quality control + aggregation, part 3: attach the pass-3 transcriptions.
pass3 = pd.read_csv('Pass_3_Span Results.csv')
final_dfs = spanish_pronunciations(pass3)
final_spanish_good, final_spanish_bad = final_dfs[0], final_dfs[1]

# Write the final good/bad tables to disk.
final_spanish_good.to_csv('spanish_final_good.csv')
final_spanish_bad.to_csv('spanish_final_bad.csv')

In [0]:
# Creates a chart of words and their mispronunciations

words = final_spanish_good['Word']
pronunciations = final_spanish_good['Pronunciations']

# Build, per row, the distinct transcriptions that differ from the target word.
# Iterating the two columns together avoids Series.iteritems(), which was
# removed in pandas 2.0, and does not depend on the index labels being unique.
mispronunciations = []
for word, heard in zip(words, pronunciations):
    # Transcriptions were lower-cased when they were stored, so the target
    # word is lower-cased too before comparing.
    target = str(word).lower()
    distinct = list(dict.fromkeys(heard))  # de-duplicate, keep first-seen order
    mispronunciations.append([p for p in distinct if p != target])

# Construct a dataframe of words and mispronunciations
t = list(zip(words, mispronunciations))

df = pd.DataFrame(t, columns=('Word', 'Mispronunciations'))
# Keep only the words that have at least one mispronunciation.
df = df[df['Mispronunciations'].map(len) > 0]
df.to_csv('spanish_final_good_mispronunciations.csv')

    