In [0]:
import pandas as pd
import numpy as np
# NOTE: Quality control and aggregation are done concurrently in the code. Quality control and aggregation sections will be labeled with comments.

In [0]:
# Quality Control and Aggregation Part 1 English
def english_urls_1(english_pass_1, english_pass_1_qual):
    """Split pass 1 recordings by whether the worker passed the qualification test.

    Quality control: a worker passes when all four qualification questions
    carry the expected answer code; every other worker who took the test is
    treated as failing. The four demographic answers from the qualification
    row are kept for each worker (a later row for the same worker overwrites
    an earlier one in the same group).

    Aggregation: every pass 1 row is expanded into ten records, one per
    word/recording pair, each ending with an empty pronunciations tuple that
    is populated in pass 3. Rows from workers with no qualification entry
    are skipped.

    Parameters
    ----------
    english_pass_1 : pandas.DataFrame
        Needs 'WorkerId', 'Input.text1'..'Input.text10' and
        'Answer.audioRecording1'..'Answer.audioRecording10'.
    english_pass_1_qual : pandas.DataFrame
        Needs 'WorkerId', the four qualification answer columns and
        'Answer.question1'..'Answer.question4'.

    Returns
    -------
    list
        [good_df, bad_df]: records from passing workers, then from failing ones.
    """
    answer_key = {
        'Answer.1. Who is ______ Marina or Sachiko?': 3,
        'Answer.2. Each of the Olympic athletes ______ for months even years.': 3,
        'Answer.3. The hurricane caused ______ damage to the city.': 3,
        'Answer.4. Many cultures have special ceremonies to celebrate a person\'s ______ of passage into adulthood.': 2,
    }
    demographic_fields = ('Answer.question1', 'Answer.question2', 'Answer.question3', 'Answer.question4')
    output_columns = ('WorkerId', 'Language', 'Word', 'URL', 'Gender', 'Native Language', 'Years Spoken', 'Number of Languages Spoken', 'Pronunciations')

    # Quality control: file each worker's demographics under passed / failed.
    passed, failed = {}, {}
    for _, qual_row in english_pass_1_qual.iterrows():
        all_correct = all(qual_row[question] == expected for question, expected in answer_key.items())
        group = passed if all_correct else failed
        group[qual_row['WorkerId']] = [qual_row[field] for field in demographic_fields]

    # Aggregation: one record per (worker, word, recording URL).
    passed_records, failed_records = [], []
    for _, hit_row in english_pass_1.iterrows():
        worker = hit_row['WorkerId']
        if worker in passed:
            profile, sink = passed.get(worker), passed_records
        elif worker in failed:
            profile, sink = failed.get(worker), failed_records
        else:
            # Worker never took the qualification test: no record is emitted.
            continue
        for slot in range(1, 11):
            sink.append((
                worker,
                "English",
                hit_row['Input.text' + str(slot)],
                hit_row['Answer.audioRecording' + str(slot)],
                profile[0],
                profile[1],
                profile[2],
                profile[3],
                (),
            ))

    return [
        pd.DataFrame(passed_records, columns=output_columns),
        pd.DataFrame(failed_records, columns=output_columns),
    ]
  

In [0]:
# Load the pass 1 recording HITs and the qualification-test submissions.
english_pass_1 = pd.read_csv('Pass_1_Eng Results.csv')
english_pass_1_qual = pd.read_csv('Eng Qual Results.csv')

# Split the recordings by qualification outcome, then discard every record
# that has a missing value in any column.
dfs = english_urls_1(english_pass_1, english_pass_1_qual)
good_english, bad_english = (frame.dropna(how='any', axis=0) for frame in dfs)

print(bad_english)

           WorkerId Language      Word                                 URL  \
0    A25EY8B3SXJ18J  English       the  https://vocaroo.com/i/s0mu1jgnbI2o   
1    A25EY8B3SXJ18J  English      find  https://vocaroo.com/i/s0z9a9P3sQnW   
2    A25EY8B3SXJ18J  English     earth  https://vocaroo.com/i/s1jkXNeZe4el   
3    A25EY8B3SXJ18J  English      face  https://vocaroo.com/i/s0oUNgTah6yr   
4    A25EY8B3SXJ18J  English      pull  https://vocaroo.com/i/s0n9tBSqwSdS   
5    A25EY8B3SXJ18J  English      fill  https://vocaroo.com/i/s1LP1Ppjh9K9   
6    A25EY8B3SXJ18J  English      hope  https://vocaroo.com/i/s0FOJSSlBE9r   
7    A25EY8B3SXJ18J  English      flat  https://vocaroo.com/i/s081fiES9f6i   
8    A25EY8B3SXJ18J  English     chair  https://vocaroo.com/i/s1fRhPXOOkon   
9    A25EY8B3SXJ18J  English   require  https://vocaroo.com/i/s0tUhUJzelAG   
10    ACNYYHTK64MJY  English        of  https://vocaroo.com/i/s1lRPe8x1Ody   
11    ACNYYHTK64MJY  English       any  https://vocaroo.com/i/s0

In [0]:
# Quality Control and Aggregation Part 2 English
def english_urls_2(pass2, good_english, bad_english):
    """Drop recordings whose pass 2 quality score is negative.

    Quality control: every recording URL starts at 0. For each pass 2 row
    whose control answer ('Answer.question0.1') is truthy, each of the ten
    rated URLs gains 1 when its rating ('Answer.question<i>.1') is truthy
    and loses 1 otherwise. Rows with a falsy control answer are ignored.

    Aggregation: rows whose URL score is below 0 are removed; URLs that were
    never rated keep their score of 0 and are retained.

    Parameters
    ----------
    pass2 : pandas.DataFrame
        Needs 'Answer.question0.1', 'Input.url1'..'Input.url10' and
        'Answer.question1.1'..'Answer.question10.1'.
    good_english, bad_english : pandas.DataFrame
        Frames with a 'URL' column. Neither input is modified.

    Returns
    -------
    list
        [good_english, bad_english] filtered copies, original index labels kept.
    """
    good_scores = _score_urls(pass2, good_english['URL'].tolist())
    bad_scores = _score_urls(pass2, bad_english['URL'].tolist())
    return [
        _keep_non_negative(good_english, good_scores),
        _keep_non_negative(bad_english, bad_scores),
    ]


def _score_urls(pass2, urls):
    """Return {url: upvotes - downvotes} for `urls`, counting only trusted pass 2 rows."""
    scores = dict.fromkeys(urls, 0)
    for _, row in pass2.iterrows():
        if not row['Answer.question0.1']:
            # The rater got the control item wrong, so none of their ratings count.
            continue
        for i in range(1, 11):
            url = row['Input.url' + str(i)]
            if url in scores:
                # NOTE(review): a missing (NaN) rating is truthy in Python and
                # therefore counts as an upvote -- confirm that is intended.
                scores[url] += 1 if row['Answer.question' + str(i) + '.1'] else -1
    return scores


def _keep_non_negative(frame, scores):
    """Return the rows of `frame` whose URL score is >= 0, selected by position."""
    # A positional boolean mask replaces the per-row DataFrame.drop() loop: it
    # is a single pass, and it judges every row on its own URL even when index
    # labels repeat (dropping by label removed every row sharing that label).
    keep = np.array([scores.get(url, 0) >= 0 for url in frame['URL']], dtype=bool)
    return frame.loc[keep]

In [0]:
# Filter the pass 1 recordings with the pass 2 quality ratings.
pass2 = pd.read_csv('Pass_2_Eng Results.csv')
dfs_2 = english_urls_2(pass2, good_english, bad_english)
good_english2, bad_english2 = dfs_2[0], dfs_2[1]

# Persist the surviving recordings, one file per qualification group.
good_english2.to_csv('good_english_results2.csv')
bad_english2.to_csv('bad_english_results2.csv')

In [0]:
# Quality Control and Aggregation Part 3 English
def english_pronunciations(pass3):
    """Attach pass 3 transcriptions to the recordings they describe.

    Quality control: a pass 3 row is used only when its control transcription
    ('Answer.transcription0') equals 'taxi', compared case-insensitively. A
    blank control answer is read by pandas as NaN; it is coerced with str()
    so that it simply fails the check instead of raising AttributeError.

    Aggregation: for each of the ten URLs in a trusted row, the lower-cased
    transcription is appended to the 'Pronunciations' tuple of every row with
    that URL -- in good_english2 when the URL occurs there, otherwise in
    bad_english2. URLs found in neither frame are skipped.

    The module-level frames good_english2 and bad_english2 are read and
    updated in place (their 'Pronunciations' column is replaced).

    Parameters
    ----------
    pass3 : pandas.DataFrame
        Needs 'Answer.transcription0'..'Answer.transcription10' and
        'Input.url1'..'Input.url10'.

    Returns
    -------
    list
        [good_english2, bad_english2], the same objects that were updated.
    """
    good_urls = set(good_english2['URL'])
    bad_urls = set(bad_english2['URL'])
    heard_good = {}
    heard_bad = {}

    for _, row in pass3.iterrows():
        # Quality control check: did the worker transcribe the control word?
        if str(row['Answer.transcription0']).lower() != 'taxi':
            continue
        for i in range(1, 11):
            url = row['Input.url' + str(i)]
            if url in good_urls:
                bucket = heard_good
            elif url in bad_urls:
                bucket = heard_bad
            else:
                continue
            bucket.setdefault(url, []).append(str(row['Answer.transcription' + str(i)]).lower())

    _extend_pronunciations(good_english2, heard_good)
    _extend_pronunciations(bad_english2, heard_bad)
    return [good_english2, bad_english2]


def _extend_pronunciations(frame, heard_by_url):
    """Append each URL's collected transcriptions to that row's 'Pronunciations' tuple."""
    # Each tuple is extended explicitly: `Series + (value,)` is treated by
    # pandas/numpy as element-wise array arithmetic, not tuple concatenation.
    frame['Pronunciations'] = [
        existing + tuple(heard_by_url.get(url, ()))
        for url, existing in zip(frame['URL'], frame['Pronunciations'])
    ]


In [0]:
# Merge the pass 3 transcriptions into the filtered recordings.
pass3 = pd.read_csv('Pass_3_Eng Results.csv')
final_dfs = english_pronunciations(pass3)
final_english_good, final_english_bad = final_dfs[0], final_dfs[1]

# Persist the final tables, one file per qualification group.
final_english_good.to_csv('english_final_good.csv')
final_english_bad.to_csv('english_final_bad.csv')

In [0]:
#Creates a chart of words and their mispronunciations

words = final_english_bad['Word']
pronunciations = final_english_bad['Pronunciations']

#Create a list of the mispronunciations: for each word, the distinct
#transcriptions (first-seen order) that differ from the word itself.
#Word and transcriptions are paired by position with zip(), which avoids
#Series.iteritems() (removed in pandas 2.0) and stays correct when the
#index labels are not unique.
mispronunciations = []
for word, heard in zip(words, pronunciations):
    # Transcriptions were lower-cased in pass 3, so compare against the
    # lower-cased word; otherwise a capitalised word never matches.
    target = str(word).lower()
    distinct = list(dict.fromkeys(heard))
    mispronunciations.append([spoken for spoken in distinct if spoken != target])

#Construct a dataframe of words and mispronunciations
t = list(zip(words, mispronunciations))

df = pd.DataFrame(t, columns=('Word', 'Mispronunciations'))
#Keep only the words that have at least one mispronunciation
df = df[df['Mispronunciations'].map(len) > 0]
df.to_csv('english_final_bad_mispronunciations.csv')

    