In [0]:
import pandas as pd
import numpy as np
# NOTE: Quality control and aggregation are done concurrently in the code. Quality control and aggregation sections will be labeled with comments.

In [0]:
# Quality Control and Aggregation Part 1 English
def english_urls_1(english_pass_1, english_pass_1_qual):
    """Build the pass-1 recordings table from workers who passed the qualification test.

    Quality control: a worker is kept only when all four qualification answers
    equal the expected values in ``answer_key`` below.

    Aggregation: every pass-1 submission from a kept worker is expanded into ten
    rows, one per (word, recording) pair.

    Parameters
    ----------
    english_pass_1 : pandas.DataFrame
        Pass-1 results. Reads the columns 'WorkerId', 'Answer.question1.1',
        'Input.text1'..'Input.text10' and
        'Answer.audioRecording1'..'Answer.audioRecording10'.
    english_pass_1_qual : pandas.DataFrame
        Qualification results. Reads 'WorkerId' and the four answer columns
        named in ``answer_key``.

    Returns
    -------
    pandas.DataFrame
        Columns 'WorkerId', 'Language' (always "English"), 'Male' (the value of
        'Answer.question1.1'; per the original notes True means male),
        'Word', 'URL' and 'Pronunciations' (an empty tuple, filled in pass 3).
    """
    # Expected answer for each qualification question.
    answer_key = {
        'Answer.1. Who is ______ Marina or Sachiko?': 3,
        'Answer.2. Each of the Olympic athletes ______ for months even years.': 3,
        'Answer.3. The hurricane caused ______ damage to the city.': 3,
        'Answer.4. Many cultures have special ceremonies to celebrate a person\'s ______ of passage into adulthood.': 2,
    }

    # Quality control: a set gives constant-time membership checks in the
    # aggregation loop below (the worker list was previously scanned per row).
    good_workers = {
        row['WorkerId']
        for _, row in english_pass_1_qual.iterrows()
        if all(row[question] == expected for question, expected in answer_key.items())
    }

    # Aggregation: one output row per word/recording slot (1..10) of each
    # submission made by a qualified worker.
    records = [
        (
            row['WorkerId'],
            "English",
            row['Answer.question1.1'],
            row['Input.text' + str(i)],
            row['Answer.audioRecording' + str(i)],
            (),
        )
        for _, row in english_pass_1.iterrows()
        if row['WorkerId'] in good_workers
        for i in range(1, 11)
    ]

    return pd.DataFrame(
        records,
        columns=('WorkerId', 'Language', 'Male', 'Word', 'URL', 'Pronunciations'),
    )
  

In [0]:
# Load the raw pass-1 submissions and the qualification-test results.
english_pass_1 = pd.read_csv('eng_pass_1.csv')
english_pass_1_qual = pd.read_csv('eng_qual_results.csv')

# Keep recordings from qualified workers only, discard every row that has a
# missing value in any column, then save the table.
english_df = english_urls_1(english_pass_1, english_pass_1_qual).dropna(axis=0, how='any')
english_df.to_csv('english_results.csv')

In [0]:
# Quality Control and Aggregation Part 2 English
def english_urls_2(pass2, urls_df=None):
    """Score each recording URL from the pass-2 ratings and drop badly rated rows.

    Quality control: every URL in ``urls_df`` starts at score 0. For each pass-2
    submission whose control answer ('Answer.question0.1') is truthy, each of
    the ten rated URLs gains 1 when its answer is truthy and loses 1 otherwise.
    URLs that are not present in ``urls_df`` are ignored.

    Aggregation: rows whose URL ends with a score below 0 are removed.

    Parameters
    ----------
    pass2 : pandas.DataFrame
        Pass-2 results. Reads 'Answer.question0.1', 'Input.url1'..'Input.url10'
        and 'Answer.question1.1'..'Answer.question10.1'.
    urls_df : pandas.DataFrame, optional
        Table with a 'URL' column to filter. Defaults to the notebook-level
        ``english_df`` so existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The same object as ``urls_df``: it is filtered IN PLACE and returned,
        which is how the notebook has always used this function.
    """
    if urls_df is None:
        # Backward-compatible default: the frame produced by the pass-1 cell.
        urls_df = english_df

    # Quality control: tally +1 / -1 votes per known URL.
    scores = dict.fromkeys(urls_df['URL'].tolist(), 0)
    for _, row in pass2.iterrows():
        # Skip the whole submission when the control item was answered falsy.
        # NOTE(review): a missing (NaN) control answer is truthy in Python and
        # therefore counts as passed -- confirm that this is intended.
        if not row['Answer.question0.1']:
            continue
        for i in range(1, 11):
            url = row['Input.url' + str(i)]
            if url in scores:
                scores[url] += 1 if row['Answer.question' + str(i) + '.1'] else -1

    # Aggregation: collect the labels of all rejected rows first and drop them
    # in one call instead of rebuilding the frame once per rejected row.
    rejected = [
        label
        for label, url in urls_df['URL'].items()
        if scores.get(url, 0) < 0
    ]
    urls_df.drop(index=rejected, inplace=True)

    return urls_df

In [0]:
# Load the pass-2 results (the ratings that english_urls_2 reads from the
# 'Answer.question*' and 'Input.url*' columns).
pass2 = pd.read_csv('pass_2.csv')
# english_urls_2 drops low-scoring rows from english_df in place and returns
# that same frame, so english_2 and english_df refer to one object from here on.
english_2 = english_urls_2(pass2)
english_2.to_csv('english_results2.csv')

In [0]:
# Quality Control and Aggregation Part 3 English
def _normalise_transcription(value):
    """Return ``value`` lowercased as a string, or None when the cell is missing (NaN/None)."""
    if pd.isna(value):
        return None
    return str(value).lower()


def english_pronunciations(pass3, results_df=None):
    """Attach the pass-3 transcriptions to the recordings they describe.

    Quality control: a pass-3 submission is used only when its control
    transcription ('Answer.transcription0'), lowercased, equals 'taxi'. A
    missing control transcription fails the check instead of raising.

    Aggregation: for each of the ten (URL, transcription) pairs of an accepted
    submission, the lowercased transcription is appended to the
    'Pronunciations' tuple of every row whose 'URL' matches. Missing
    transcriptions are skipped.

    Parameters
    ----------
    pass3 : pandas.DataFrame
        Pass-3 results. Reads 'Answer.transcription0',
        'Input.url1'..'Input.url10' and
        'Answer.transcription1'..'Answer.transcription10'.
    results_df : pandas.DataFrame, optional
        Table with 'URL' and 'Pronunciations' columns. Defaults to the
        notebook-level ``english_2`` so existing one-argument calls behave as
        before.

    Returns
    -------
    pandas.DataFrame
        The same object as ``results_df``; its 'Pronunciations' column is
        updated IN PLACE.
    """
    if results_df is None:
        # Backward-compatible default: the frame produced by the pass-2 cell.
        results_df = english_2

    # Gather transcriptions per URL, in the order the submissions are read.
    heard_by_url = {}
    for _, row in pass3.iterrows():
        # Quality control: did the worker transcribe the control clip as "taxi"?
        if _normalise_transcription(row['Answer.transcription0']) != 'taxi':
            continue
        for i in range(1, 11):
            heard = _normalise_transcription(row['Answer.transcription' + str(i)])
            if heard is None:
                continue
            heard_by_url.setdefault(row['Input.url' + str(i)], []).append(heard)

    # Aggregation: concatenate tuples cell by cell. Adding a tuple to a Series
    # does not do this -- the tuple is treated as an array operand, not as a
    # value to append to each cell.
    if heard_by_url and len(results_df) > 0:
        results_df['Pronunciations'] = [
            tuple(existing) + tuple(heard_by_url.get(url, ()))
            for existing, url in zip(results_df['Pronunciations'], results_df['URL'])
        ]

    return results_df


In [0]:
# Load the pass-3 results (the transcriptions that english_pronunciations reads
# from the 'Answer.transcription*' and 'Input.url*' columns).
pass3 = pd.read_csv('pass_3.csv')
# english_pronunciations updates english_2 and returns it, so final_df is the
# same object as english_2.
final_df = english_pronunciations(pass3)
final_df.to_csv('english_final_results.csv')

In [0]:
#Creates a chart of words and their mispronunciations

words = final_df['Word']
pronunciations = final_df['Pronunciations']

# Create a list of the mispronunciations: for every recording keep the distinct
# transcriptions (first-seen order) that differ from the target word.
# Words and transcriptions are paired by position, and Series.iteritems()
# (removed in pandas 2.0) is no longer needed.
mispronunciations = []
for word, heard in zip(words, pronunciations):
    # Transcriptions are lowercased when they are collected, so compare
    # against the lowercased word; otherwise a capitalised word would never
    # match its own correct transcription.
    target = str(word).lower()
    mispronunciations.append(
        [spoken for spoken in dict.fromkeys(heard) if str(spoken).lower() != target]
    )

# Construct a dataframe of words and mispronunciations
t = list(zip(words, mispronunciations))
df = pd.DataFrame(t, columns=('Word', 'Mispronunciations'))

# Keep only the recordings that have at least one mispronunciation.
df = df[df['Mispronunciations'].map(len) > 0]
df.to_csv('chart.csv')

    