In [None]:
# !pip install fuzzywuzzy

In [None]:
import pandas as pd
import re

from fuzzywuzzy import fuzz



In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV read in a
# later cell (under '/content/drive/My Drive/...') is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Demographic labels taken from the Common Voice Corpus 20.0 tables.

# Age and gender come pre-categorized, so they are used exactly as listed.
age_labels = [
    'teens',
    'twenties',
    'thirties',
    'fourties',  # NOTE(review): spelling presumably mirrors the dataset's own label - confirm before changing
    'fifties',
    'sixties',
    'seventies',
]
gender_labels = [
    'male_masculine',
    'female_feminine',
    'non-binary',
    'transgender',
    'do_not_wish_to_say',
]

# Accent labels are user-provided free text and are normalized in later cells.
accent_counts_csv = '/content/drive/My Drive/MIDS/mp3_count_by_accent_for_common_voice.csv'
common_voice_accents = pd.read_csv(accent_counts_csv)
common_voice_accents.head()

Unnamed: 0,Accent,MP3_Count
0,United States English,435673
1,England English,150300
2,"India and South Asia (India, Pakistan, Sri Lanka)",108836
3,Canadian English,74732
4,Australian English,54827


In [None]:
# For details on accent categories, refer to the document: https://docs.google.com/document/d/1i8AIa1-pm2HnQSn2aV_kiYyUUeWM9dSXJb9zYLGRqRM/edit?usp=sharing.
#
# ACCENT_CATEGORIES is the ordered list of category names; category_keywords maps
# each of those names to the country keywords used to recognize it in a label.
# The two must use identical names (checked by the assertion at the end of this cell).
ACCENT_CATEGORIES = [
    'British Isles',
    'North America',
    'Oceania',
    'Sub-Saharan Africa',
    # Was 'South Africa and Southern Africa', which did not match the
    # 'Southern Africa' key below (the name the categorizer actually emits).
    'Southern Africa',
    'South Asia',
    'Southeast Asia',
    'East Asia',
    'Middle East and Central Asia',
    'Caribbean',
    'Latin America',
    'Eastern Europe',
    'Western Europe',
    'Scandinavia and Northern Europe',
    'Mediterranean'
]

# NOTE(review): some countries appear under two categories:
#   - Armenia: 'Middle East and Central Asia' and 'Eastern Europe'
#   - Turkey, Egypt, Lebanon, Syria: 'Middle East and Central Asia' and 'Mediterranean'
#   - Spain: 'Western Europe' and 'Mediterranean'
# A label matching one of these is returned with both categories by
# categorize_accent_label, and rows with more than one category are filtered out
# later in this notebook - confirm against the category document which single
# category each of these countries should belong to.
category_keywords = {
    'British Isles': [
        'United Kingdom', 'England', 'Scotland', 'Wales', 'Northern Ireland', 'Ireland'
    ],
    'North America': [
        'United States', 'Canada'
    ],
    'Oceania': [
        'Australia', 'New Zealand', 'Papua New Guinea', 'Fiji', 'Samoa', 'Tonga', 'Vanuatu',
        'Solomon Islands', 'Kiribati', 'Tuvalu', 'Nauru', 'Marshall Islands', 'Palau', 'Micronesia',
        'Cook Islands', 'Niue', 'American Samoa', 'French Polynesia', 'New Caledonia', 'Wallis and Futuna'
    ],
    'Sub-Saharan Africa': [
        'Ghana', 'Nigeria', 'Kenya', 'Uganda', 'Tanzania', 'Zambia',
        'Malawi', 'Angola', 'Mozambique', 'Seychelles', 'Mauritius', 'Côte d\'Ivoire', 'Senegal', 'Cameroon', 'Mali',
        'Burkina Faso', 'Niger', 'Rwanda', 'Burundi', 'Chad', 'Central African Republic', 'Democratic Republic of the Congo',
        'Republic of the Congo', 'Togo', 'Benin', 'Comoros', 'Eritrea', 'South Sudan', 'Botswana', 'Gabon'
    ],
    'Southern Africa': [
        'South Africa', 'Namibia', 'Zimbabwe', 'Lesotho'
    ],
    'South Asia': [
        'India', 'Pakistan', 'Sri Lanka', 'Bangladesh', 'Nepal', 'Maldives', 'Bhutan'
    ],
    'Southeast Asia': [
        'Philippines', 'Indonesia', 'Thailand', 'Vietnam', 'Malaysia', 'Singapore', 'Myanmar',
        'Cambodia', 'Brunei', 'Laos', 'Timor-Leste'
    ],
    'East Asia': [
        'China', 'Japan', 'South Korea', 'North Korea', 'Mongolia', 'Taiwan', 'Hong Kong', 'Macau'
    ],
    'Middle East and Central Asia': [
        'Turkey', 'Iran', 'Iraq', 'Saudi Arabia', 'Syria', 'Jordan', 'Lebanon', 'Israel', 'Palestine',
        'Egypt', 'Kuwait', 'Bahrain', 'Qatar', 'United Arab Emirates', 'Oman', 'Yemen', 'Afghanistan',
        'Uzbekistan', 'Kazakhstan', 'Kyrgyzstan', 'Turkmenistan', 'Tajikistan', 'Azerbaijan', 'Armenia'
    ],
    'Caribbean': [
        'Jamaica', 'Trinidad and Tobago', 'Bahamas', 'Barbados', 'Haiti', 'Cuba', 'Dominican Republic',
        'Puerto Rico', 'Saint Lucia', 'Antigua and Barbuda', 'Saint Kitts and Nevis', 'Grenada',
        'Saint Vincent and the Grenadines', 'Belize', 'Bermuda', 'Cayman Islands'
    ],
    'Latin America': [
        'Argentina', 'Brazil', 'Colombia', 'Chile', 'Peru', 'Venezuela', 'Ecuador',
        'Bolivia', 'Paraguay', 'Uruguay', 'Guatemala', 'Honduras', 'El Salvador', 'Nicaragua',
        'Costa Rica', 'Panama', 'Mexico'
    ],
    'Eastern Europe': [
        'Russia', 'Poland', 'Ukraine', 'Romania', 'Bulgaria', 'Serbia', 'Croatia', 'Slovenia', 'Moldova',
        'Slovakia', 'Czech Republic', 'Hungary', 'Albania', 'Bosnia and Herzegovina', 'North Macedonia',
        'Belarus', 'Georgia', 'Armenia', 'Kosovo'
    ],
    'Western Europe': [
        'France', 'Germany', 'Italy', 'Portugal', 'Netherlands', 'Belgium', 'Luxembourg',
        'Switzerland', 'Monaco', 'Austria', 'Liechtenstein', 'San Marino', 'Andorra', 'Spain'
    ],
    'Scandinavia and Northern Europe': [
        'Norway', 'Sweden', 'Finland', 'Denmark', 'Iceland', 'Estonia', 'Latvia', 'Lithuania'
    ],
    'Mediterranean': [
        'Greece', 'Turkey', 'Cyprus', 'Malta', 'Tunisia', 'Algeria', 'Morocco',
        'Egypt', 'Lebanon', 'Syria', 'Libya', 'Palestinian Territories', 'Spain'
    ]
}

# Fail fast if the category list and the keyword table ever drift apart again.
assert set(ACCENT_CATEGORIES) == set(category_keywords), (
    set(ACCENT_CATEGORIES) ^ set(category_keywords)
)

In [None]:
def categorize_accent_label(accent_label, threshold=90):
    """Return the accent categories whose country keywords fuzzily match a label.

    Every keyword list in ``category_keywords`` is scanned; a category is
    recorded as soon as one of its keywords scores at least ``threshold``
    under ``fuzz.partial_ratio`` (compared case-insensitively).

    Args:
        accent_label: Free-text accent label. Anything that is not a
            non-blank string (for example the NaN pandas uses for an empty
            CSV cell) yields no categories instead of raising.
        threshold: Minimum ``fuzz.partial_ratio`` score for a keyword to count
            as a match. Defaults to 90, the value previously hard-coded.

    Returns:
        list: Matched category names in ``category_keywords`` order, each at
        most once. Empty when nothing matches.

    NOTE(review): the outputs saved in this notebook show 'Canadian English'
    returning [] (the keyword is 'Canada') and 'Romanian,Eastern European'
    picking up 'Middle East and Central Asia' - presumably through the
    keyword 'Oman' inside 'Romanian'. Demonyms and short keywords therefore
    need a follow-up; this function does not attempt to correct them.
    """
    # Non-string or blank labels carry no accent information to match on.
    if not isinstance(accent_label, str) or not accent_label.strip():
        return []

    # Lowercase once instead of once per keyword comparison.
    label = accent_label.lower()

    # any() stops at the first keyword that clears the threshold, so each
    # category is added at most once.
    matched_categories = [
        category
        for category, countries in category_keywords.items()
        if any(fuzz.partial_ratio(label, country.lower()) >= threshold for country in countries)
    ]

    # When another region matched as well, 'North America' is dropped from the result.
    # NOTE(review): the rationale for this rule is not stated in the notebook - confirm.
    if len(matched_categories) > 1 and 'North America' in matched_categories:
        matched_categories.remove('North America')

    return matched_categories

In [None]:
# Classify every accent label; each row receives a (possibly empty) list of categories.
category_lists = common_voice_accents['Accent'].apply(categorize_accent_label)
common_voice_accents['Assigned Categories'] = category_lists
common_voice_accents.head()

Unnamed: 0,Accent,MP3_Count,Assigned Categories
0,United States English,435673,[North America]
1,England English,150300,[British Isles]
2,"India and South Asia (India, Pakistan, Sri Lanka)",108836,[South Asia]
3,Canadian English,74732,[]
4,Australian English,54827,[Oceania]


In [None]:
# Inspect the labels that were assigned two or more categories.
has_multiple_categories = common_voice_accents['Assigned Categories'].apply(len) > 1
common_voice_accents.loc[has_multiple_categories, ['Accent', 'Assigned Categories']]
# Most of these rows look like speakers who listed several accents, so these rows
# are filtered out of the final dataset.
# NOTE(review): not every row fits that explanation - e.g. 'Romanian,Eastern European'
# is tagged 'Middle East and Central Asia', which looks like a matching artifact
# rather than a second accent. Worth checking before discarding these rows.

Unnamed: 0,Accent,Assigned Categories
16,"England English,New Zealand English","[British Isles, Oceania]"
58,"United States English,Australian English,Engla...","[British Isles, Oceania]"
72,"India and South Asia (India, Pakistan, Sri Lan...","[South Asia, Southeast Asia]"
76,"England English,India and South Asia (India, P...","[British Isles, South Asia]"
94,"United States English,England English,India an...","[British Isles, South Asia]"
111,"United States English,England English,Brazilia...","[British Isles, Latin America]"
112,"England English,Malaysian English","[British Isles, Southeast Asia]"
122,"Romanian,Eastern European","[Middle East and Central Asia, Eastern Europe]"
131,"England English,Southern African (South Africa...","[British Isles, Southern Africa]"
150,"England English,India and South Asia (India, P...","[British Isles, South Asia]"


In [None]:
# Keep only the labels that resolved to exactly one category and flatten the
# one-element list into a plain 'Assigned Category' string column.
# Note: this drops labels with several categories AND labels with none
# (e.g. 'Canadian English' was assigned [] above).
has_single_category = common_voice_accents['Assigned Categories'].apply(len) == 1
common_voice_accents = (
    common_voice_accents
    .loc[has_single_category]
    # .assign() returns a new frame, so the column is never written onto a
    # filtered slice (the cause of the earlier SettingWithCopyWarning).
    .assign(**{'Assigned Category': lambda df: df['Assigned Categories'].apply(lambda cats: cats[0])})
    .drop(columns=['Assigned Categories'])
)
common_voice_accents.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_voice_accents['Assigned Category'] = common_voice_accents['Assigned Categories'].apply(lambda x: x[0])


Unnamed: 0,Accent,MP3_Count,Assigned Category
0,United States English,435673,North America
1,England English,150300,British Isles
2,"India and South Asia (India, Pakistan, Sri Lanka)",108836,South Asia
4,Australian English,54827,Oceania
6,"Southern African (South Africa, Zimbabwe, Nami...",24654,Southern Africa


In [None]:
# Total MP3 count per assigned category, largest category first.
grouped_df = (
    common_voice_accents
    .groupby('Assigned Category')['MP3_Count']
    .sum()
    .reset_index()
    .sort_values(by='MP3_Count', ascending=False)
)
grouped_df

Unnamed: 0,Assigned Category,MP3_Count
7,North America,445538
0,British Isles,159062
10,South Asia,109959
8,Oceania,69336
12,Southern Africa,25911
11,Southeast Asia,6218
2,East Asia,4709
1,Caribbean,855
14,Western Europe,496
3,Eastern Europe,465
