In [1]:
import pandas as pd
from hashlib import blake2b
import re
import pycountry
import numpy as np
import json

df = pd.read_csv("../data/community_builders_raw.csv").drop(columns=["cbName", "cbBioURL", "dateCreated", "dateUpdated"]) 

# anonymize builder id
df['id'] = df['id'].apply(lambda x: blake2b(str(x).encode(), digest_size=10).hexdigest())
df.head()

Unnamed: 0,id,cbCategory,cbLocation,year,country,region
0,e21ebe5d459aabe2bb9b,Front-End Web & Mobile Builder since 2023,"Dhaka, Bangladesh",2023.0,Bangladesh,Asia Pacific
1,5c9fa88eacddd0fa2dce,Serverless Builder since 2023,India,2023.0,India,Asia Pacific
2,83bdd669bd9adff56c19,Serverless Builder since 2021,"Denver, USA",2021.0,United States,North America
3,8b6a321903b0e320d723,Security Builder since 2024,"Annapolis, USA",2024.0,United States,North America
4,c419e576611b85049002,Containers Builder since 2024,"Lome, Togo",2024.0,"Europe, Middle East, & Africa",


In [2]:
# getting 3 columns from cbCategory: category, year and the phrase
# category will be used for categorization while year will be for QC

df[['Category', 'phrase', 'year_since']] = df['cbCategory'].str.extract(
    r'(.*?)\s*(Builder|since|Builder since)\s*(\d{4})',
    flags=re.IGNORECASE
)
df['Category'] = df['Category'].str.strip()

In [3]:
# check 1 if the process what successful
df[df['Category'].isna()]

Unnamed: 0,id,cbCategory,cbLocation,year,country,region,Category,phrase,year_since


In [4]:
# check 2 if the process what successful
df['year_since'].unique()

array(['2023', '2021', '2024', '2022', '2020'], dtype=object)

In [5]:
# check 3 if the process what successful
df['phrase'].unique()

array(['Builder since', 'since'], dtype=object)

In [6]:
# check categories
df['Category'].unique()

array(['Front-End Web & Mobile', 'Serverless', 'Security', 'Containers',
       'Machine Learning', 'Cloud Operations', 'Dev Tools', 'Data',
       'Game Tech', 'Storage', 'Networking & Content Delivery',
       'Network Content & Delivery', 'Graviton Arm', 'GameTech'],
      dtype=object)

In [7]:
# fix the categories
df['Category'] = df['Category'].replace(
    {
        'GameTech': 'Game Tech',
        'Network Content & Deliver': 'Networking & Content Delivery'
    }
)

# recheck categories
df['Category'].unique()

array(['Front-End Web & Mobile', 'Serverless', 'Security', 'Containers',
       'Machine Learning', 'Cloud Operations', 'Dev Tools', 'Data',
       'Game Tech', 'Storage', 'Networking & Content Delivery',
       'Network Content & Delivery', 'Graviton Arm'], dtype=object)

In [8]:
# check for missing year in the original year cols
df['year'].unique()

array([2023., 2021., 2024., 2022., 2020.,   nan])

In [9]:
df[df['year'].isna()]

Unnamed: 0,id,cbCategory,cbLocation,year,country,region,Category,phrase,year_since
1435,4b1a3832e6277a6b881a,Machine Learning Builder since 2022,"Temuco, Chile",,Chile,Latin America,Machine Learning,Builder since,2022


In [10]:
# check for missing year in the extracted year cols 
df['year_since'].unique()

array(['2023', '2021', '2024', '2022', '2020'], dtype=object)

In [11]:
# drop columns after cleaning and checks
drop_cols_2 = ['cbCategory', 'year', 'phrase']
df = df.drop(columns=drop_cols_2)

In [12]:
df[df['country'].isna()]

Unnamed: 0,id,cbLocation,country,region,Category,year_since
205,176d6de9065eda609b30,"Buea, Cameroon",,,Networking & Content Delivery,2022
276,0da2b0adc5b494bf92d9,"Toronto, Nicaragua",,,Networking & Content Delivery,2021
596,ce0ad5af9ef110eba2ce,"Dar es salaam, Tanzania",,,Networking & Content Delivery,2022


In [13]:
# Update country for specific rows (indices 205, 276, 596)
df.loc[[205, 276, 596], 'country'] = df.loc[[205, 276, 596], 'cbLocation'].str.split(',').str[1].str.strip()
df.iloc[[205, 276, 596]]

Unnamed: 0,id,cbLocation,country,region,Category,year_since
205,176d6de9065eda609b30,"Buea, Cameroon",Cameroon,,Networking & Content Delivery,2022
276,0da2b0adc5b494bf92d9,"Toronto, Nicaragua",Nicaragua,,Networking & Content Delivery,2021
596,ce0ad5af9ef110eba2ce,"Dar es salaam, Tanzania",Tanzania,,Networking & Content Delivery,2022


In [14]:
# validate the countries present
df['country'] = df['country'].str.strip()
countries_ = [*df['country'].unique()]
len(countries_)

114

In [15]:
df[df['country'].isna()]

Unnamed: 0,id,cbLocation,country,region,Category,year_since


In [16]:
pyc_countries = set(country.name for country in pycountry.countries)

In [17]:
np.array([c for c in countries_ if c not in pyc_countries])

array(['Europe, Middle East, & Africa', 'Czech Republic',
       'Bosnia & Herzegovina', 'Asia Pacific', 'Russia', 'Scotland',
       'Central & Eastern Europe', 'Vietnam', 'Turkey', 'Korea',
       'Venezuela', 'Taiwan', 'Northern Ireland', 'Americas',
       'Latin America', 'Tanzania', 'Palestine', 'Europe', 'UK/IR',
       'North America', 'Syria', 'Wales', 'Moldova'], dtype='<U29')

In [18]:
invalid_countries = ['Europe, Middle East, & Africa', 'Asia Pacific', 
                     'Central & Eastern Europe', 'Korea', "Northern Ireland", 
                     'Americas', 'Latin America', 'Europe', 'UK/IR', 'North America']
df_val_countries = df[~df['country'].isin(invalid_countries)]
filtered_df_invalid_countries = df[df['country'].isin(invalid_countries)]

filtered_df_invalid_countries.head()

Unnamed: 0,id,cbLocation,country,region,Category,year_since
4,c419e576611b85049002,"Lome, Togo","Europe, Middle East, & Africa",,Containers,2024
75,f95ffe386c6dd16a7f09,Japan,Asia Pacific,,Cloud Operations,2021
83,d68bb8f5ab815c020bca,"Kathmandu, Nepal",Asia Pacific,,Containers,2021
93,d90c352b2bc54d78c8ba,"Cotonou, Benin","Europe, Middle East, & Africa",,Containers,2022
96,404784f9c415e70bebea,"Sagamihara, Japan",Asia Pacific,,Serverless,2020


In [19]:
def extract_country(location):
    if pd.isna(location):
        return None
    if ',' in location:
        return location.split(',')[-1].strip()
    return location.strip()

filtered_df_invalid_countries['country_'] = filtered_df_invalid_countries['cbLocation'].apply(extract_country)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_invalid_countries['country_'] = filtered_df_invalid_countries['cbLocation'].apply(extract_country)


In [20]:
filtered_df_invalid_countries.head()

Unnamed: 0,id,cbLocation,country,region,Category,year_since,country_
4,c419e576611b85049002,"Lome, Togo","Europe, Middle East, & Africa",,Containers,2024,Togo
75,f95ffe386c6dd16a7f09,Japan,Asia Pacific,,Cloud Operations,2021,Japan
83,d68bb8f5ab815c020bca,"Kathmandu, Nepal",Asia Pacific,,Containers,2021,Nepal
93,d90c352b2bc54d78c8ba,"Cotonou, Benin","Europe, Middle East, & Africa",,Containers,2022,Benin
96,404784f9c415e70bebea,"Sagamihara, Japan",Asia Pacific,,Serverless,2020,Japan


In [21]:
filtered_df_invalid_countries['country_'].unique()

array(['Togo', 'Japan', 'Nepal', 'Benin', 'Bangladesh', 'Mozambique',
       'Romania', 'Aoterora', 'Sweden and Spain', 'UK', 'South Korea',
       'Cameroon', 'Northern Ireland', 'United States of America',
       'Saint Lucia', 'Zimbabwe', 'Aruba', 'Spain', 'Belize', 'Korea',
       'UK&I', 'Andorra', 'Barbados', 'St. Lucia', 'Scotland',
       'Palestine', 'Democratic Republic of the Congo', 'SriLanka',
       'Rwanda', 'Kurdistan', 'Portugal', 'Somaliland', 'United Kingdom',
       'Republic of Ireland', 'Republic of Moldova', 'France'],
      dtype=object)

In [22]:
# Dictionary for correcting country names
country_corrections = {
    'Aoterora': 'New Zealand', 
    'Sweden and Spain': 'Spain',
    'UK': 'United Kingdom', 
    'Korea': 'South Korea',
    'UK&I': 'United Kingdom',
    'Northern Ireland': 'United Kingdom',
    'Scotland': 'United Kingdom', 
    'United States of America': 'United States',
    'St. Lucia': 'Saint Lucia',
    'SriLanka': 'Sri Lanka',
    'Somaliland': 'Somalia', 
    'Kurdistan': 'Turkey',  
    'Republic of Ireland': 'Ireland',
    'Republic of Moldova': 'Moldova'
}

# Apply corrections to the country column
filtered_df_invalid_countries['country_'] = filtered_df_invalid_countries['country_'].replace(country_corrections)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_invalid_countries['country_'] = filtered_df_invalid_countries['country_'].replace(country_corrections)


In [23]:
filtered_df_invalid_countries['country_'].unique()

array(['Togo', 'Japan', 'Nepal', 'Benin', 'Bangladesh', 'Mozambique',
       'Romania', 'New Zealand', 'Spain', 'United Kingdom', 'South Korea',
       'Cameroon', 'United States', 'Saint Lucia', 'Zimbabwe', 'Aruba',
       'Belize', 'Andorra', 'Barbados', 'Palestine',
       'Democratic Republic of the Congo', 'Sri Lanka', 'Rwanda',
       'Turkey', 'Portugal', 'Somalia', 'Ireland', 'Moldova', 'France'],
      dtype=object)

In [24]:
df_val_countries.columns

Index(['id', 'cbLocation', 'country', 'region', 'Category', 'year_since'], dtype='object')

In [25]:
df_val_countries.shape

(2129, 6)

In [26]:
df_val_countries_2 = filtered_df_invalid_countries \
    .drop(columns=['country']) \
    .rename(columns={'country_': 'country'})

In [27]:
df_val_countries_2.columns

Index(['id', 'cbLocation', 'region', 'Category', 'year_since', 'country'], dtype='object')

In [28]:
# Concatenate the DataFrames
result = pd.concat([df_val_countries, df_val_countries_2], ignore_index=True)
result.head()

Unnamed: 0,id,cbLocation,country,region,Category,year_since
0,e21ebe5d459aabe2bb9b,"Dhaka, Bangladesh",Bangladesh,Asia Pacific,Front-End Web & Mobile,2023
1,5c9fa88eacddd0fa2dce,India,India,Asia Pacific,Serverless,2023
2,83bdd669bd9adff56c19,"Denver, USA",United States,North America,Serverless,2021
3,8b6a321903b0e320d723,"Annapolis, USA",United States,North America,Security,2024
4,edbade2a45c0c2950103,"Paris, France",France,"Europe, Middle East, & Africa",Machine Learning,2024


In [29]:
result['country'].unique()

array(['Bangladesh', 'India', 'United States', 'France', 'Egypt',
       'Indonesia', 'United Arab Emirates', 'Pakistan', 'United Kingdom',
       'Saudi Arabia', 'Mexico', 'New Zealand', 'Canada', 'Nepal',
       'Nigeria', 'Sri Lanka', 'Czech Republic', 'Poland', 'Kenya',
       'South Africa', 'Spain', 'Hungary', 'Iraq', 'Bosnia & Herzegovina',
       'Malaysia', 'Qatar', 'Singapore', 'Russia', 'Finland', 'Japan',
       'Australia', 'Brazil', 'Colombia', 'Serbia', 'Kazakhstan', 'Italy',
       'Hong Kong', 'Scotland', 'Germany', 'Thailand', 'Peru', 'Romania',
       'Ecuador', 'Israel', 'Belarus', 'Sweden', 'Vietnam', 'Costa Rica',
       'Switzerland', 'Chile', 'Austria', 'Cameroon', 'North Macedonia',
       'Ukraine', 'Philippines', 'Denmark', 'Jamaica', 'Nicaragua',
       'Norway', 'Netherlands', 'Turkey', 'Mauritius', 'Argentina',
       'Venezuela', 'Taiwan', 'Ireland', 'Portugal', 'China', 'Panama',
       'Greece', 'Tunisia', 'Botswana', 'Bulgaria', 'Ghana', 'Tanzania',
  

In [30]:
result['region'].unique()

array(['Asia Pacific', 'North America', 'Europe, Middle East, & Africa',
       'Latin America', nan, 'United States', 'Europe'], dtype=object)

In [31]:
with open('../data/country_regions.json', 'r') as file:
    country_region_mapping = json.load(file)
    
result['region_'] = result['country'].map(country_region_mapping)
result.head()

Unnamed: 0,id,cbLocation,country,region,Category,year_since,region_
0,e21ebe5d459aabe2bb9b,"Dhaka, Bangladesh",Bangladesh,Asia Pacific,Front-End Web & Mobile,2023,Asia Pacific
1,5c9fa88eacddd0fa2dce,India,India,Asia Pacific,Serverless,2023,Asia Pacific
2,83bdd669bd9adff56c19,"Denver, USA",United States,North America,Serverless,2021,North America
3,8b6a321903b0e320d723,"Annapolis, USA",United States,North America,Security,2024,North America
4,edbade2a45c0c2950103,"Paris, France",France,"Europe, Middle East, & Africa",Machine Learning,2024,"Europe, Middle East, & Africa"


In [32]:
result['region_'].unique()

array(['Asia Pacific', 'North America', 'Europe, Middle East, & Africa',
       'Latin America'], dtype=object)

In [33]:
result[result['region_'].isna()]

Unnamed: 0,id,cbLocation,country,region,Category,year_since,region_


In [34]:
df = result.drop(columns=['region', 'cbLocation']).rename(
    columns= {
        "id": "ID",
        "country": "Country",
        "region_": "Region",
        "year_since": "Year"
    }
)

In [35]:
df.to_csv('community_builders.csv', index=False)