In [40]:
import pandas as pd

In [41]:
def country_nationality_table(url="https://www.vocabulary.cl/Basic/Nationalities.htm",
                              output_path="../data/processed/CountryNationality.csv"):
    '''
    This function reads from the web a table with countries and nationalities.

    The second table found on the page is cleaned, extended with a few
    hand-written entries, sorted by country and (unless output_path is None)
    saved as a csv file.

    Input:
        url: Page to read the table from. The page is expected to expose the
             columns "Country" and "Nationality  (Adjective)" in its second table.
        output_path: Destination of the csv file. Pass None to skip saving.
    Output: Clean pandas dataframe with the columns "Country" and "Nationality"
    '''
    # The header on the page is written with two spaces before the parenthesis.
    adjective_col = "Nationality  (Adjective)"

    # Reading the table from a webpage (the second table on the page is used)
    raw = pd.read_html(url)[1]

    # Keeping only the two columns that end up in the result. rename() returns
    # a new frame, so the assignments below never write into a view of `raw`.
    pairs = raw[["Country", adjective_col]].rename(columns={adjective_col: "Nationality"})

    # Cleaning the table: strip footnote markers and the "(The)" prefix.
    # Plain assignment is used instead of `series.replace(..., inplace=True)`,
    # which is chained in-place mutation and is not reliable on recent pandas.
    pairs["Country"] = pairs["Country"].replace({
        "Colombia *": "Colombia",
        "(The) United Kingdom": "United Kingdom",
        "(The) United States": "United States",
    })
    pairs["Nationality"] = pairs["Nationality"].replace({"American **": "American"})

    # Fixing Argentina: the source row is removed and replaced by one row per
    # adjective. The row is matched by name rather than by a fixed position so
    # that a change in the page's row order cannot drop a different country.
    # NOTE(review): presumably the page lists both adjectives in a single cell
    # for Argentina -- confirm against the live page.
    is_argentina = pairs["Country"].astype(str).str.startswith("Argentina")
    argentina = pd.DataFrame({
        "Country": ["Argentina", "Argentina"],
        "Nationality": ["Argentine", "Argentinian"],
    })

    # Adding additional matches and countries
    additional = pd.DataFrame({
        "Country": ["U.S.", "U.K.", "U.A.E.", "Angola", "Cyprus", "Hong Kong", "Kazakhstan"],
        "Nationality": ["American", "British", "Emirati", "Angolan", "Cypriot", "Hongkonger", "Kazakhstani"],
    }, columns=["Country", "Nationality"])

    # A single concat replaces the DataFrame.append calls (removed in pandas 2.0).
    final = pd.concat([pairs[~is_argentina], argentina, additional],
                      sort=False, ignore_index=True)

    # Final details; sorting and resetting the index.
    final = final.sort_values(by=["Country"]).reset_index(drop=True)

    # Saving as csv file
    if output_path is not None:
        final.to_csv(output_path, index=False)

    # Returning clean dataframe
    return final
    


In [42]:
# Build the country/nationality lookup table (this call fetches the web page
# and writes the csv file) and show the resulting dataframe as the cell output.
con = country_nationality_table()
con

Unnamed: 0,Country,Nationality
0,Afghanistan,Afghan
1,Albania,Albanian
2,Algeria,Algerian
3,Angola,Angolan
4,Argentina,Argentinian
...,...,...
114,Venezuela,Venezuelan
115,Vietnam,Vietnamese
116,Wales,Welsh
117,Zambia,Zambian
