# Simple Web Scrape and EDA

__Website:__ [www.CountryCode.org](https://countrycode.org/) <br />
__Data Composition:__ Country Name, Country Telephone Code, ISO Code, Population Amount, Area km2, GDP in USD <br />
__Running Time:__ 3.1 sec <br />
__Project Monetization:__ Non-commercial use <br /><br />
__Author:__ Pedro Sanhueza


### Import libraries

In [1]:
import requests # to download html source from url
from bs4 import BeautifulSoup # find elements in html
import pandas as pd # build data frame
import plotly.express as px # display plots
from datetime import datetime # to save file with current time
import plotly.io as pio
pio.renderers.default='notebook'

### Webscrape URL

In [2]:
response = requests.get("https://countrycode.org/") # download html source from url
soup = BeautifulSoup(response.text, 'html.parser') # make html data readable for BeautifulSoup
ls = [x.get_text() for x in soup.select('td')][:240*6] # collect all values of table into a list

### Build Data Table

In [7]:
data = { # get the 6th item in list starting from 1, 2, 3, 4, 5, and 6th element
'Country' :ls[0::6],
'Country_code' : ls[1::6],
'ISO_codes' : ls[2::6],
'Population' : ls[3::6],
'Area_KM2' : ls[4::6],
'GDP_USD' : ls[5::6]
}

df = pd.DataFrame(data) # build data frame

df # showcase the extraction of the website table


Unnamed: 0,Country,Country_code,ISO_codes,Population,Area_KM2,GDP_USD
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million
4,Andorra,376,AD / AND,84000,468,4.8 Billion
...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,
236,Western Sahara,212,EH / ESH,273008,266000,
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion


### GDP per country plot

In [8]:
# Change 'GDP_USD' column integers

def gdp_value(x): # from str to int
    try:
        y = str(x).split(' ') # divide string in two
        z  = float(y[0]) * float(y[1]) # multiply the original value with the replacement amount
        return int(z) # return the integer of the multiplication
    except:
        return "No Value Found"

replacements = {'Billion':'1000000000', 'Million':'1000000', 'Trillion': '1000000000000'} # key items to be replaced

df['GDP_USD'] = [ gdp_value(x) for x in df.GDP_USD.replace(replacements, regex=True)] # change from strings to integers

df1 = df[(df['GDP_USD'] != "No Value Found")].sort_values(by=['GDP_USD'] )[-10:].copy() # get the top 10 in order

df1

Unnamed: 0,Country,Country_code,ISO_codes,Population,Area_KM2,GDP_USD
37,Canada,1,CA / CAN,33679000,9984670,1825000000000
101,Italy,39,IT / ITA,60340328,301230,2068000000000
175,Russia,7,RU / RUS,140702000,17100000,2113000000000
28,Brazil,55,BR / BRA,201103330,8511965,2190000000000
227,United Kingdom,44,GB / GBR,62348447,244820,2490000000000
72,France,33,FR / FRA,64768389,547030,2739000000000
77,Germany,49,DE / DEU,81802257,357021,3593000000000
104,Japan,81,JP / JPN,127288000,377835,5007000000000
43,China,86,CN / CHN,1330044000,9596960,9330000000000
228,United States,1,US / USA,310232863,9629091,16719999999999


In [9]:
px.bar(df1, x='Country', y='GDP_USD',  title="GDP per Country", text_auto=True) # display bar chart


### Population per country plot

In [5]:
df['Population'] = [ int(x) for x in df.Population.replace(',','', regex=True)] # change from strings to integers

df2 = df.sort_values(by=['Population'], ascending=False)[:10].copy() # get the top 10 in order

px.bar(df2, x='Country', y='Population', title="Top 10 Countries Population Count", text_auto=True, color='Population') # display bar chart

### Population vs Area plot

In [25]:
# df.sort_values('Population')
df['Population'] = [int(x) for x in df.Population.replace(',','', regex=True)]

In [38]:
df.sort_values('Population').tail(1)['Country'].iloc[0]

'China'

In [39]:
df1.sort_values('GDP_USD').tail(1)['Country'].iloc[0]

'United States'

In [6]:
df['Area_KM2'] = [int(x) for x in df.Area_KM2.replace(',','', regex=True)] # change from strings to integers

fig = px.scatter(df, x="Area_KM2", y="Population", color='Area_KM2', text='Country', title="Area vs Population Amount") # build scatter plot

fig.update_traces(textposition='top center') # display plot

### Save Table

In [7]:
# optional:

file_path = '../Country Code - Historical Data/Country Code ' + datetime.now().strftime("%d-%m-%Y %H%M%S") + ".csv" # folder location with file name

df.to_csv(file_path) # save data frame as csv in file_path location
