<a href="https://colab.research.google.com/github/ltfhnbl/AanalisisBigData/blob/main/week2/W2_webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup as bs4

In [None]:
# Download the Simple English Wikipedia page that lists countries grouped by
# continent. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page further down.
continents_response = requests.get(
    "https://simple.wikipedia.org/wiki/List_of_countries_by_continents",
    timeout=30,
)
continents_response.raise_for_status()
continents_page = continents_response.text
# Show only the beginning of the document; echoing the full HTML bloats the
# notebook without adding information.
continents_page[:500]

In [None]:
# Parse the downloaded HTML and collect the section-heading spans.
continents_countries_soup = bs4(continents_page, "lxml")
# The previous call passed `'h2' > 'span'` as the tag name. That is a Python
# string comparison (it evaluates to False), not a selector, so the name filter
# was a boolean and only the class filter did any work; how a boolean name is
# treated depends on the installed BeautifulSoup version. A CSS child selector
# expresses the intended "span.mw-headline directly inside an h2" explicitly.
continents = continents_countries_soup.select("h2 > span.mw-headline")
continents

[<span class="mw-headline" id="Africa">Africa</span>,
 <span class="mw-headline" id="Antarctica">Antarctica</span>,
 <span class="mw-headline" id="Asia">Asia</span>,
 <span class="mw-headline" id="Europe">Europe</span>,
 <span class="mw-headline" id="North_America">North America</span>,
 <span class="mw-headline" id="South_America">South America</span>,
 <span class="mw-headline" id="Oceania">Oceania</span>,
 <span class="mw-headline" id="References">References</span>,
 <span class="mw-headline" id="Other_websites">Other websites</span>]

In [None]:
# Heading texts to leave out of the continent list.
# NOTE(review): presumably these are the sections without a matching country
# list on the page -- confirm against the page layout.
unwanted_words = ["Antarctica", "References", "Other websites"]

# Keep the heading text of every remaining section, in page order.
target_continents = []
for heading in continents:
    if heading.text in unwanted_words:
        continue
    target_continents.append(heading.text)
target_continents

['Africa', 'Asia', 'Europe', 'North America', 'South America', 'Oceania']

In [None]:
# Every ordered list on the page.
ol_html = continents_countries_soup.find_all('ol')

# For each ordered list, keep the <li> items filtered on class=None and
# id=None (i.e. items carrying neither attribute). Lists without such items
# produce an empty result here.
all_countries = []
for ordered_list in ol_html:
    plain_items = ordered_list.find_all('li', {"class": None, "id": None})
    all_countries.append(plain_items)
all_countries

In [None]:
# Turn each non-empty group of <li> items into a list of country names, taken
# from the text of the first link inside each item. Items without a link are
# skipped, and empty groups are dropped entirely.
#
# The earlier version wrapped the list comprehension in an extra
# `for country in items:` loop, rebuilding the identical list once per item
# (quadratic work for the same result), and called find('a') twice per item.
countries_in_continents = []
for items in all_countries:
    if not items:
        continue
    first_links = (item.find('a') for item in items)
    countries = [link.text for link in first_links if link is not None]
    countries_in_continents.append(countries)
countries_in_continents

[['Algeria',
  'Angola',
  'Benin',
  'Botswana',
  'Burkina Faso',
  'Burundi',
  'Cameroon',
  'Cape Verde',
  'Central African Republic',
  'Chad',
  'Comoros',
  'Republic of the Congo',
  'Democratic Republic of the Congo',
  "Côte d'Ivoire",
  'Djibouti',
  'Equatorial Guinea',
  'Egypt',
  'Eritrea',
  'Ethiopia',
  'Gabon',
  'The Gambia',
  'Ghana',
  'Guinea',
  'Guinea-Bissau',
  'Kenya',
  'Lesotho',
  'Liberia',
  'Libya',
  'Madagascar',
  'Malawi',
  'Mali',
  'Mauritania',
  'Mauritius',
  'Morocco',
  'Mozambique',
  'Namibia',
  'Niger',
  'Nigeria',
  'Réunion',
  'Rwanda',
  'São Tomé and Príncipe',
  'Senegal',
  'Seychelles',
  'Sierra Leone',
  'Somalia',
  'South Africa',
  'South Sudan',
  'Sudan',
  'Swaziland',
  'Tanzania',
  'Togo',
  'Tunisia',
  'Uganda',
  'Western Sahara',
  'Zambia',
  'Zimbabwe'],
 ['Afghanistan',
  'Armenia',
  'Azerbaijan',
  'Bahrain',
  'Bangladesh',
  'Bhutan',
  'Brunei',
  'Cambodia',
  'China',
  'East Timor',
  'Georgia',
  '

In [None]:
# Pair each list of countries with a continent name by position: one row per
# continent, with the 'Country' cell holding the whole list.
# NOTE(review): zip() stops at the shorter input, so this relies on both lists
# having the same length and order -- confirm if the page layout changes.
continent_rows = list(zip(countries_in_continents, target_continents))
countries_continent_category_df = pd.DataFrame(
    continent_rows,
    columns=['Country', 'Continents'],
)
countries_continent_category_df

Unnamed: 0,Country,Continents
0,"[Algeria, Angola, Benin, Botswana, Burkina Fas...",Africa
1,"[Afghanistan, Armenia, Azerbaijan, Bahrain, Ba...",Asia
2,"[Albania, Andorra, Austria, Belarus, Belgium, ...",Europe
3,"[Canada, Mexico, United States of America, Nav...",North America
4,"[Brazil, Argentina, Bolivia, Chile, Colombia, ...",South America
5,"[Australia, Fiji, New Zealand, Federated State...",Oceania


In [None]:
# Expand the list-valued 'Country' column so each country gets its own row
# (the continent value is repeated), then renumber the rows from 0.
exploded_countries = countries_continent_category_df.explode('Country')
countries_continent_category_df = exploded_countries.reset_index(drop=True)
countries_continent_category_df

Unnamed: 0,Country,Continents
0,Algeria,Africa
1,Angola,Africa
2,Benin,Africa
3,Botswana,Africa
4,Burkina Faso,Africa
...,...,...
201,Samoa,Oceania
202,Solomon Islands,Oceania
203,Tonga,Oceania
204,Tuvalu,Oceania


In [None]:
# Download the World Happiness Report article and parse it.
# A timeout avoids hanging on a stalled connection, and raise_for_status()
# fails loudly on an HTTP error instead of parsing an error page.
# NOTE(review): the "#2020_report" fragment is a client-side anchor, so the
# whole article is parsed regardless -- confirm that the table picked further
# down is the intended report year.
countries_score_page = requests.get(
    "https://en.wikipedia.org/wiki/World_Happiness_Report#2020_report",
    timeout=30,
)
countries_score_page.raise_for_status()
countries_score_soup = bs4(countries_score_page.content, 'lxml')

In [None]:
# Take the first <table> in the article whose class list contains "wikitable".
countries_score_table = countries_score_soup.find('table', class_='wikitable')
countries_score_table

<table class="wikitable sortable">
<tbody><tr valign="top">
<th style="width: 10px;">Overall rank
</th>
<th style="width: 250px;">Country or region
</th></tr>
<tr>
<td>1</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="1100" data-file-width="1800" decoding="async" height="14" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_Finland.svg/23px-Flag_of_Finland.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_Finland.svg/35px-Flag_of_Finland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_Finland.svg/46px-Flag_of_Finland.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Finland" title="Finland">Finland</a>
</td></tr>
<tr>
<td>2</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="387" data-file-width="512" decoding="async" height="15" src

In [None]:
# Convert the HTML table into pandas objects. read_html returns a *list* of
# DataFrames (one per <table> found), so this name holds a list at this point.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
countries_score_df = pd.read_html(StringIO(str(countries_score_table)))
countries_score_df

[     Overall rank              Country or region
 0               1                        Finland
 1               2                        Denmark
 2               3                        Iceland
 3               4                         Israel
 4               5                    Netherlands
 ..            ...                            ...
 132           133  Congo, Democratic Republic of
 133           134                       Zimbabwe
 134           135                   Sierra Leone
 135           136                        Lebanon
 136           137                    Afghanistan
 
 [137 rows x 2 columns]]

In [None]:
# Pick the single parsed table out of the list returned by read_html.
# The isinstance guard keeps this cell idempotent: the name is rebound from a
# list to a DataFrame here, so re-running the cell without the guard would
# evaluate DataFrame[0] and raise KeyError.
if isinstance(countries_score_df, list):
    countries_score_df = countries_score_df[0]
# Use the same key column name as the continent table so the two can be merged.
countries_score_df = countries_score_df.rename(columns={"Country or region": "Country"})
countries_score_df

Unnamed: 0,Overall rank,Country
0,1,Finland
1,2,Denmark
2,3,Iceland
3,4,Israel
4,5,Netherlands
...,...,...
132,133,"Congo, Democratic Republic of"
133,134,Zimbabwe
134,135,Sierra Leone
135,136,Lebanon


In [None]:
# Attach a continent to every ranked country. This is an inner join on the
# exact 'Country' string, so rows whose name has no identical match in the
# other table are dropped from the result.
merged_df = countries_score_df.merge(
    countries_continent_category_df,
    how='inner',
    on='Country',
)
# Persist the joined table next to the notebook (the row index is written too).
merged_df.to_csv('final_result.csv')
merged_df

Unnamed: 0,Overall rank,Country,Continents
0,1,Finland,Europe
1,2,Denmark,Europe
2,3,Iceland,Europe
3,4,Israel,Asia
4,5,Netherlands,Europe
...,...,...,...
106,132,Botswana,Africa
107,134,Zimbabwe,Africa
108,135,Sierra Leone,Africa
109,136,Lebanon,Asia
