# Web Scraping

In [None]:
# Optional one-time setup: uncomment the lines below if these packages are not
# already installed. `%pip` installs into the environment of the running kernel.
# %pip install pandas
# %pip install beautifulsoup4
# %pip install requests
# %pip install html5lib

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Extract Wikipedia Data Using Web Scraping
The Wikipedia page https://en.wikipedia.org/wiki/List_of_largest_banks provides information about the largest banks in the world by various parameters. Scrape the data from the 'By market capitalization' table and store it in a JSON file.

### Webpage Contents

In [3]:
# Download the raw HTML of the Wikipedia article that the later cells parse.
url = "https://en.wikipedia.org/wiki/List_of_largest_banks"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
response.raise_for_status()
html_data = response.text

### Scraping the Data

In [4]:
# Parse the downloaded markup into a navigable tree using the html5lib parser.
soup = BeautifulSoup(html_data, features='html5lib')

In [5]:
# Build one record per data row of the first <table> in the parsed page.
# NOTE(review): assumes the first table is the 'By market capitalization'
# table — confirm against the live page, since its layout may change.
results = []

for row in soup.find_all('table')[0].find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. header rows) and rows with fewer than the
    # three cells read below are skipped rather than raising IndexError.
    if len(cells) < 3:
        continue
    rank, bank_name, market_cap = (cell.text.strip() for cell in cells[:3])
    results.append({"Rank": rank, "Bank Name": bank_name,
                    "Market Cap (US$ Billion)": market_cap})

# Values are kept as the stripped cell text (strings), exactly as scraped.
df = pd.DataFrame(results)
print(df)

  Rank                                Bank Name Market Cap (US$ Billion)
0    1                           JPMorgan Chase                   419.25
1    2                          Bank of America                   231.52
2    3  Industrial and Commercial Bank of China                   194.56
3    4               Agricultural Bank of China                   160.68
4    5                                HDFC Bank                   157.91
5    6                              Wells Fargo                   155.87
6    7                        HSBC Holdings PLC                   148.90
7    8                           Morgan Stanley                   140.83
8    9                  China Construction Bank                   139.82
9   10                            Bank of China                   136.81


### Display the first five rows

In [6]:
# Preview the first five scraped rows.
df.head(5)

Unnamed: 0,Rank,Bank Name,Market Cap (US$ Billion)
0,1,JPMorgan Chase,419.25
1,2,Bank of America,231.52
2,3,Industrial and Commercial Bank of China,194.56
3,4,Agricultural Bank of China,160.68
4,5,HDFC Bank,157.91


## Loading the Data
Write the scraped data to a JSON file called `bank_market_cap.json`.

In [7]:
# Serialise the scraped table to a JSON file in the current working directory.
df.to_json(path_or_buf="bank_market_cap.json")