# Web scraping to extract data with Beautiful Soup and create a DataFrame with the pandas library

## - Importing the pandas, Beautiful Soup and requests libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## - Copying the Wikipedia link we want to extract the data from and storing it in a variable

In [2]:
# Wikipedia article that holds the tables scraped in this notebook
link = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_India"

## - Using the requests library to download the page behind the link

In [3]:
# Download the page. The timeout keeps this cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response (4xx/5xx)
# into an exception here instead of letting an error page be parsed further down.
page = requests.get(link, timeout=30)
page.raise_for_status()

## - Creating an instance of the beautiful soup class

In [4]:
# Parse the downloaded HTML. The parser is named explicitly ('html.parser', the one
# bundled with Python) so the resulting tree does not depend on which optional
# parsers happen to be installed; the generic 'html' feature picks whichever is available.
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# print(soup)

### - Use soup.find to find elements by their name, e.g. "div", "a", "p", "title"

In [6]:
# element = soup.find(name, attrs, recursive, text, **kwargs)


In [7]:
# soup.find_all('table')[0]

## - Finding all the table elements in the html and storing the first element[0] of the list into the variable - table

In [8]:
# All <table> elements found on the page; the first one is the table scraped below
page_tables = soup.find_all('table')
table = page_tables[0]

In [9]:
# Gather every header cell of the table (element - 'th')
t_headers = table.find_all("th")
t_headers

[<th align="center" colspan="2">Rank
 </th>,
 <th align="center" colspan="2">Forbes <br/>2000 rank
 </th>,
 <th align="center">Name
 </th>,
 <th align="center">Headquarters
 </th>,
 <th align="center">Revenue<br/>(billions <br/>US$)
 </th>,
 <th align="center">Profit<br/>(billions <br/>US$)
 </th>,
 <th align="center">Assets<br/>(billions <br/>US$)
 </th>,
 <th align="center">Value<br/>(billions <br/>US$)
 </th>,
 <th align="center">Industry
 </th>]

In [10]:
# Take the text of each header cell and trim the surrounding whitespace
columns = [header.text.strip() for header in t_headers]
columns

['Rank',
 'Forbes 2000 rank',
 'Name',
 'Headquarters',
 'Revenue(billions US$)',
 'Profit(billions US$)',
 'Assets(billions US$)',
 'Value(billions US$)',
 'Industry']

In [11]:
# Start from an empty DataFrame that carries only the scraped column names
df = pd.DataFrame(data=None, columns=columns)

In [12]:
# Display the still-empty frame to check the column headers
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


In [13]:
# Collect every row of the table (element - 'tr'); the first entry is skipped by the loop below
rows = table.find_all("tr")

In [14]:
# Loop through every row after the first one and read its data cells (element - 'td'),
# extracting the text of each cell and stripping the surrounding whitespace.
# The cells at positions 1 and 3 are dropped: the 'Rank' and 'Forbes 2000 rank' headers
# are declared with colspan="2", so each data row presumably carries one extra cell
# for each of them that has no column name of its own.
# The rows are collected in a plain list and the DataFrame is built once at the end.
# This avoids growing the frame row by row and makes the cell safe to re-run
# (re-running no longer appends the same rows a second time).
skip_positions = {1, 3}
records = []
for row in rows[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if not cells:
        # a row without any <td> cell carries no data, so there is nothing to add
        continue
    records.append(
        [value for position, value in enumerate(cells) if position not in skip_positions]
    )
df = pd.DataFrame(records, columns=columns)

In [15]:
# Display the frame now that the scraped rows are in it
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry
0,1,54,Reliance Industries,Mumbai,86.85,7.81,192.59,228.63,Conglomerate
1,2,130,TATA Group,Mumbai,150.0,139,160.0,350.0,Conglomerate
2,3,105,State Bank of India,Mumbai,54.52,4.32,696.51,58.39,Banking
3,4,154,HDFC Bank,Mumbai,22.51,5.11,280.16,98.28,Banking
4,5,205,ICICI Bank,Mumbai,21.89,3.01,226.39,67.9,Banking
5,6,229,Oil and Natural Gas Corporation,New Delhi,66.28,6.00,75.51,28.62,Oil and gas
6,7,269,HDFC,Mumbai,18.48,2.91,118.61,52.3,Financials
7,8,358,Indian Oil Corporation,New Delhi,72.2,3.72,51.73,16.53,Oil and gas
8,9,385,Tata Consultancy Services,Mumbai,25.73,5.14,18.68,172.79,Infotech
9,10,408,Tata Steel,Mumbai,31.07,5.01,34.62,20.42,Iron and steel


## Repeating the above steps for table 2

In [16]:
# All <table> elements found on the page; the second one (index 1) is scraped next
page_tables = soup.find_all('table')
table2 = page_tables[1]

In [17]:
# Gather every header cell of the second table (element - 'th')
t2Header = table2.find_all("th")

In [18]:
# Take the text of each header cell and trim the surrounding whitespace
t2_col = [header.text.strip() for header in t2Header]

In [19]:
# Start from an empty DataFrame that carries only the second table's column names
df2 = pd.DataFrame(data=None, columns=t2_col)
df2

Unnamed: 0,Rank,Name,Industry,Revenue(in ₹ Crore),Revenue growth,Profits(in ₹ Crore),Headquarters,State Controlled


In [20]:
# Collect every row of the second table (element - 'tr'); the first entry is skipped by the loop below
rows2 = table2.find_all("tr")

In [21]:
# Loop through every row after the first one and read its data cells (element - 'td'),
# extracting the text of each cell and stripping the surrounding whitespace.
# The rows are collected in a plain list and the DataFrame is built once at the end.
# This avoids growing the frame row by row and makes the cell safe to re-run
# (re-running no longer appends the same rows a second time).
records2 = []
for row in rows2[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # a row without any <td> cell carries no data
        records2.append(cells)
df2 = pd.DataFrame(records2, columns=t2_col)

In [22]:
# Display the second frame now that the scraped rows are in it
df2

Unnamed: 0,Rank,Name,Industry,Revenue(in ₹ Crore),Revenue growth,Profits(in ₹ Crore),Headquarters,State Controlled
0,1,Indian Oil Corporation,Oil and gas,424321,13.2%,22189,New Delhi,Yes
1,2,Reliance Industries Limited,Oil and gas,410295,28.2%,36075,Mumbai,
2,3,Oil and Natural Gas Corporation,Oil and gas,333143,11.0%,22106,New Delhi,Yes
3,4,State Bank of India,Banking,306528,2.6%,"−4,556",Mumbai,Yes
4,5,Tata Motors,Automotive,301175,7.9%,8989,Mumbai,
5,6,Bharat Petroleum,Oil and gas,238638,13.7%,9009,Mumbai,Yes
6,7,Hindustan Petroleum,Oil and gas,221693,13.4%,7218,Mumbai,Yes
7,8,Rajesh Exports,Gems and jewellery,187748,22.5%,1266,Bangalore,
8,9,Tata Steel,Iron and steel,147192,25.3%,13434,Mumbai,
9,10,Coal India,Metals and mining,132897,5.3%,7020,Kolkata,Yes


## Storing the tables in CSV files

In [23]:
# Save the first table without the index column. The path is relative to the
# working directory, so the notebook also runs on machines that do not have the
# original author's home folder.
df.to_csv('Forbes_2022.csv', index=False)

In [24]:
# Save the second table without the index column. The path is relative to the
# working directory, so the notebook also runs on machines that do not have the
# original author's home folder.
df2.to_csv('Fortune_2021.csv', index=False)