# Data Scarping from wikipedia to get GDP of countries

### Get the libraries required for the project

In [4]:
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
data = pd.read_html(url)
print(type(data))

<class 'list'>


#### A list of tables in the given web page are returned

### Lets find out which is our required and convert it into a Dataframe

In [5]:
GDP_data = data[2]
GDP_data.head()

Unnamed: 0_level_0,Country/Territory,IMF[1][12],IMF[1][12],World Bank[13],World Bank[13],United Nations[14],United Nations[14]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


#### We can see that 3rd information from web page is our required GDP list of countries.Well,there is a problem here

In [6]:
GDP_data.columns

MultiIndex([( 'Country/Territory', 'Country/Territory'),
            (        'IMF[1][12]',          'Forecast'),
            (        'IMF[1][12]',              'Year'),
            (    'World Bank[13]',          'Estimate'),
            (    'World Bank[13]',              'Year'),
            ('United Nations[14]',          'Estimate'),
            ('United Nations[14]',              'Year')],
           )

In [7]:
print(type(GDP_data.columns))
print(type(GDP_data.columns[0]))

<class 'pandas.core.indexes.multi.MultiIndex'>
<class 'tuple'>


#### The problem here is column headers i.e., features or variables are not strings or objects or our regular data types they are multi Index tuples which makes it difficult for further analysis or visualizations

### Solution to the problem

In [8]:
columns = GDP_data.columns
new_columns = []
for col in columns:
    if isinstance(col,tuple):
        new_col = '_'.join(col).strip()
        new_columns.append(new_col)
    else:
        new_columns.append(col)
GDP_data.columns = new_columns
print(GDP_data.columns)
print('Columns are ',type(GDP_data.columns),' type')


Index(['Country/Territory_Country/Territory', 'IMF[1][12]_Forecast',
       'IMF[1][12]_Year', 'World Bank[13]_Estimate', 'World Bank[13]_Year',
       'United Nations[14]_Estimate', 'United Nations[14]_Year'],
      dtype='object')
Columns are  <class 'pandas.core.indexes.base.Index'>  type


#### We can see that the column headers are now basic index type and are easy to read and work with

### Cleaning the dataset

In [9]:
GDP_data.head()

Unnamed: 0,Country/Territory_Country/Territory,IMF[1][12]_Forecast,IMF[1][12]_Year,World Bank[13]_Estimate,World Bank[13]_Year,United Nations[14]_Estimate,United Nations[14]_Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


In [None]:
#Let's change the column names and make more readable
column_names = ['Country','IMF_forecast','Forecast_year','World_Bank_Estimation','World_Bank_year','UN_Estimation','UN_year']
GDP_data.columns = column_names #This works only when column_names list have the same no.of original attributes 
GDP_data.head()

Unnamed: 0,Country,IMF_forecast,Forecast_year,World_Bank_Estimation,World_Bank_year,UN_Estimation,UN_year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


#### Now the dataset looks more readale and better understandable

In [11]:
#Working with year columns
print(GDP_data['Forecast_year'].unique())
print(GDP_data['World_Bank_year'].unique())
print(GDP_data['UN_year'].unique())

['2025' '[n 1]2025' '[n 4]2025' '[n 5]2025' '2024' '[n 6]2025' '—'
 '[n 8]2025' '[n 9]2025' '[n 10]2025' '[n 11]2025' '2023']
['2023' '[n 3]2023' '—' '2022' '[n 6]2023' '[n 9]2023' '[n 10]2023' '2021'
 '[n 11]2023' '[n 12]2023']
['2022' '2023' '[n 1]2023' '—' '[n 6]2023' '[n 7]2023' '[n 9]2023'
 '[n 10]2023' '[n 11]2023' '[n 13]2023']


In [12]:
GDP_data['Forecast_year'] = GDP_data['Forecast_year'].astype(str).str.extract(r'(\d{4})')
GDP_data['World_Bank_year'] = GDP_data['World_Bank_year'].astype(str).str.extract(r'(\d{4})')
GDP_data['UN_year'] = GDP_data['UN_year'].astype(str).str.extract(r'(\d{4})')

print(GDP_data['Forecast_year'].unique())
print(GDP_data['World_Bank_year'].unique())
print(GDP_data['UN_year'].unique())

['2025' '2024' nan '2023']
['2023' nan '2022' '2021']
['2022' '2023' nan]


In [13]:
GDP_data.isnull().sum()

Country                   0
IMF_forecast              0
Forecast_year            19
World_Bank_Estimation     0
World_Bank_year          12
UN_Estimation             0
UN_year                   1
dtype: int64

In [14]:
#Replace with most occuring year
freq_year = GDP_data['Forecast_year'].mode()[0]
GDP_data.fillna(freq_year,inplace=True)
GDP_data.isnull().sum()

Country                  0
IMF_forecast             0
Forecast_year            0
World_Bank_Estimation    0
World_Bank_year          0
UN_Estimation            0
UN_year                  0
dtype: int64

#### Finally,we have a clean usable dataset for any type of analysis and visualizations

In [15]:
GDP_data.head()

Unnamed: 0,Country,IMF_forecast,Forecast_year,World_Bank_Estimation,World_Bank_year,UN_Estimation,UN_year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,2025,17794782,2023,17794782,2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


In [16]:
GDP_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Country                214 non-null    object
 1   IMF_forecast           214 non-null    object
 2   Forecast_year          214 non-null    object
 3   World_Bank_Estimation  214 non-null    object
 4   World_Bank_year        214 non-null    object
 5   UN_Estimation          214 non-null    object
 6   UN_year                214 non-null    object
dtypes: object(7)
memory usage: 11.8+ KB


In [17]:
GDP_data.describe(include='all')

Unnamed: 0,Country,IMF_forecast,Forecast_year,World_Bank_Estimation,World_Bank_year,UN_Estimation,UN_year
count,214,214,214,214,214,214,214
unique,214,195,3,203,4,214,3
top,World,—,2025,—,2023,100834796,2023
freq,1,19,209,12,186,1,212


### Convert this dataset into csv format 

In [19]:
GDP = GDP_data.to_csv(index=False)