# Transform and clean data from World Bank

### data from the ZIP file source : https://datacatalog.worldbank.org/search/dataset/0037712/World-Development-Indicators

because of the large size of the file, we need to use the long data filtred by indicator, country, of years

1) extract data from sheet : data
2) unpivot the data and use years as long
3) drop NAN values

4) FILTER the data
     - filter the years
        3 options : 1 must be selected (1, 2 or 3)
       
     - filter the countries
        from xls file : Resources_external\selection_countries.xlsx
     - filter the indicators
        from xls file : Resources_external\selection_indicators.xlsx

extract indicators : filter of needed indicators and their assignment to categories




10) generate a csv file for each category of indicators

In [18]:
import pandas as pd

In [19]:
# import data

path = '../Resources_external/WDI_EXCEL_2025_01_28.xlsx'
sheet  = 'Data'
data_df = pd.read_excel(path, sheet_name=sheet )

data_df.head(5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,17.488497,18.001597,18.558234,19.043572,19.586457,20.192064,20.828814,21.372164,22.100884,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.811504,7.096003,7.406706,7.666648,8.020952,8.403358,8.718306,9.097176,9.473374,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,38.15209,38.488233,38.779953,39.068462,39.445526,39.818645,40.276374,40.687817,41.211606,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.871956,33.922276,38.859598,40.223744,43.035073,44.390861,46.282371,48.127211,48.742043,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,17.672943,16.527554,24.627753,25.432092,27.061929,29.154282,31.022083,32.809138,33.760782,


In [20]:
data_df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023'],
      dtype='object')

In [21]:
shape_original = data_df.shape

# unpivot and get the years as a column

data_lg_df = pd.melt(
                    data_df, 
                    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
                    value_vars=['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
                                '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
                                '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
                                '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
                                '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
                                '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
                                '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
                                '2023'],
                     var_name='years' ,           
                     value_name='Value' )

shape_unpivot = data_lg_df.shape

# drop NAN values

data_lg_df  = data_lg_df.dropna(how='any')

#rename some columns

data_lg_df  = data_lg_df.rename(columns={
                                        'Country Name' : 'country'  , 
                                        'Indicator Name' : 'indicator'
                                        })

## convert years to int

data_lg_df = data_lg_df.astype({'years' : 'int32'})

## rest index (in case we need to create a dataBase)
data_lg_df = data_lg_df.reset_index()

shape_cleanna = data_lg_df.shape

data_lg_df.head()

Unnamed: 0,index,country,Country Code,indicator,Indicator Code,years,Value
0,50,Africa Eastern and Southern,AFE,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,135.793291
1,56,Africa Eastern and Southern,AFE,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.967697
2,57,Africa Eastern and Southern,AFE,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,5.631542
3,58,Africa Eastern and Southern,AFE,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,82.969998
4,84,Africa Eastern and Southern,AFE,Aquaculture production (metric tons),ER.FSH.AQUA.MT,1960,380.0


In [22]:
print(f'shape_oringinal : {shape_original}')
print(f'shape_unpivot : {shape_unpivot}')
print(f'shape_cleanna : {shape_cleanna}')

shape_oringinal : (397936, 68)
shape_unpivot : (25467904, 6)
shape_cleanna : (8888933, 7)


## filter the data

### filter the years

Select the right option

In [45]:
# # choose one option of filter years
    # 1 = 'filter from year x'
    # 2 = 'filter from year x to year Y'
    # 3 = 'filter serie of years'

selection_type = 1

data_filtred = data_lg_df.copy()

# define needed variables
year_st = 2015
year_end = 2025
year_serie = [
                2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022,
                2023]


if selection_type == 1 : 
                lib_years = f'from_{year_st}'   # to be included in the file name
                date = (data_filtred['years']>year_st)
                data_filtred = data_filtred[date]    

elif selection_type == 2 :

                lib_years = f'btween_{year_st}_{year_end}'   # to be included in the file name
                date = (data_lg_df['years']>year_st) & (data_lg_df['years']<year_end)
                data_filtred = data_filtred[date] 

elif selection_type == 3 :
                lib_years = f'{len(year_serie)}_years_btween_{min(year_serie)}_{max(year_serie)}'    # to be included in the file name
                date = (data_lg_df['years'].isin(year_serie)) 
                data_filtred = data_filtred[date] 


data_filtred.head()

Unnamed: 0,index,country,Country Code,indicator,Indicator Code,years,Value
7270501,22284416,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,2016,18.558234
7270502,22284417,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,2016,7.406706
7270503,22284418,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,2016,38.779953
7270504,22284419,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,2016,38.859598
7270505,22284420,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,2016,24.627753


## filter the countries

In [46]:
# import indicators selection

path = "../Resources_external/selection_countries.xlsx"
select_countries = pd.read_excel(path)

## filter the selected indicators
x = (select_countries['select'] == 'y')
select_countries = select_countries[x]
## drop column 'select'
select_countries = select_countries.drop(columns = {'select'})

lib_countries = len(select_countries['country'].unique())
select_countries.head(5)

Unnamed: 0,iso3Code,iso2Code,country,region,capitalCity
0,ARE,AE,United Arab Emirates,Middle East & North Africa,Abu Dhabi
1,BHR,BH,Bahrain,Middle East & North Africa,Manama
2,DJI,DJ,Djibouti,Middle East & North Africa,Djibouti
3,DZA,DZ,Algeria,Middle East & North Africa,Algiers
4,EGY,EG,"Egypt, Arab Rep.",Middle East & North Africa,Cairo


In [47]:
# filter with selected countries
data_filtred = pd.merge(data_filtred, select_countries, on='country')
print(data_filtred.shape)

(1359149, 11)


## filter the indicators

In [48]:
# import indicators selection

path = "../Resources_external/selection_indicators.xlsx"
select_indicators = pd.read_excel(path)

## filter the selected indicators
x = (select_indicators['select'] == 'y')
select_indicators = select_indicators[x]
## drop column 'select'
select_indicators = select_indicators.drop(columns = {'select'})

lib_indicators = len(select_indicators['indicator'].unique())
select_indicators.head(5)

Unnamed: 0,indicator,Indicator Code,Category
0,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,energy
3,Access to electricity (% of population),EG.ELC.ACCS.ZS,energy
14,Account ownership at a financial institution o...,FX.OWN.TOTL.YG.ZS,population
51,Adolescents out of school (% of lower secondar...,SE.SEC.UNER.LO.ZS,education
52,"Adolescents out of school, female (% of female...",SE.SEC.UNER.LO.FE.ZS,education


In [49]:
# filter with selected indicators
data_filtred = pd.merge(data_filtred, select_indicators, on='indicator')
print(data_filtred.shape)

(215556, 13)


In [50]:
data_filtred.head()

Unnamed: 0,index,country,Country Code,indicator,Indicator Code_x,years,Value,iso3Code,iso2Code,region,capitalCity,Indicator Code_y,Category
0,22357720,Afghanistan,AFG,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,2016,28.8,AFG,AF,South Asia,Kabul,EG.CFT.ACCS.ZS,energy
1,22357723,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,2016,97.7,AFG,AF,South Asia,Kabul,EG.ELC.ACCS.ZS,energy
2,22357779,Afghanistan,AFG,Agricultural irrigated land (% of total agricu...,AG.LND.IRIG.AG.ZS,2016,6.48114,AFG,AF,South Asia,Kabul,AG.LND.IRIG.AG.ZS,agriculture
3,22357780,Afghanistan,AFG,Agricultural land (% of land area),AG.LND.AGRI.ZS,2016,58.123668,AFG,AF,South Asia,Kabul,AG.LND.AGRI.ZS,agriculture
4,22357781,Afghanistan,AFG,Agricultural land (sq. km),AG.LND.AGRI.K2,2016,379100.0,AFG,AF,South Asia,Kabul,AG.LND.AGRI.K2,agriculture


In [51]:
data_filtred.columns

Index(['index', 'country', 'Country Code', 'indicator', 'Indicator Code_x',
       'years', 'Value', 'iso3Code', 'iso2Code', 'region', 'capitalCity',
       'Indicator Code_y', 'Category'],
      dtype='object')

In [52]:
data_filtred = data_filtred.drop(columns=['Country Code', 'Indicator Code_x',
       'iso2Code', 'region', 'capitalCity', 'Category'
       ])
data_filtred = data_filtred.rename(columns={'Indicator Code_y' : 'Indicator Code'})
    

data_filtred = data_filtred[['index','country', 'indicator','Indicator Code', 'years', 'Value']]


data_filtred

Unnamed: 0,index,country,indicator,Indicator Code,years,Value
0,22357720,Afghanistan,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,2016,28.800000
1,22357723,Afghanistan,Access to electricity (% of population),EG.ELC.ACCS.ZS,2016,97.700000
2,22357779,Afghanistan,Agricultural irrigated land (% of total agricu...,AG.LND.IRIG.AG.ZS,2016,6.481140
3,22357780,Afghanistan,Agricultural land (% of land area),AG.LND.AGRI.ZS,2016,58.123668
4,22357781,Afghanistan,Agricultural land (sq. km),AG.LND.AGRI.K2,2016,379100.000000
...,...,...,...,...,...,...
215551,25467843,Zimbabwe,Unemployment with advanced education (% of tot...,SL.UEM.ADVN.ZS,2023,5.949000
215552,25467846,Zimbabwe,Unemployment with basic education (% of total ...,SL.UEM.BASC.ZS,2023,8.581000
215553,25467849,Zimbabwe,Unemployment with intermediate education (% of...,SL.UEM.INTM.ZS,2023,11.282000
215554,25467856,Zimbabwe,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,2023,8.759000


## save as csv

In [53]:
path = f'../Resources_Output/macro_economic_data_long_filtred.csv'
data_filtred.to_csv(path)

# Report
print(f'file printed for ')
print(f'Years :  {lib_years}')
print(f'number of Countries : {lib_countries}  ')
print(f'Number of Indicators : {lib_indicators} ')


file printed for 
Years :  from_2015
number of Countries : 217  
Number of Indicators : 183 
