In [24]:
import pandas as pd 

In [25]:
# Read in csv
df = pd.read_csv('/Users/kevingiesen/Library/Mobile Documents/com~apple~CloudDocs/BIPM Master/Semester 2/Big Data/Big Data Project 4/HappyGraphs/app/world_bank_data.csv')

In [26]:
# Show columns
df.columns

Index(['country', 'date', 'indicator_name', 'value'], dtype='object')

In [27]:
# Pivotize data
df = df.pivot(index=['country', 'date'], columns='indicator_name', values='value').reset_index()


In [28]:
# Remove whitespaces from column names
df = df.rename(columns=lambda x: x.strip())


In [29]:
df

indicator_name,country,date,Access to clean fuels and technologies for cooking,Access to electricity,Agricultural methane emissions,CO2 emissions,Energy use,Forest area,GDP growth % mostly above 0 (but decreasing),Inflation,...,Proportion of seats held by women in national parliaments,Refugee population,Renewable energy consumption % stagnates,Sanitation service,Scientific technical journal articles,Suicides,Total greenhouse gas emissions,Vulnerable employment female,Vulnerable employment male,"Vulnerable employment, total"
0,Afghanistan,1960,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,1961,,,,,,,,,...,,,,,,,,,,
2,Afghanistan,1962,,,,,,,,,...,,,,,,,,,,
3,Afghanistan,1963,,,,,,,,,...,,,,,,,,,,
4,Afghanistan,1964,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16412,Zimbabwe,2017,29.8,44.178635,6246.328,0.700660,,45.451183,4.080264,0.893962,...,32.575758,7566.0,82.46,36.941673,334.71,15.0,,76.110930,55.602293,66.126852
16413,Zimbabwe,2018,29.9,45.572647,6416.155,0.822618,,45.332093,5.009867,10.618866,...,31.481481,7795.0,80.23,36.357160,406.23,14.0,,75.989536,55.486752,66.002542
16414,Zimbabwe,2019,30.1,46.781475,6535.140,0.765887,,45.213002,-6.332446,255.304991,...,31.851852,8956.0,81.50,35.774337,431.62,14.1,,76.875290,57.563610,67.463783
16415,Zimbabwe,2020,30.4,52.747669,,,,45.093912,-7.816951,557.201817,...,31.851852,9261.0,,35.192363,480.16,,,77.471420,58.673250,68.249560


In [30]:
# Drop columns with to many Nulls
#df.drop(['Tobacco use', 'Births attended by skilled health staff', 'Poverty gap', 'Marine protected areas', 'Urban land area', 'Literacy rate'], axis=1, inplace=True)


In [31]:
# Delete data for the year 2022 because there is nearly no data 
df = df[df['date'] < 2022]

In [32]:
# Restructuring the df
df_restructured = pd.melt(df, id_vars=['country', 'date'], var_name='indicator_name', value_name='value')


In [33]:
# Print data
df_restructured

Unnamed: 0,country,date,indicator_name,value
0,Afghanistan,1960,Access to clean fuels and technologies for coo...,
1,Afghanistan,1961,Access to clean fuels and technologies for coo...,
2,Afghanistan,1962,Access to clean fuels and technologies for coo...,
3,Afghanistan,1963,Access to clean fuels and technologies for coo...,
4,Afghanistan,1964,Access to clean fuels and technologies for coo...,
...,...,...,...,...
426837,Zimbabwe,2017,"Vulnerable employment, total",66.126852
426838,Zimbabwe,2018,"Vulnerable employment, total",66.002542
426839,Zimbabwe,2019,"Vulnerable employment, total",67.463783
426840,Zimbabwe,2020,"Vulnerable employment, total",68.249560


# Data cleaning in four steps

# First cleaning step 
delete indicators for countries if there are less then 10 entries

In [34]:
# List of all indicators 
indicators = df_restructured['indicator_name'].unique()

# List of all countries
countries = df_restructured['country'].unique()


In [35]:
# Dictionary to store which countriy indicators have been deleted 
country_del_indicators = []

# Loop over all countries and indicators
for country in countries:
    for indicator in indicators:

        # Count not NA's for a countries indicator
        not_null_count = df_restructured.loc[(df_restructured["country"] == country) & (df_restructured["indicator_name"] == indicator), "value"].count()

        # If there are less then 10 datapoints delete indicator for country
        if not_null_count < 10: 
            # Create a string for the deleted country indicator
            country_indi = country + '_' + indicator
            # Store deleted country indicator 
            country_del_indicators = country_del_indicators + [country_indi]
           
            # Remove the rows corresponding to the indicator for the country
            df_restructured = df_restructured.loc[(df_restructured["country"] != country) | (df_restructured["indicator_name"] != indicator)]


In [36]:
# Show deleted indicators for country
country_del_indicators

['Afghanistan_Energy use',
 'Africa Eastern and Southern_Scientific technical journal articles',
 'Africa Eastern and Southern_Total greenhouse gas emissions',
 'Africa Western and Central_Scientific technical journal articles',
 'Africa Western and Central_Total greenhouse gas emissions',
 'American Samoa_Access to clean fuels and technologies for cooking',
 'American Samoa_Access to electricity',
 'American Samoa_Agricultural methane emissions',
 'American Samoa_CO2 emissions',
 'American Samoa_Energy use',
 'American Samoa_Inflation',
 'American Samoa_Labor force female',
 'American Samoa_Labor force total',
 'American Samoa_Life expectancy',
 'American Samoa_Military expenditure',
 'American Samoa_Mortality caused by road traffic',
 'American Samoa_Proportion of seats held by women in national parliaments',
 'American Samoa_Refugee population',
 'American Samoa_Scientific technical journal articles',
 'American Samoa_Suicides',
 'American Samoa_Vulnerable employment female',
 'Amer

In [37]:
# Show Null values
df_restructured.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383457 entries, 0 to 426841
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   country         383457 non-null  object 
 1   date            383457 non-null  int64  
 2   indicator_name  383457 non-null  object 
 3   value           199716 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 14.6+ MB


# Second cleaning step 
Clean columns accorinding to Big Data Project.xlsx sheet indicators. If there is not a lot of data in some years for a feature the nulls will be deleted

In [38]:
# Store copy
df_dels = df_restructured.copy()

In [39]:
df_dels.indicator_name.unique()

array(['Access to clean fuels and technologies for cooking',
       'Access to electricity', 'Agricultural methane emissions',
       'CO2 emissions', 'Energy use', 'Forest area',
       'GDP growth % mostly above 0 (but decreasing)', 'Inflation',
       'Labor force female', 'Labor force total', 'Life expectancy',
       'Military expenditure', 'Mortality caused by road traffic',
       'Open defecation',
       'People using at least basic drinking water services',
       'Population density',
       'Proportion of seats held by women in national parliaments',
       'Refugee population', 'Renewable energy consumption % stagnates',
       'Sanitation service', 'Scientific technical journal articles',
       'Suicides', 'Total greenhouse gas emissions',
       'Vulnerable employment female', 'Vulnerable employment male',
       'Vulnerable employment, total'], dtype=object)

In [40]:
# Drop Null rows for People using at least basic drinking water services year, suicides, open defecation sanitation service before 2000
df_dels.drop(df_dels.loc[((df_dels["indicator_name"] == 'People using at least basic drinking water services') | 
                          (df_dels["indicator_name"] == 'Suicides') | 
                          (df_dels["indicator_name"] == 'Open defecation') | 
                          (df_dels["indicator_name"] == 'Sanitation service')) & 
                         (df_dels["date"] < 2000) & 
                         df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for Vulnerable employment indicators before 1991
df_dels.drop(df_dels.loc[((df_dels["indicator_name"] == 'Vulnerable employment male') | 
                          (df_dels["indicator_name"] == 'Vulnerable employment female') | 
                          ((df_dels["indicator_name"] == 'Vulnerable employment, total') & 
                           (df_dels["date"] < 1991))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for inflation  before 1984
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Inflation') &
                           (df_dels["date"] < 1984))) & 
                           df_dels["value"].isnull()].index, inplace=True)


# Drop Null rows for GDP growth  before 71
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'GDP growth % mostly above 0 (but decreasing)') &
                           (df_dels["date"] < 1971))) & 
                           df_dels["value"].isnull()].index, inplace=True)



# Drop Null rows for Labor force indicators before 1989
df_dels.drop(df_dels.loc[((df_dels["indicator_name"] == 'Labor force total') | 
                          ((df_dels["indicator_name"] == 'Labor force female') & 
                           (df_dels["date"] < 1991))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for Military expentiture force indicators before 1981
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Military expenditure') &
                           (df_dels["date"] < 1981))) & 
                           df_dels["value"].isnull()].index, inplace=True)


# Drop Null rows for proportion of seats held by women in national parliaments before 1997
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Proportion of seats held by women in national parliaments') &
                           (df_dels["date"] < 1997))) & 
                           df_dels["value"].isnull()].index, inplace=True)


# Drop Null rows for scientific technical journal articles before 1996
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Scientific technical journal articles') &
                           (df_dels["date"] < 1996))) & 
                           df_dels["value"].isnull()].index, inplace=True)


# Drop Null rows for mortality caused by road traffic before 2000
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Mortality caused by road traffic') &
                           (df_dels["date"] < 2000))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for access to electricity before 1994
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Access to electricity') &
                           (df_dels["date"] < 1994))) & 
                           df_dels["value"].isnull()].index, inplace=True)


# Drop Null rows for access to clean fuels and technologies for cooking before 2000
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Access to clean fuels and technologies for cooking') &
                           (df_dels["date"] < 2000))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for refugee population, forest area and agricultural methane emissions before 2000
df_dels.drop(df_dels.loc[((df_dels["indicator_name"] == 'Refugee population') | 
                          (df_dels["indicator_name"] == 'Forest area') | 
                          (df_dels["indicator_name"] == 'CO2 emissions') | 
                          (df_dels["indicator_name"] == 'Renewable energy consumption % stagnates') | 
                          ((df_dels["indicator_name"] == 'Agricultural methane emissions') & 
                           (df_dels["date"] < 1990))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for Energy use before 1971
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Energy use') &
                           (df_dels["date"] < 1971))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for Total greenhouse gas emissions before 1991
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Total greenhouse gas emissions') &
                           (df_dels["date"] < 1991))) & 
                           df_dels["value"].isnull()].index, inplace=True)

# Drop Null rows for Population density before 1961
df_dels.drop(df_dels.loc[(((df_dels["indicator_name"] == 'Population density') &
                           (df_dels["date"] < 1961))) & 
                           df_dels["value"].isnull()].index, inplace=True)


In [41]:
# Show remaining Nulls
df_dels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 212166 entries, 40 to 426841
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   country         212166 non-null  object 
 1   date            212166 non-null  int64  
 2   indicator_name  212166 non-null  object 
 3   value           199716 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 8.1+ MB


# Third step filling with linear regression
Check the Proportion of not Null Values to Null Values. If the not Null Data for an country-indicator is 70% or higher the Null Values will be filled by a Linear Regression.  
If a countries Proportion of Not Nulls is lower than 30% the indicator will be deleted for the country

In [42]:
# List of all indicators 
indicators = df_dels['indicator_name'].unique()

# List of all countries
countries = df_dels['country'].unique()

# Loop over all countries and indicators
for country in countries:
    for indicator in indicators:

        # Calculate the proportian of Nulls 
        # Count not null values for the specified country and indicator
        not_null_count = df_dels.loc[(df_dels['country'] == country) & (df_dels['indicator_name'] == indicator), 'value'].count()

        # Count total values for the specified country and indicator
        total_count = df_dels.loc[(df_dels['country'] == country) & (df_dels['indicator_name'] == indicator), 'value'].size

        # Calculate the proportion of not null
        not_null_proportion = not_null_count/total_count



        # If the not null proportian for a country is less then 30% delete the indicator for the country
        if not_null_proportion < 0.3: 
            # Create a string for the deleted country indicator
            country_indi = country + '_' + indicator
            # Store deleted country indicator 
            country_del_indicators = country_del_indicators + [country_indi]
           
            # Remove the rows corresponding to the indicator for the country
            df_dels = df_dels.loc[(df_dels["country"] != country) | (df_dels["indicator_name"] != indicator)]
            print(country_indi)

        # If the not Null proportian for a country is higher then 70% fill the Nulls with a Linear Regression
        elif not_null_proportion >= 0.70: 
            df_dels.loc[(df_dels['country'] == country) & (df_dels['indicator_name'] == indicator), 'value'].interpolate(method='linear', inplace=True)

            


  not_null_proportion = not_null_count/total_count


In [43]:
df_dels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 212166 entries, 40 to 426841
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   country         212166 non-null  object 
 1   date            212166 non-null  int64  
 2   indicator_name  212166 non-null  object 
 3   value           199716 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 8.1+ MB


# Fourth Step: Deleting the remaining NA's
If there are still empty rows within the Dataframe simply drop the rows 

In [44]:
# Drop remaining NA's
df_cleaned = df_dels.dropna()

In [45]:
# Store csv
#df_cleaned.to_csv('/Users/kevingiesen/Library/Mobile Documents/com~apple~CloudDocs/BIPM Master/Semester 2/Big Data/Big Data Project 2/SS23-BIPM-Big-Data-Group-KMJ-Do-Gooders/app/world_bank_data_clean_v2.csv', index=False)