# Import 2014-2024 Dataset into Python

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel("INFORM2024_TREND_2014_2023_v67_ALL.xlsx")

In [3]:
df.head()

Unnamed: 0,Iso3,IndicatorId,IndicatorName,IndicatorScore,SurveyYear,Indicator Type,INFORMYear
0,AFG,AFF_DR,People affected by drought (absolute) - raw,886000.0,2022,Core Indicators,2024
1,AGO,AFF_DR,People affected by drought (absolute) - raw,197920.457143,2022,Core Indicators,2024
2,ALB,AFF_DR,People affected by drought (absolute) - raw,91428.571429,2022,Core Indicators,2024
3,ARE,AFF_DR,People affected by drought (absolute) - raw,0.0,2022,Core Indicators,2024
4,ARG,AFF_DR,People affected by drought (absolute) - raw,1000.914286,2022,Core Indicators,2024


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473668 entries, 0 to 473667
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Iso3            473668 non-null  object 
 1   IndicatorId     473668 non-null  object 
 2   IndicatorName   472511 non-null  object 
 3   IndicatorScore  473668 non-null  float64
 4   SurveyYear      473668 non-null  int64  
 5   Indicator Type  473668 non-null  object 
 6   INFORMYear      473668 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 25.3+ MB


In [5]:
df.isna().sum()

Iso3                 0
IndicatorId          0
IndicatorName     1157
IndicatorScore       0
SurveyYear           0
Indicator Type       0
INFORMYear           0
dtype: int64

## Checking the number of categories in respective columns

In [6]:
print("There are " + str(len(df['Iso3'].unique())) + " countries / territories in the dataframe")

sortISO = sorted(df['Iso3'].unique())
print("Country ISOs in the dataframe: " + str(sortISO))

There are 191 countries / territories in the dataframe
Country ISOs in the dataframe: ['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NRU', 'NZ

In [7]:
print("There are " + str(len(df['IndicatorId'].unique())) + " indicator IDs in the dataframe")

#sortId = sorted(df['IndicatorId'].unique())
#print("Indicator IDs in the dataframe: " + str(sortId))

There are 272 indicator IDs in the dataframe


In [33]:
print("There are " + str(len(df['IndicatorName'].unique())) + " indicator names in the dataframe")
#there are missing values in indicator name column - better not to split on this column
#sortName = sorted(df['IndicatorName'].unique())
#indicator_names_str = ", ".join(sortName)
#print("Indicator names in the dataframe: " + indicator_names_str)

print("There are " + str(len(df['Indicator Type'].unique())) + " indicator types in the dataframe")

print("There are " + str(len(df['SurveyYear'].unique())) + " survey years in the dataframe")

print("There are " + str(len(df['INFORMYear'].unique())) + " INFORM Index years in the dataframe")

There are 239 indicator names in the dataframe
There are 2 indicator types in the dataframe
There are 29 survey years in the dataframe
There are 10 INFORM Index years in the dataframe


## Splitting the dataframe into multiple dataframes based on different column categories

#### Split on Indicator Type 
- 2 categories
- Store as 2 new dataframes
- Keep original dataframe
- Export new dataframes to desktop

In [9]:
# Grouping by 'Indicator Type'
grouped = df.groupby('Indicator Type')

# Creating a dictionary to store DataFrames
dfs = {}

# Splitting and storing DataFrames based on category
for category, group_df in grouped:
    dfs[category] = group_df.copy()

# Accessing the new DataFrames and saving them
for category, new_df in dfs.items():
    print(f"DataFrame for Indicator Types {category}:")
    print(new_df)
    print()  # line to space between dataframes

DataFrame for Indicator Types Core Indicators:
       Iso3        IndicatorId  \
0       AFG             AFF_DR   
1       AGO             AFF_DR   
2       ALB             AFF_DR   
3       ARE             AFF_DR   
4       ARG             AFF_DR   
...     ...                ...   
473663  GNB  EN.POP.SLUM.UR.ZS   
473664  GTM  EN.POP.SLUM.UR.ZS   
473665  GUY  EN.POP.SLUM.UR.ZS   
473666  HTI  EN.POP.SLUM.UR.ZS   
473667  IDN  EN.POP.SLUM.UR.ZS   

                                            IndicatorName  IndicatorScore  \
0             People affected by drought (absolute) - raw   886000.000000   
1             People affected by drought (absolute) - raw   197920.457143   
2             People affected by drought (absolute) - raw    91428.571429   
3             People affected by drought (absolute) - raw        0.000000   
4             People affected by drought (absolute) - raw     1000.914286   
...                                                   ...             ...   
47366

In [18]:
import os

# Grouping by 'Indicator Type'
grouped = df.groupby('Indicator Type')

# Creating a dictionary to store DataFrames
dfs = {}

# Splitting and storing DataFrames based on category
for category, group_df in grouped:
    dfs[category] = group_df.copy()

# Define the folder path where you want to save the DataFrames
folder_path = '~/Desktop/CodeOp/DSF02/Group Project'  # Replace this with your desired folder path

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Accessing the new DataFrames and saving them
for category, new_df in dfs.items():
    # Construct the file path
    file_path = os.path.join(folder_path, f"{category}.csv")

    # Save each DataFrame as a separate CSV file in the specified folder
    new_df.to_csv(file_path, index=False)
    print(f"DataFrame for Indicator Types {category} saved as {file_path}")

DataFrame for Indicator Types Core Indicators saved as ~/Desktop/CodeOp/DSF02/Group Project\Core Indicators.csv
DataFrame for Indicator Types INORM Index saved as ~/Desktop/CodeOp/DSF02/Group Project\INORM Index.csv


In [19]:
# Accessing the new DataFrames and printing info
for category, new_df in dfs.items():
    print(f"DataFrame for Indicator Types {category}:")
    print(new_df.info())
    print()

DataFrame for Indicator Types Core Indicators:
<class 'pandas.core.frame.DataFrame'>
Index: 147420 entries, 0 to 473667
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Iso3            147420 non-null  object 
 1   IndicatorId     147420 non-null  object 
 2   IndicatorName   147420 non-null  object 
 3   IndicatorScore  147420 non-null  float64
 4   SurveyYear      147420 non-null  int64  
 5   Indicator Type  147420 non-null  object 
 6   INFORMYear      147420 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 9.0+ MB
None

DataFrame for Indicator Types INORM Index:
<class 'pandas.core.frame.DataFrame'>
Index: 326248 entries, 21 to 473143
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Iso3            326248 non-null  object 
 1   IndicatorId     326248 non-null  object 
 2   IndicatorName   325091 non-n

### Import Dataframes - Use a New Worksheet for each one...
#### Split New Dataframes on INFORMYear 
- 10 categories x 2 dataframes
- Store as (10 x 2) 20 new dataframes
- Keep original dataframes
- Export new dataframes to Project folder

In [22]:
df1 = pd.read_csv("INORM Index.csv")

In [23]:
df2 = pd.read_csv("Core Indicators.csv")

In [28]:
# Grouping by 'INFORMYear'
grouped = df1.groupby('INFORMYear')

# Creating a dictionary to store DataFrames
dfs_year = {}

# Splitting and storing DataFrames based on category
for category, group_df1 in grouped:
    dfs_year[category] = group_df1.copy()

# Accessing the new DataFrames and saving them
for category, new_df in dfs_year.items():
    print(f"DataFrame for INFORM Year {category}:")
    print(new_df)
    print()  # line to space between dataframes

DataFrame for INFORM Year 2015:
       Iso3 IndicatorId                                 IndicatorName  \
293544  AFG  AFF_DR_REL  People affected by droughts (relative) - raw   
293545  AGO  AFF_DR_REL  People affected by droughts (relative) - raw   
293546  ALB  AFF_DR_REL  People affected by droughts (relative) - raw   
293547  ARE  AFF_DR_REL  People affected by droughts (relative) - raw   
293548  ARG  AFF_DR_REL  People affected by droughts (relative) - raw   
...     ...         ...                                           ...   
326080  VCT      INFORM                             INFORM Risk Index   
326081  BLZ      INFORM                             INFORM Risk Index   
326082  KIR      INFORM                             INFORM Risk Index   
326083  LIE      INFORM                             INFORM Risk Index   
326084  TUV      INFORM                             INFORM Risk Index   

        IndicatorScore  SurveyYear Indicator Type  INFORMYear  
293544        0.568526     

In [29]:
# Grouping INFORM Index Dataframe by 'INFORM Year'
group_year = df1.groupby('INFORMYear')

# Creating a dictionary to store DataFrames
df1_year = {}

# Splitting and storing DataFrames based on category
for category, group_df in group_year:
    df1_year[category] = group_df1.copy()

# Define the folder path where you want to save the DataFrames
folder_path = '~/Desktop/CodeOp/DSF02/Group Project'  # Replace this with your desired folder path

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Accessing the new DataFrames and saving them
for category, new_df in df1_year.items():
    # Construct the file path
    file_path = os.path.join(folder_path, f"{category}.csv")

    # Save each DataFrame as a separate CSV file in the specified folder
    new_df.to_csv(file_path, index=False)
    print(f"DataFrame for INFORM Year {category} saved as {file_path}")

DataFrame for INFORM Year 2015 saved as ~/Desktop/CodeOp/DSF02/Group Project\2015.csv
DataFrame for INFORM Year 2016 saved as ~/Desktop/CodeOp/DSF02/Group Project\2016.csv
DataFrame for INFORM Year 2017 saved as ~/Desktop/CodeOp/DSF02/Group Project\2017.csv
DataFrame for INFORM Year 2018 saved as ~/Desktop/CodeOp/DSF02/Group Project\2018.csv
DataFrame for INFORM Year 2019 saved as ~/Desktop/CodeOp/DSF02/Group Project\2019.csv
DataFrame for INFORM Year 2020 saved as ~/Desktop/CodeOp/DSF02/Group Project\2020.csv
DataFrame for INFORM Year 2021 saved as ~/Desktop/CodeOp/DSF02/Group Project\2021.csv
DataFrame for INFORM Year 2022 saved as ~/Desktop/CodeOp/DSF02/Group Project\2022.csv
DataFrame for INFORM Year 2023 saved as ~/Desktop/CodeOp/DSF02/Group Project\2023.csv
DataFrame for INFORM Year 2024 saved as ~/Desktop/CodeOp/DSF02/Group Project\2024.csv


In [32]:
# Grouping by 'INFORMYear'
grouped_df2 = df2.groupby('INFORMYear')

# Creating a dictionary to store DataFrames
df2_year = {}

# Splitting and storing DataFrames based on category
for category, group_df2 in grouped:
    df2_year[category] = group_df2.copy()

# Accessing the new DataFrames and saving them
for category, new_df in df2_year.items():
    print(f"Core Indicators dataFrame for INFORM Year {category}:")
    print(new_df)
    print()  # line to space between dataframes

# Splitting and storing DataFrames based on category
for category, group_df in grouped_df2:
    df2_year[category] = group_df2.copy()

# Define the folder path where you want to save the DataFrames
folder_path = '~/Desktop/CodeOp/DSF02/Group Project'  # Replace this with your desired folder path

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Accessing the new DataFrames and saving them
for category, new_df in df2_year.items():
    # Construct the file path
    file_path = os.path.join(folder_path, f"{category}.csv")

    # Save each DataFrame as a separate CSV file in the specified folder
    new_df.to_csv(file_path, index=False)
    print(f"Core Indicators DataFrame for INFORM Year {category} saved as {file_path}")

Core Indicators dataFrame for INFORM Year 2015:
       Iso3 IndicatorId                                 IndicatorName  \
293544  AFG  AFF_DR_REL  People affected by droughts (relative) - raw   
293545  AGO  AFF_DR_REL  People affected by droughts (relative) - raw   
293546  ALB  AFF_DR_REL  People affected by droughts (relative) - raw   
293547  ARE  AFF_DR_REL  People affected by droughts (relative) - raw   
293548  ARG  AFF_DR_REL  People affected by droughts (relative) - raw   
...     ...         ...                                           ...   
326080  VCT      INFORM                             INFORM Risk Index   
326081  BLZ      INFORM                             INFORM Risk Index   
326082  KIR      INFORM                             INFORM Risk Index   
326083  LIE      INFORM                             INFORM Risk Index   
326084  TUV      INFORM                             INFORM Risk Index   

        IndicatorScore  SurveyYear Indicator Type  INFORMYear  
293544     

Core Indicators DataFrame for INFORM Year 2016 saved as ~/Desktop/CodeOp/DSF02/Group Project\2016.csv
Core Indicators DataFrame for INFORM Year 2017 saved as ~/Desktop/CodeOp/DSF02/Group Project\2017.csv
Core Indicators DataFrame for INFORM Year 2018 saved as ~/Desktop/CodeOp/DSF02/Group Project\2018.csv
Core Indicators DataFrame for INFORM Year 2019 saved as ~/Desktop/CodeOp/DSF02/Group Project\2019.csv
Core Indicators DataFrame for INFORM Year 2020 saved as ~/Desktop/CodeOp/DSF02/Group Project\2020.csv
Core Indicators DataFrame for INFORM Year 2021 saved as ~/Desktop/CodeOp/DSF02/Group Project\2021.csv
Core Indicators DataFrame for INFORM Year 2022 saved as ~/Desktop/CodeOp/DSF02/Group Project\2022.csv
Core Indicators DataFrame for INFORM Year 2023 saved as ~/Desktop/CodeOp/DSF02/Group Project\2023.csv
Core Indicators DataFrame for INFORM Year 2024 saved as ~/Desktop/CodeOp/DSF02/Group Project\2024.csv


In [None]:
#catIso = df['Iso3'].unique()
#catName = df['IndicatorName'].unique()
#catScore = df['IndicatorScore'].unique()
#catYear = df['SurveyYear'].unique()
#catType = df['Indicator Type'].unique()
#catInformYear = df['INFORMYear'].unique()

In [None]:
# This would create new columns, with dummy variables for different categories...
# Step 1: Get unique values from the column you want to split
#unique_indicator_type = df['Indicator Type'].unique()

# Step 2: Create new columns based on unique values
#for category in unique_indicator_type:
    # Create a new column for each unique category
#    df['Type'] = 0
    
# Step 3: Assign values to newly created columns, this step isn't working well... 
# check: value = row['Value']
#for index, row in df.iterrows():
#    category = row['Indicator Type']
#    value = row['Value']
#    df.at[index, category] = value

#print(df)

In [None]:
#group_df.info()