In [1]:
import warnings

# Install a catch-all "ignore" filter so no warning of any category is shown
# for the remainder of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas;
# consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
# from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced

### Next, we test our model on new data. To do so, we first transform the columns of "Validation_Data.csv".

In [4]:
# Location of the raw validation dataset (CSV hosted on GitHub).
filepath = "https://raw.githubusercontent.com/pavlarsen/Final_Project/main/Resources/Data/Validation_Data.csv"

# Read the CSV and discard the three columns that are not carried forward.
# NOTE(review): on_bad_lines="skip" drops malformed rows without reporting them.
# NOTE(review): read_csv's default NA handling presumably parses the literal
# text "NA" as missing, which would explain why Namibia's country code shows
# up as NaN in a later cell -- confirm against the raw file.
validation_df = (
    pd.read_csv(filepath, on_bad_lines="skip")
    .drop(columns=["Row", "FIPS", "Area in Square Kilometers"])
)
validation_df.head()

Unnamed: 0,GENC,Country/Area Name,Year,Population,Annual Growth Rate %,Population Density (People per Sq. Km.),Total Fertility Rate,Crude Birth Rate,"Life Expectancy at Birth, Both Sexes","Infant Mortality Rate, Both Sexes",Crude Death Rate,Net Migration Rate
0,AF,Afghanistan,2022,38346720,2.304,58.8,4.624,35.46,53.65,104.89,12.33,-0.1
1,AL,Albania,2022,3095344,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,-3.23
2,DZ,Algeria,2022,44178884,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,-0.82
3,AS,American Samoa,2022,45443,-1.919,229.5,2.206,16.7,75.32,10.06,6.1,-29.8
4,AD,Andorra,2022,85560,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,0.0


In [5]:
# Old header -> new header. Columns without an entry (e.g. "Year") keep
# their original name.
column_name_map = {
    "GENC": "Country_ID_Alpha",
    "Country/Area Name": "Country_Name",
    "Population": "Total_Country_Population",
    "Annual Growth Rate %": "Annual_Growth_Rate",
    "Population Density (People per Sq. Km.)": "Population_Density",
    "Total Fertility Rate": "Total_Fertility_Rate",
    "Crude Birth Rate": "Crude_Birth_Rate",
    "Life Expectancy at Birth, Both Sexes": "Life_Expectancy_at_Birth",
    "Infant Mortality Rate, Both Sexes": "Infant_Mortality_Rate",
    "Crude Death Rate": "Crude_Death_Rate",
    "Net Migration Rate": "Net_Migration_Rate",
}

# rename() returns a new frame, so validation_df itself is left untouched.
renamed_df = validation_df.rename(columns=column_name_map)
renamed_df.head()

Unnamed: 0,Country_ID_Alpha,Country_Name,Year,Total_Country_Population,Annual_Growth_Rate,Population_Density,Total_Fertility_Rate,Crude_Birth_Rate,Life_Expectancy_at_Birth,Infant_Mortality_Rate,Crude_Death_Rate,Net_Migration_Rate
0,AF,Afghanistan,2022,38346720,2.304,58.8,4.624,35.46,53.65,104.89,12.33,-0.1
1,AL,Albania,2022,3095344,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,-3.23
2,DZ,Algeria,2022,44178884,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,-0.82
3,AS,American Samoa,2022,45443,-1.919,229.5,2.206,16.7,75.32,10.06,6.1,-29.8
4,AD,Andorra,2022,85560,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,0.0


In [6]:
# Derive a 0/1 indicator from "Net_Migration_Rate":
#   1 -> rate is zero or positive, 0 -> rate is negative.
# A missing rate compares as False and therefore also yields 0.
renamed_df = renamed_df.assign(
    Migration_Flag=np.where(renamed_df["Net_Migration_Rate"].ge(0), 1, 0)
)
renamed_df.head()

Unnamed: 0,Country_ID_Alpha,Country_Name,Year,Total_Country_Population,Annual_Growth_Rate,Population_Density,Total_Fertility_Rate,Crude_Birth_Rate,Life_Expectancy_at_Birth,Infant_Mortality_Rate,Crude_Death_Rate,Net_Migration_Rate,Migration_Flag
0,AF,Afghanistan,2022,38346720,2.304,58.8,4.624,35.46,53.65,104.89,12.33,-0.1,0
1,AL,Albania,2022,3095344,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,-3.23,0
2,DZ,Algeria,2022,44178884,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,-0.82,0
3,AS,American Samoa,2022,45443,-1.919,229.5,2.206,16.7,75.32,10.06,6.1,-29.8,0
4,AD,Andorra,2022,85560,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,0.0,1


In [7]:
# Display the dtype of every column in renamed_df, including the newly added
# Migration_Flag column.
renamed_df.dtypes

Country_ID_Alpha             object
Country_Name                 object
Year                          int64
Total_Country_Population      int64
Annual_Growth_Rate          float64
Population_Density          float64
Total_Fertility_Rate        float64
Crude_Birth_Rate            float64
Life_Expectancy_at_Birth    float64
Infant_Mortality_Rate       float64
Crude_Death_Rate            float64
Net_Migration_Rate          float64
Migration_Flag                int32
dtype: object

In [8]:
# Show the rows whose country code is missing. This cell only inspects them;
# the replacement itself happens in the next cell.
missing_code_mask = renamed_df["Country_ID_Alpha"].isna()
renamed_df.loc[missing_code_mask]


Unnamed: 0,Country_ID_Alpha,Country_Name,Year,Total_Country_Population,Annual_Growth_Rate,Population_Density,Total_Fertility_Rate,Crude_Birth_Rate,Life_Expectancy_at_Birth,Infant_Mortality_Rate,Crude_Death_Rate,Net_Migration_Rate,Migration_Flag
140,,Namibia,2022,2727409,1.816,3.3,2.982,25.01,66.47,29.42,6.85,0.0,1


In [9]:
# Restore Namibia's country code ("NA"), which is NaN in renamed_df.
#
# Only Namibia's missing "Country_ID_Alpha" is filled. A frame-wide
# fillna("NA") would also write the string "NA" into any numeric column that
# happened to contain a missing value, and would hide those gaps from the
# check below.
complete_df = renamed_df.copy()
namibia_missing_code = (
    complete_df["Country_Name"].eq("Namibia")
    & complete_df["Country_ID_Alpha"].isna()
)
complete_df.loc[namibia_missing_code, "Country_ID_Alpha"] = "NA"

# Per-column count of remaining missing values; any non-zero entry is a real
# gap in the data that still needs attention.
complete_df.isna().sum()

Country_ID_Alpha            0
Country_Name                0
Year                        0
Total_Country_Population    0
Annual_Growth_Rate          0
Population_Density          0
Total_Fertility_Rate        0
Crude_Birth_Rate            0
Life_Expectancy_at_Birth    0
Infant_Mortality_Rate       0
Crude_Death_Rate            0
Net_Migration_Rate          0
Migration_Flag              0
dtype: int64

In [10]:
# Write the cleaned validation data to a CSV file in the working directory,
# leaving out the DataFrame index.
# NOTE(review): Namibia's code is written as the text "NA"; reading this file
# back with pandas' default settings will presumably parse it as missing
# again -- pass keep_default_na=False (or explicit na_values) when loading.
output_csv = "Validation_Data_Final.csv"
complete_df.to_csv(output_csv, index=False)