In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
# from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

In [4]:
# Connecting to the final database through SQL
import sqlalchemy as db
import psycopg2
from sqlalchemy import create_engine
from config import dbn, pwd

# Connect to your postgres DB
conn = psycopg2.connect(
    host="localhost",
    port = "5432",
    database= dbn,
    user="postgres",
    password= pwd)

# Open a cursor to perform database operations
cur = conn.cursor()

# Execute a query
cur.execute("SELECT * FROM migration_data")

# Retrieve query results
record = cur.fetchall()

In [5]:
# Create DF from SQL table
migration_data_df = pd.DataFrame(record, columns = ["country_id_alpha", "country_name", "year", "total_country_population", "annual_growth_rate", "country_area", "population_density", "total_fertility_rate", "crude_birth_rate", "life_expectancy_at_birth", "infant_mortality_rate", "crude_death_rate", "net_migration_rate", "migration_flag", "total_migrant_population", "migrant_male_population", "migrant_female_population"])
migration_data_df.head(10)

Unnamed: 0,country_id_alpha,country_name,year,total_country_population,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,net_migration_rate,migration_flag,total_migrant_population,migrant_male_population,migrant_female_population
0,AF,Afghanistan,1990,13568282,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,-51.23,False,57686.0,32558.0,25128.0
1,AF,Afghanistan,1995,19445013,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,2.68,True,71522.0,39105.0,32417.0
2,AF,Afghanistan,2000,22461349,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,-45.3,False,75917.0,42848.0,33069.0
3,AF,Afghanistan,2005,26332646,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,9.43,True,87314.0,49281.0,38033.0
4,AF,Afghanistan,2010,29116851,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,-3.25,False,102276.0,57726.0,44550.0
5,AF,Afghanistan,2015,32547550,2.306,652230,49.9,5.326,38.52,50.87,118.07,13.96,-1.51,False,339432.0,171550.0,167882.0
6,AF,Afghanistan,2020,36594776,2.366,652230,56.1,4.82,36.56,52.84,108.64,12.8,-0.1,False,144098.0,69189.0,74909.0
7,AL,Albania,1990,3244925,1.11,27398,118.4,3.0884,25.72,71.16,44.28,6.01,-8.61,False,66013.0,30579.0,35434.0
8,AL,Albania,1995,3158153,1.491,27398,115.3,2.9429,23.32,71.24,50.5,6.2,-2.21,False,71354.0,33284.0,38070.0
9,AL,Albania,2000,3158351,-1.019,27398,115.3,2.1716,16.43,74.7,22.62,5.39,-21.22,False,76695.0,35990.0,40705.0


In [6]:
# Reorder columns
columns=["country_id_alpha", "country_name", "year", "annual_growth_rate", "country_area", "population_density", "total_fertility_rate", "crude_birth_rate", "life_expectancy_at_birth", "infant_mortality_rate", "crude_death_rate", "total_country_population", "total_migrant_population", "migrant_male_population", "migrant_female_population", "net_migration_rate", "migration_flag"]
migration_data_df = migration_data_df[columns]
migration_data_df.head()

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag
0,AF,Afghanistan,1990,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,13568282,57686.0,32558.0,25128.0,-51.23,False
1,AF,Afghanistan,1995,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,19445013,71522.0,39105.0,32417.0,2.68,True
2,AF,Afghanistan,2000,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,22461349,75917.0,42848.0,33069.0,-45.3,False
3,AF,Afghanistan,2005,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,26332646,87314.0,49281.0,38033.0,9.43,True
4,AF,Afghanistan,2010,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,29116851,102276.0,57726.0,44550.0,-3.25,False


In [7]:
# Check for NaN values.
migration_data_df.isna().sum()

country_id_alpha              0
country_name                  0
year                          0
annual_growth_rate            0
country_area                  0
population_density            0
total_fertility_rate          0
crude_birth_rate              0
life_expectancy_at_birth      0
infant_mortality_rate         0
crude_death_rate              0
total_country_population      0
total_migrant_population     48
migrant_male_population      48
migrant_female_population    48
net_migration_rate            0
migration_flag                0
dtype: int64

In [8]:
# Identify the rows that have NaN values.
migration_data_df[migration_data_df["total_migrant_population"].isna()]

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag
335,CW,Curaçao,1995,-0.888,444,318.9,2.64,20.38,73.45,13.61,8.07,141590,,,,-21.19,False
336,CW,Curaçao,2000,-1.477,444,301.7,2.2208,15.11,75.88,10.94,7.49,133963,,,,-22.39,False
337,CW,Curaçao,2005,1.12,444,306.4,2.2316,14.07,76.19,10.63,8.01,136036,,,,5.15,True
551,GG,Guernsey,1995,-0.29,78,781.8,1.3881,10.63,77.65,3.06,10.82,60977,,,,-2.71,False
552,GG,Guernsey,2000,0.344,78,794.2,1.4698,10.67,79.4,4.78,9.46,61948,,,,2.23,True
553,GG,Guernsey,2005,0.486,78,809.4,1.4431,10.49,80.83,4.18,8.63,63131,,,,3.01,True
554,GG,Guernsey,2010,0.505,78,830.8,1.5317,10.25,82.63,3.52,7.96,64799,,,,2.76,True
555,GG,Guernsey,2015,0.351,78,848.6,1.5527,9.81,82.73,3.49,8.61,66187,,,,2.31,True
556,GG,Guernsey,2020,0.243,78,861.2,1.5738,9.78,82.84,3.47,9.23,67173,,,,1.88,True
673,JE,Jersey,1995,0.26,116,732.7,1.5,13.19,77.18,6.28,10.04,84993,,,,-0.55,False


In [9]:
# Delete all "country_id_alpha" data with NaN values.
migration_data_df = migration_data_df[migration_data_df["country_id_alpha"].str.contains("CW|GG|JE|XK|ME|BL|MF|SX|TW")==False]
migration_data_df[migration_data_df["total_migrant_population"].isna()]

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag


In [10]:
# Change boolean values for "migration_flag" for binary values
migration_data_df["migration_flag"] = migration_data_df["migration_flag"].replace({True: 1, False: 0})

migration_data_df["total_migrant_population"] = migration_data_df["total_migrant_population"].apply(int)
migration_data_df["migrant_male_population"] = migration_data_df["migrant_male_population"].apply(int)
migration_data_df["migrant_female_population"] = migration_data_df["migrant_female_population"].apply(int)

migration_data_df.head(10)

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag
0,AF,Afghanistan,1990,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,13568282,57686,32558,25128,-51.23,0
1,AF,Afghanistan,1995,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,19445013,71522,39105,32417,2.68,1
2,AF,Afghanistan,2000,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,22461349,75917,42848,33069,-45.3,0
3,AF,Afghanistan,2005,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,26332646,87314,49281,38033,9.43,1
4,AF,Afghanistan,2010,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,29116851,102276,57726,44550,-3.25,0
5,AF,Afghanistan,2015,2.306,652230,49.9,5.326,38.52,50.87,118.07,13.96,32547550,339432,171550,167882,-1.51,0
6,AF,Afghanistan,2020,2.366,652230,56.1,4.82,36.56,52.84,108.64,12.8,36594776,144098,69189,74909,-0.1,0
7,AL,Albania,1990,1.11,27398,118.4,3.0884,25.72,71.16,44.28,6.01,3244925,66013,30579,35434,-8.61,0
8,AL,Albania,1995,1.491,27398,115.3,2.9429,23.32,71.24,50.5,6.2,3158153,71354,33284,38070,-2.21,0
9,AL,Albania,2000,-1.019,27398,115.3,2.1716,16.43,74.7,22.62,5.39,3158351,76695,35990,40705,-21.22,0


In [11]:
migration_data_df.dtypes

country_id_alpha              object
country_name                  object
year                           int64
annual_growth_rate           float64
country_area                   int64
population_density           float64
total_fertility_rate         float64
crude_birth_rate             float64
life_expectancy_at_birth     float64
infant_mortality_rate        float64
crude_death_rate             float64
total_country_population       int64
total_migrant_population       int64
migrant_male_population        int64
migrant_female_population      int64
net_migration_rate           float64
migration_flag                 int64
dtype: object

Split the data into Training and Testing

In [12]:
# Create our features
X = pd.get_dummies(migration_data_df.drop(columns=["migration_flag", "country_id_alpha", "country_name", "year", "country_area", "net_migration_rate"]))

# Create our target
y = migration_data_df["migration_flag"]

In [13]:
X.describe()

Unnamed: 0,annual_growth_rate,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population
count,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0
mean,1.278055,384.949824,3.031473,22.896965,69.572685,33.215934,8.417052,29692790.0,812087.6,418791.1,393296.4
std,3.967187,1762.00053,1.617946,11.612724,9.399712,32.491108,3.642378,122304000.0,2174123.0,1116949.0,1080226.0
min,-115.363,0.0,0.8796,6.63,29.47,1.54,1.2,3951.0,108.0,61.0,47.0
25%,0.423,29.1,1.7717,12.92,64.255,8.91,5.96,582110.5,28027.0,14651.5,12804.5
50%,1.237,77.3,2.45,20.04,71.68,20.7,7.63,5354669.0,152235.0,77436.0,74594.0
75%,2.3345,182.0,4.10415,31.535,76.43,48.505,10.11,18691130.0,646016.5,320120.0,305649.0
max,34.084,22332.0,8.27,57.27,89.78,180.13,32.94,1404032000.0,50632840.0,24479000.0,26153840.0


In [14]:
# Check the balance of our target values
y.value_counts()

0    803
1    616
Name: migration_flag, dtype: int64

In [15]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

Scale the data

In [16]:
scaler = StandardScaler()

In [17]:
X_scaler = scaler.fit(X_train)

In [18]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## ML model with Migration Data

Balanced Random Forest Classifier - With migration info

In [20]:
# Create the Random Forest Classifier instance
rf_model = RandomForestClassifier(n_estimators=100, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [22]:
predictions

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,

In [23]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [24]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,179,22
Actual 1,23,131


In [26]:
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8732394366197183


In [27]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       201
           1       0.86      0.85      0.85       154

    accuracy                           0.87       355
   macro avg       0.87      0.87      0.87       355
weighted avg       0.87      0.87      0.87       355



In [28]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.26084976, 0.0541934 , 0.10127538, 0.09225324, 0.0786409 ,
       0.1140517 , 0.06210434, 0.07212841, 0.05029486, 0.06018359,
       0.05402441])

In [29]:
# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.26084976311660923, 'annual_growth_rate'),
 (0.11405170209762115, 'infant_mortality_rate'),
 (0.1012753776064856, 'total_fertility_rate'),
 (0.09225323889743768, 'crude_birth_rate'),
 (0.07864090101069811, 'life_expectancy_at_birth'),
 (0.07212840976272065, 'total_country_population'),
 (0.06210434421989633, 'crude_death_rate'),
 (0.060183594183332814, 'migrant_male_population'),
 (0.05419339880734877, 'population_density'),
 (0.0540244078268078, 'migrant_female_population'),
 (0.05029486247104198, 'total_migrant_population')]

Deep Learning Model - With Migration info

In [30]:
#pip install tensorflow

In [32]:
# Define the checkpoint path and filenames
checkpoint_path = "checkpoints_migration_info/weights.{epoch:02d}.hdf5"

In [33]:
# Define the deep neural model
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# First hidden layer 
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics 
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath = checkpoint_path,
    verbose = 1,
    save_weights_only = True,
    save_freq = "epoch")

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
 1/34 [..............................] - ETA: 15s - loss: 0.7697 - accuracy: 0.4375
Epoch 1: saving model to checkpoints_migration_info\weights.01.hdf5
Epoch 2/100
 1/34 [..............................] - ETA: 0s - loss: 0.6826 - accuracy: 0.6250
Epoch 2: saving model to checkpoints_migration_info\weights.02.hdf5
Epoch 3/100
 1/34 [..............................] - ETA: 0s - loss: 0.5906 - accuracy: 0.7500
Epoch 3: saving model to checkpoints_migration_info\weights.03.hdf5
Epoch 4/100
 1/34 [..............................] - ETA: 0s - loss: 0.5786 - accuracy: 0.7812
Epoch 4: saving model to checkpoints_migration_info\weights.04.hdf5
Epoch 5/100
 1/34 [..............................] - ETA: 0s - loss: 0.6524 - accuracy: 0.6250
Epoch 5: saving model to checkpoints_migration_info\weights.05.hdf5
Epoch 6/100
 1/34 [..............................] - ETA: 0s - loss: 0.6091 - accuracy: 0.6562
Epoch 6: saving model to checkpoints_migration_info\weights.06.hdf5
Epoch 7/100
 1/34 [..

Epoch 34/100
 1/34 [..............................] - ETA: 0s - loss: 0.4395 - accuracy: 0.8750
Epoch 34: saving model to checkpoints_migration_info\weights.34.hdf5
Epoch 35/100
 1/34 [..............................] - ETA: 0s - loss: 0.2180 - accuracy: 0.8750
Epoch 35: saving model to checkpoints_migration_info\weights.35.hdf5
Epoch 36/100
 1/34 [..............................] - ETA: 0s - loss: 0.2569 - accuracy: 0.9688
Epoch 36: saving model to checkpoints_migration_info\weights.36.hdf5
Epoch 37/100
 1/34 [..............................] - ETA: 0s - loss: 0.2263 - accuracy: 0.8750
Epoch 37: saving model to checkpoints_migration_info\weights.37.hdf5
Epoch 38/100
 1/34 [..............................] - ETA: 0s - loss: 0.2696 - accuracy: 0.9062
Epoch 38: saving model to checkpoints_migration_info\weights.38.hdf5
Epoch 39/100
 1/34 [..............................] - ETA: 0s - loss: 0.1020 - accuracy: 1.0000
Epoch 39: saving model to checkpoints_migration_info\weights.39.hdf5
Epoch 40/1

Epoch 99/100
 1/34 [..............................] - ETA: 0s - loss: 0.1098 - accuracy: 1.0000
Epoch 99: saving model to checkpoints_migration_info\weights.99.hdf5
Epoch 100/100
 1/34 [..............................] - ETA: 0s - loss: 0.0655 - accuracy: 1.0000
Epoch 100: saving model to checkpoints_migration_info\weights.100.hdf5
12/12 - 0s - loss: 0.1403 - accuracy: 0.9352 - 111ms/epoch - 9ms/step
Loss: 0.1402605026960373, Accuracy: 0.9352112412452698


In [35]:
# Export our model to HDF5 file 
nn.save("trained_with_migration_info.h5")

## ML model with only country data

In [36]:
# Create our features
X = pd.get_dummies(migration_data_df.drop(columns=["migration_flag", "country_id_alpha", "country_name", "year", "country_area", "net_migration_rate", "migrant_male_population", "migrant_female_population", "total_migrant_population"]))

# Create our target
y = migration_data_df["migration_flag"]

In [37]:
# Split and scale the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
# Create the model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [39]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [40]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [41]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [42]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,184,17
Actual 1,23,131


Accuracy Score : 0.8873239436619719
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       201
           1       0.89      0.85      0.87       154

    accuracy                           0.89       355
   macro avg       0.89      0.88      0.88       355
weighted avg       0.89      0.89      0.89       355



In [43]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.27182084152561564, 'annual_growth_rate'),
 (0.1334078952920348, 'infant_mortality_rate'),
 (0.1165773463172062, 'total_fertility_rate'),
 (0.11270197805241466, 'crude_birth_rate'),
 (0.11159463656838718, 'life_expectancy_at_birth'),
 (0.08910291886915628, 'crude_death_rate'),
 (0.08824921229630588, 'total_country_population'),
 (0.07654517107887948, 'population_density')]

Deep Learning Model - Only with Country Data

In [44]:
# Define the checkpoint path and filenames
checkpoint_path = "checkpoints_country_data/weights.{epoch:02d}.hdf5"

In [50]:
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# First hidden layer 
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics 
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath = checkpoint_path,
    verbose = 1,
    save_weights_only = True,
    save_freq = "epoch")

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
 1/34 [..............................] - ETA: 13s - loss: 0.6374 - accuracy: 0.6562
Epoch 1: saving model to checkpoints_country_data\weights.01.hdf5
Epoch 2/100
 1/34 [..............................] - ETA: 0s - loss: 0.5946 - accuracy: 0.7500
Epoch 2: saving model to checkpoints_country_data\weights.02.hdf5
Epoch 3/100
 1/34 [..............................] - ETA: 0s - loss: 0.6900 - accuracy: 0.6250
Epoch 3: saving model to checkpoints_country_data\weights.03.hdf5
Epoch 4/100
 1/34 [..............................] - ETA: 0s - loss: 0.5795 - accuracy: 0.6562
Epoch 4: saving model to checkpoints_country_data\weights.04.hdf5
Epoch 5/100
 1/34 [..............................] - ETA: 0s - loss: 0.5721 - accuracy: 0.6562
Epoch 5: saving model to checkpoints_country_data\weights.05.hdf5
Epoch 6/100
 1/34 [..............................] - ETA: 0s - loss: 0.5914 - accuracy: 0.7188
Epoch 6: saving model to checkpoints_country_data\weights.06.hdf5
Epoch 7/100
 1/34 [..............

Epoch 34/100
 1/34 [..............................] - ETA: 0s - loss: 0.2741 - accuracy: 0.9062
Epoch 34: saving model to checkpoints_country_data\weights.34.hdf5
Epoch 35/100
 1/34 [..............................] - ETA: 0s - loss: 0.1358 - accuracy: 1.0000
Epoch 35: saving model to checkpoints_country_data\weights.35.hdf5
Epoch 36/100
 1/34 [..............................] - ETA: 0s - loss: 0.1719 - accuracy: 1.0000
Epoch 36: saving model to checkpoints_country_data\weights.36.hdf5
Epoch 37/100
 1/34 [..............................] - ETA: 0s - loss: 0.2328 - accuracy: 0.8438
Epoch 37: saving model to checkpoints_country_data\weights.37.hdf5
Epoch 38/100
 1/34 [..............................] - ETA: 0s - loss: 0.2209 - accuracy: 0.9062
Epoch 38: saving model to checkpoints_country_data\weights.38.hdf5
Epoch 39/100
 1/34 [..............................] - ETA: 0s - loss: 0.2308 - accuracy: 0.9375
Epoch 39: saving model to checkpoints_country_data\weights.39.hdf5
Epoch 40/100
 1/34 [..

Epoch 67/100
 1/34 [..............................] - ETA: 0s - loss: 0.1339 - accuracy: 1.0000
Epoch 67: saving model to checkpoints_country_data\weights.67.hdf5
Epoch 68/100
 1/34 [..............................] - ETA: 0s - loss: 0.1911 - accuracy: 0.9375
Epoch 68: saving model to checkpoints_country_data\weights.68.hdf5
Epoch 69/100
 1/34 [..............................] - ETA: 0s - loss: 0.1630 - accuracy: 0.9062
Epoch 69: saving model to checkpoints_country_data\weights.69.hdf5
Epoch 70/100
 1/34 [..............................] - ETA: 0s - loss: 0.1646 - accuracy: 0.9062
Epoch 70: saving model to checkpoints_country_data\weights.70.hdf5
Epoch 71/100
 1/34 [..............................] - ETA: 0s - loss: 0.1165 - accuracy: 0.9688
Epoch 71: saving model to checkpoints_country_data\weights.71.hdf5
Epoch 72/100
 1/34 [..............................] - ETA: 0s - loss: 0.0345 - accuracy: 1.0000
Epoch 72: saving model to checkpoints_country_data\weights.72.hdf5
Epoch 73/100
 1/34 [..

Epoch 100/100
 1/34 [..............................] - ETA: 0s - loss: 0.1553 - accuracy: 0.9375
Epoch 100: saving model to checkpoints_country_data\weights.100.hdf5
12/12 - 0s - loss: 0.1082 - accuracy: 0.9662 - 85ms/epoch - 7ms/step
Loss: 0.10815860331058502, Accuracy: 0.9661971926689148


In [52]:
# Export our model to HDF5 file 
nn.save("trained_with_country_data.h5")

### Connect to the Validation Database through SQL

In [6]:
# Connect to your postgres DB
conn = psycopg2.connect(
    host="localhost",
    port = "5432",
    database= dbn,
    user="postgres",
    password= pwd)

# Open a cursor to perform database operations
cur = conn.cursor()

# Execute a query
cur.execute("SELECT * FROM validation_data")

# Retrieve query results
records = cur.fetchall()

In [7]:
validation_data = pd.DataFrame(records, columns = ["country_id_alpha", "country_name", "year",
                                                          "total_country_population", "annual_growth_rate", "population_density",
                                                          "total_fertility_rate", "crude_birth_rate", "life_expectancy_at_birth",
                                                          "infant_mortality_rate", "crude_death_rate", "net_migration_rate",
                                                          "migration_flag"])
validation_data.head()

Unnamed: 0,country_id_alpha,country_name,year,total_country_population,annual_growth_rate,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,net_migration_rate,migration_flag
0,AF,Afghanistan,2022,38346720,2.304,58.8,4.624,35.46,53.65,104.89,12.33,-0.1,False
1,AL,Albania,2022,3095344,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,-3.23,False
2,DZ,Algeria,2022,44178884,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,-0.82,False
3,AS,American Samoa,2022,45443,-1.919,229.5,2.206,16.7,75.32,10.06,6.1,-29.8,False
4,AD,Andorra,2022,85560,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,0.0,True


In [8]:
# Change boolean values for "migration_flag" for binary values
validation_data["migration_flag"] = validation_data["migration_flag"].replace({True: 1, False: 0})
validation_data.head()

Unnamed: 0,country_id_alpha,country_name,year,total_country_population,annual_growth_rate,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,net_migration_rate,migration_flag
0,AF,Afghanistan,2022,38346720,2.304,58.8,4.624,35.46,53.65,104.89,12.33,-0.1,0
1,AL,Albania,2022,3095344,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,-3.23,0
2,DZ,Algeria,2022,44178884,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,-0.82,0
3,AS,American Samoa,2022,45443,-1.919,229.5,2.206,16.7,75.32,10.06,6.1,-29.8,0
4,AD,Andorra,2022,85560,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,0.0,1


In [9]:
# Making sure data was imported correctly.
validation_data.dtypes

country_id_alpha             object
country_name                 object
year                          int64
total_country_population      int64
annual_growth_rate          float64
population_density          float64
total_fertility_rate        float64
crude_birth_rate            float64
life_expectancy_at_birth    float64
infant_mortality_rate       float64
crude_death_rate            float64
net_migration_rate          float64
migration_flag                int64
dtype: object

In [10]:
validation_data.isna().sum()

country_id_alpha            0
country_name                0
year                        0
total_country_population    0
annual_growth_rate          0
population_density          0
total_fertility_rate        0
crude_birth_rate            0
life_expectancy_at_birth    0
infant_mortality_rate       0
crude_death_rate            0
net_migration_rate          0
migration_flag              0
dtype: int64

In [11]:
validation_data.count()

country_id_alpha            227
country_name                227
year                        227
total_country_population    227
annual_growth_rate          227
population_density          227
total_fertility_rate        227
crude_birth_rate            227
life_expectancy_at_birth    227
infant_mortality_rate       227
crude_death_rate            227
net_migration_rate          227
migration_flag              227
dtype: int64

## Applied ML model 

In [12]:
# Prepare the data for the model - reorder
columns=["country_id_alpha", "country_name", "year", "annual_growth_rate", "population_density", "total_fertility_rate", "crude_birth_rate", "life_expectancy_at_birth", "infant_mortality_rate", "crude_death_rate", "total_country_population", "net_migration_rate", "migration_flag"]
validation_data = validation_data[columns]
validation_data

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,net_migration_rate,migration_flag
0,AF,Afghanistan,2022,2.304,58.8,4.6240,35.46,53.65,104.89,12.33,38346720,-0.10,0
1,AL,Albania,2022,0.215,113.0,1.5402,12.69,79.47,10.82,7.31,3095344,-3.23,0
2,DZ,Algeria,2022,1.337,18.5,2.5058,18.52,78.03,19.72,4.32,44178884,-0.82,0
3,AS,American Samoa,2022,-1.919,229.5,2.2060,16.70,75.32,10.06,6.10,45443,-29.80,0
4,AD,Andorra,2022,-0.104,182.8,1.4474,6.88,83.42,3.44,7.92,85560,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,WF,Wallis and Futuna,2022,0.252,111.9,1.7099,12.27,80.67,4.00,5.79,15891,-3.96,0
223,XW,West Bank,2022,1.693,531.9,2.9620,24.42,76.38,15.29,3.40,3000021,-4.09,0
224,YE,Yemen,2022,1.883,58.7,3.0080,24.64,67.51,46.54,5.62,30984689,-0.19,0
225,ZM,Zambia,2022,2.898,26.4,4.5595,34.86,66.26,37.11,6.12,19642123,0.24,1


In [13]:
# Create our features
X = pd.get_dummies(validation_data.drop(columns=["migration_flag", "country_id_alpha", "country_name", "year", "net_migration_rate"]))

# Create our target
y = validation_data["migration_flag"]

In [14]:
# Split and scale the data.
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

scaler = StandardScaler()

X_scaler = scaler.fit(X)

#X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X)

In [15]:
# Import the ML model trained with country data only
nn_imported = tf.keras.models.load_model("trained_with_country_data.h5")

In [16]:
# Evaluate the completed model using the test data
model_loss, model_accuracy = nn_imported.evaluate(X_test_scaled, y, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8/8 - 0s - loss: 5.3802 - accuracy: 0.5463 - 237ms/epoch - 30ms/step
Loss: 5.380215644836426, Accuracy: 0.5462555289268494
