In [1]:
import warnings
# Ignore every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
from google.colab import files
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the migration dataset CSV from the project's GitHub repository.
# Rows the parser cannot handle are skipped rather than raising.
path = "https://raw.githubusercontent.com/pavlarsen/Final_Project/main/migration_Data.csv"
migration_data_df = pd.read_csv(
    filepath_or_buffer=path,
    on_bad_lines="skip",
)

# Preview the first ten rows.
migration_data_df.head(n=10)

Unnamed: 0,country_id_alpha,country_name,year,total_country_population,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,net_migration_rate,migration_flag,total_migrant_population,migrant_male_population,migrant_female_population
0,AF,Afghanistan,1990,13568282,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,-51.23,f,57686,32558,25128
1,AF,Afghanistan,1995,19445013,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,2.68,t,71522,39105,32417
2,AF,Afghanistan,2000,22461349,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,-45.3,f,75917,42848,33069
3,AF,Afghanistan,2005,26332646,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,9.43,t,87314,49281,38033
4,AF,Afghanistan,2010,29116851,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,-3.25,f,102276,57726,44550
5,AF,Afghanistan,2015,32547550,2.306,652230,49.9,5.326,38.52,50.87,118.07,13.96,-1.51,f,339432,171550,167882
6,AF,Afghanistan,2020,36594776,2.366,652230,56.1,4.82,36.56,52.84,108.64,12.8,-0.1,f,144098,69189,74909
7,AL,Albania,1990,3244925,1.11,27398,118.4,3.0884,25.72,71.16,44.28,6.01,-8.61,f,66013,30579,35434
8,AL,Albania,1995,3158153,1.491,27398,115.3,2.9429,23.32,71.24,50.5,6.2,-2.21,f,71354,33284,38070
9,AL,Albania,2000,3158351,-1.019,27398,115.3,2.1716,16.43,74.7,22.62,5.39,-21.22,f,76695,35990,40705


In [5]:
# Reorder columns: identifiers first, then the country indicators, then the
# population / migrant counts, and finally the migration rate and flag.
columns = [
    "country_id_alpha",
    "country_name",
    "year",
    "annual_growth_rate",
    "country_area",
    "population_density",
    "total_fertility_rate",
    "crude_birth_rate",
    "life_expectancy_at_birth",
    "infant_mortality_rate",
    "crude_death_rate",
    "total_country_population",
    "total_migrant_population",
    "migrant_male_population",
    "migrant_female_population",
    "net_migration_rate",
    "migration_flag",
]
migration_data_df = migration_data_df[columns]
migration_data_df.head()

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag
0,AF,Afghanistan,1990,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,13568282,57686,32558,25128,-51.23,f
1,AF,Afghanistan,1995,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,19445013,71522,39105,32417,2.68,t
2,AF,Afghanistan,2000,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,22461349,75917,42848,33069,-45.3,f
3,AF,Afghanistan,2005,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,26332646,87314,49281,38033,9.43,t
4,AF,Afghanistan,2010,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,29116851,102276,57726,44550,-3.25,f


In [6]:
# Encode the "migration_flag" column as integers: "t" -> 1, "f" -> 0.
# The explicit cast pins the column to int64 instead of relying on replace()
# to downcast the object column implicitly (that implicit downcast is
# deprecated in recent pandas releases), and it fails right here if a value
# cannot be interpreted as an integer rather than leaking into the models.
migration_data_df["migration_flag"] = (
    migration_data_df["migration_flag"]
    .replace({"t": 1, "f": 0})
    .astype("int64")
)
migration_data_df.head()

Unnamed: 0,country_id_alpha,country_name,year,annual_growth_rate,country_area,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population,net_migration_rate,migration_flag
0,AF,Afghanistan,1990,-1.928,652230,20.8,8.0,54.44,42.19,167.73,22.5,13568282,57686,32558,25128,-51.23,0
1,AF,Afghanistan,1995,3.492,652230,29.8,8.0,53.23,43.88,156.75,20.99,19445013,71522,39105,32417,2.68,1
2,AF,Afghanistan,2000,-1.328,652230,34.4,8.0,51.35,45.49,146.75,19.33,22461349,75917,42848,33069,-45.3,0
3,AF,Afghanistan,2005,3.439,652230,40.4,6.3707,41.51,47.06,137.56,16.55,26332646,87314,49281,38033,9.43,1
4,AF,Afghanistan,2010,2.139,652230,44.6,5.8532,39.77,48.93,127.79,15.13,29116851,102276,57726,44550,-3.25,0


In [7]:
# Inspect the dtype of every column after the flag conversion.
migration_data_df.dtypes

country_id_alpha              object
country_name                  object
year                           int64
annual_growth_rate           float64
country_area                   int64
population_density           float64
total_fertility_rate         float64
crude_birth_rate             float64
life_expectancy_at_birth     float64
infant_mortality_rate        float64
crude_death_rate             float64
total_country_population       int64
total_migrant_population       int64
migrant_male_population        int64
migrant_female_population      int64
net_migration_rate           float64
migration_flag                 int64
dtype: object

Split the data into Training and Testing

In [8]:
# Feature matrix: every column except the target, the identifier columns,
# "year", "country_area" and "net_migration_rate".
non_feature_columns = [
    "migration_flag",
    "country_id_alpha",
    "country_name",
    "year",
    "country_area",
    "net_migration_rate",
]
feature_frame = migration_data_df.drop(columns=non_feature_columns)
X = pd.get_dummies(feature_frame)

# Target vector: the binary migration flag.
y = migration_data_df["migration_flag"]

In [9]:
# Summary statistics for each feature column.
X.describe()

Unnamed: 0,annual_growth_rate,population_density,total_fertility_rate,crude_birth_rate,life_expectancy_at_birth,infant_mortality_rate,crude_death_rate,total_country_population,total_migrant_population,migrant_male_population,migrant_female_population
count,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0
mean,1.278055,384.949824,3.031473,22.896965,69.572685,33.215934,8.417052,29692790.0,812087.6,418791.1,393296.4
std,3.967187,1762.00053,1.617946,11.612724,9.399712,32.491108,3.642378,122304000.0,2174123.0,1116949.0,1080226.0
min,-115.363,0.0,0.8796,6.63,29.47,1.54,1.2,3951.0,108.0,61.0,47.0
25%,0.423,29.1,1.7717,12.92,64.255,8.91,5.96,582110.5,28027.0,14651.5,12804.5
50%,1.237,77.3,2.45,20.04,71.68,20.7,7.63,5354669.0,152235.0,77436.0,74594.0
75%,2.3345,182.0,4.10415,31.535,76.43,48.505,10.11,18691130.0,646016.5,320120.0,305649.0
max,34.084,22332.0,8.27,57.27,89.78,180.13,32.94,1404032000.0,50632840.0,24479000.0,26153840.0


In [10]:
# Count the rows in each target class to see how balanced the classes are.
y.value_counts()

0    803
1    616
Name: migration_flag, dtype: int64

In [11]:
# Split into training and testing sets, stratifying on the target and
# fixing the seed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

Scale the data

In [12]:
# Create the scaler used to standardize the feature columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [13]:
# Fit the scaler on the training split only.
X_scaler = scaler.fit(X_train)

In [14]:
# Apply the fitted scaler to the training split and the testing split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

Random Forest Classifier - With migration info

In [15]:
# Create the Random Forest Classifier instance
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training data in one expression.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=78,
).fit(X_train_scaled, y_train)

In [17]:
# Predict the migration flag for every row of the scaled testing data.
predictions = rf_model.predict(X_test_scaled)

In [18]:
# Display the predicted labels for the testing data.
predictions

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,

In [19]:
# Confusion matrix of actual vs. predicted labels on the testing data.
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame so it displays readably.
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

In [20]:
# Accuracy of the predictions against the true test labels.
acc_score = accuracy_score(y_test, predictions)

In [21]:
# Show the labelled confusion matrix under a plain-text heading.
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,179,22
Actual 1,23,131


In [22]:
# Report the accuracy computed on the testing data.
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8732394366197183


In [23]:
# Print the classification report for the test predictions.
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       201
           1       0.86      0.85      0.85       154

    accuracy                           0.87       355
   macro avg       0.87      0.87      0.87       355
weighted avg       0.87      0.87      0.87       355



In [24]:
# Read the per-feature importances from the fitted random forest and display them.
importances = rf_model.feature_importances_
importances

array([0.26084976, 0.0541934 , 0.10127538, 0.09225324, 0.0786409 ,
       0.1140517 , 0.06210434, 0.07212841, 0.05029486, 0.06018359,
       0.05402441])

In [25]:
# Pair each importance with its feature name and list them from most to
# least important.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.26084976311660923, 'annual_growth_rate'),
 (0.11405170209762115, 'infant_mortality_rate'),
 (0.1012753776064856, 'total_fertility_rate'),
 (0.09225323889743768, 'crude_birth_rate'),
 (0.07864090101069811, 'life_expectancy_at_birth'),
 (0.07212840976272065, 'total_country_population'),
 (0.06210434421989633, 'crude_death_rate'),
 (0.060183594183332814, 'migrant_male_population'),
 (0.05419339880734877, 'population_density'),
 (0.0540244078268078, 'migrant_female_population'),
 (0.05029486247104198, 'total_migrant_population')]

Deep Learning Model - With Migration info

In [26]:
# Import dependencies
import tensorflow as tf

In [27]:
# Seed Python, NumPy and TensorFlow before building the network so that the
# weight initialisation and batch shuffling — and therefore the reported
# loss/accuracy — are as repeatable as the backend allows.
tf.keras.utils.set_random_seed(78)

# Define the deep neural model
number_input_features = X_train_scaled.shape[1]  # one input per scaled feature
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# Declare the input width explicitly instead of via the legacy `input_dim`
# argument on the first Dense layer.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

# Output layer: a single sigmoid unit for the binary migration flag
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Test the model without any migration data, just country data.

In [28]:
# Feature matrix without any migrant-population columns: drop the target,
# the identifier columns, "year", "country_area", "net_migration_rate" and
# the three migrant population counts.
non_feature_columns = [
    "migration_flag",
    "country_id_alpha",
    "country_name",
    "year",
    "country_area",
    "net_migration_rate",
    "migrant_male_population",
    "migrant_female_population",
    "total_migrant_population",
]
feature_frame = migration_data_df.drop(columns=non_feature_columns)
X = pd.get_dummies(feature_frame)

# Target vector: the binary migration flag.
y = migration_data_df["migration_flag"]

In [29]:
# Split the data, stratifying on the target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Fit a fresh scaler on the training split, then transform both splits with it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [30]:
# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training data in one expression.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=78,
).fit(X_train_scaled, y_train)

In [31]:
# Predict the migration flag for every row of the scaled testing data.
predictions = rf_model.predict(X_test_scaled)

In [32]:
# Confusion matrix of actual vs. predicted labels on the testing data.
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame so it displays readably.
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

In [33]:
# Accuracy of the predictions against the true test labels.
acc_score = accuracy_score(y_test, predictions)

In [34]:
# Show the confusion matrix, the accuracy score and the classification
# report for the test predictions.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,184,17
Actual 1,23,131


Accuracy Score : 0.8873239436619719
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       201
           1       0.89      0.85      0.87       154

    accuracy                           0.89       355
   macro avg       0.89      0.88      0.88       355
weighted avg       0.89      0.89      0.89       355



In [35]:
# Read the per-feature importances from the fitted random forest.
importances = rf_model.feature_importances_

# Pair each importance with its feature name and list them from most to
# least important.
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.27182084152561564, 'annual_growth_rate'),
 (0.1334078952920348, 'infant_mortality_rate'),
 (0.1165773463172062, 'total_fertility_rate'),
 (0.11270197805241466, 'crude_birth_rate'),
 (0.11159463656838718, 'life_expectancy_at_birth'),
 (0.08910291886915628, 'crude_death_rate'),
 (0.08824921229630588, 'total_country_population'),
 (0.07654517107887948, 'population_density')]

Deep Learning Model - Only with Country Data

In [36]:
# Seed Python, NumPy and TensorFlow before building the network so that the
# weight initialisation and batch shuffling — and therefore the reported
# loss/accuracy — are as repeatable as the backend allows.
tf.keras.utils.set_random_seed(78)

# Define the deep neural model for the reduced feature set
number_input_features = X_train_scaled.shape[1]  # one input per scaled feature
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# Declare the input width explicitly instead of via the legacy `input_dim`
# argument on the first Dense layer.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

# Output layer: a single sigmoid unit for the binary migration flag
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78