In [1]:
import pandas as pd
import numpy as np

## drop unwanted features and check correlations

In [2]:
# Columns removed before modelling, grouped by the reason for removal.
time_cols = ["Time", "MidPeriod", "year"]
country_id_cols = ["Country", "country", "LocID", "Code"]
# highly correlated with the target (RelMigrations)
target_proxy_cols = ["CNMR", "NetMigrations"]
# flagged as "useless (?)" by the original author -- TODO confirm they carry no signal
misc_cols = ["change_from_previous_year", "Unnamed: 0"]

# Load the cleaned data set and drop every excluded column in a single pass.
df = pd.read_csv("../data/clean/full_set.csv").drop(
    columns=time_cols + country_id_cols + target_proxy_cols + misc_cols
)

In [50]:
# Correlation of every remaining column with the target, highest value first.
(
    df.corrwith(df["RelMigrations"])
      .sort_values(ascending=False)
)

RelMigrations                   1.000000
GrowthRate                      0.705031
LExMale                         0.257140
ranking                         0.241839
LEx                             0.223727
LExFemale                       0.195654
MAC                             0.184813
PopMale_60+                    -0.013790
PopTotal_60+                   -0.014097
PopFemale_60+                  -0.014373
PopMale_20-59                  -0.018043
PopTotal_20-59                 -0.020149
PopMale                        -0.022305
PopFemale_20-59                -0.022382
PopTotal                       -0.023673
PopFemale                      -0.025127
DeathsMale                     -0.026528
Deaths                         -0.026591
DeathsFemale                   -0.026638
PopMale_0-19                   -0.031358
PopTotal_0-19                  -0.032089
PopFemale_0-19                 -0.032900
Births                         -0.035124
SRB                            -0.053567
NatIncr         

## check model performance

In [4]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_validate

In [39]:
# Split the frame into the target column (y) and all other columns (X).
target_name = "RelMigrations"
y = df[target_name]
X = df.drop(columns=[target_name])

In [41]:
# Fixed seed so the cross-validation scores (and the feature importances
# computed later from this same estimator) are reproducible across re-runs.
SEED = 42

# NOTE: `clf` is a regressor despite the name; the name is kept because
# later cells refer to it.
clf = ExtraTreesRegressor(n_estimators=50, random_state=SEED)

# 10-fold cross-validation. scikit-learn reports the *negated* MAE under
# this scoring name (higher is better), hence the sign flip when printing.
scores = cross_validate(clf, X, y, scoring="neg_mean_absolute_error", cv=10)
fold_scores = scores["test_score"]

print("mean absolute error: " + str(fold_scores.mean() * -1))
# The spread is sign-independent, so no flip is needed here.
print("standard deviation: " + str(fold_scores.std()))

mean absolute error: 0.0019577417592910344
standard deviatoin: 0.0004592912060315236


## calculate feature importances

In [42]:
# Refit the estimator on the full data set; fit() returns the estimator itself.
clf = clf.fit(X, y)

# One row per feature (indexed by the column names of X), with a single
# column holding the importance reported by the fitted model.
feature_importances = pd.DataFrame(
    {"rel_feature_importances": clf.feature_importances_},
    index=X.columns,
)

# Display the features ordered from most to least important.
feature_importances.sort_values(by="rel_feature_importances", ascending=False)

Unnamed: 0,rel_feature_importances
GrowthRate,0.660385
CDR,0.061557
human_flight_and_brain_drain,0.043512
economy,0.024304
CBR,0.020349
NatIncr,0.015565
NRR,0.013532
LExMale,0.01281
Q5,0.010642
LEx,0.00912
