In [1]:
import pandas as pd
import numpy as np

## drop unwanted features and check correlations

In [2]:
# Load the cleaned data set and prune every column that should not reach the
# model. Each .drop() below removes one group, labelled with the reason.
df = (
    pd.read_csv("../data/clean/full_set.csv")
    # time related features
    .drop(columns=["Time", "MidPeriod", "year"])
    # country identifying features
    .drop(columns=["Country", "country", "LocID", "Code"])
    # variables highly correlated to the target (RelMigrations)
    .drop(columns=["CNMR", "NetMigrations"])
    # useless columns (?) -- "Unnamed: 0" is presumably an index column left
    # over from an earlier to_csv(); TODO confirm
    .drop(columns=["change_from_previous_year", "Unnamed: 0"])
)

In [3]:
# Correlation of every remaining column with the target, strongest positive
# first (the target itself appears at the top with a correlation of 1.0).
df.corrwith(df["RelMigrations"]).sort_values(ascending=False)

RelMigrations                   1.000000
GrowthRate                      0.705031
PopMale                         0.658954
PopMale_20-59                   0.629584
PopTotal_20-59                  0.448237
LExMale                         0.257140
ranking                         0.241839
LEx                             0.223727
LExFemale                       0.195654
MAC                             0.184813
PopMale_60+                    -0.017935
PopTotal                       -0.023673
PopTotal_60+                   -0.047207
SRB                            -0.053567
PopFemale_60+                  -0.068172
NatIncr                        -0.076701
human_rights                   -0.090040
state_legitimacy               -0.123657
NRR                            -0.131922
PopFemale_20-59                -0.136321
TFR                            -0.137589
factionalized_elites           -0.139101
CBR                            -0.165699
Q5                             -0.173873
Births          

## check model performance

In [4]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_validate

In [5]:
# Target vector: the relative migration figure.
y = df["RelMigrations"]
# Feature matrix: every other remaining column.
# NOTE(review): GrowthRate dominates the feature importances computed later
# and may partly encode the target -- confirm this is not leakage.
X = df.drop(columns="RelMigrations")

In [6]:
# Extra-trees is randomised; pin the seed so the cross-validation scores (and
# any later fit of `clf`) are reproducible across Restart & Run All.
clf = ExtraTreesRegressor(n_estimators=50, random_state=42)

# 10-fold cross-validation. scikit-learn reports the MAE negated so that
# "higher is better" holds for every scorer.
scores = cross_validate(clf, X, y, scoring="neg_mean_absolute_error", cv=10)
fold_mae = -scores["test_score"]  # flip the sign back to a plain MAE per fold

print(f"mean absolute error: {fold_mae.mean()}")
print(f"standard deviation: {fold_mae.std()}")

mean absolute error: 0.0019889440408838665
standard deviatoin: 0.00044103655030360033


## calculate feature importances

In [7]:
# Refit on the full data set; fit() returns the estimator itself, so `clf`
# keeps pointing at the same (now fitted) model.
clf.fit(X, y)

# One row per feature (indexed by column name), one column of importances.
feature_importances = pd.DataFrame(
    {"rel_feature_importances": clf.feature_importances_},
    index=X.columns,
)

# Display only: the sorted view is not assigned back, so
# `feature_importances` itself stays in the original column order.
feature_importances.sort_values("rel_feature_importances", ascending=False)

Unnamed: 0,rel_feature_importances
GrowthRate,0.466572
PopMale_20-59,0.063648
PopMale,0.059837
PopFemale,0.043333
DeathsFemale,0.033641
Deaths,0.031074
CDR,0.023679
DeathsMale,0.02337
PopFemale_20-59,0.02311
PopTotal_20-59,0.020974
