In [1]:
# !pip install xgboost

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [3]:
disp_data = pd.read_csv("displacement_data.csv")
se_data = pd.read_csv("socio_economic_data.csv")

In [4]:
print(se_data.head())
print(disp_data.head())

  iso3  year  Gender Inequality Index  Gross National Income Per Capita  \
0  BGD  2013                    0.578                       4932.186405   
1  BGD  2014                    0.569                       5205.479375   
2  BGD  2015                    0.563                       5582.619597   
3  BGD  2016                    0.557                       5987.015851   
4  BGD  2017                    0.541                       6228.122571   

   Income Inequality  Life Expectancy at Birth     Country  \
0          30.749958                    69.487  Bangladesh   
1          30.749958                    70.016  Bangladesh   
2          30.749958                    70.543  Bangladesh   
3          35.719498                    71.075  Bangladesh   
4          35.719498                    71.606  Bangladesh   

   population_density  infant_mortality_rate  employment_rate  
0         1202.520865                   32.7           54.649  
1         1213.527917                   31.4    

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Merge socio-economic indicators with displacement events
event_data = disp_data.merge(
    se_data, 
    left_on=['iso3','year'], 
    right_on=['iso3','year'],
    how='left'
)

# Convert start_date to datetime and extract month/quarter
event_data['start_date'] = pd.to_datetime(event_data['start_date'])
event_data['month'] = event_data['start_date'].dt.month
event_data['quarter'] = event_data['start_date'].dt.quarter

# Log-transform skewed numeric features
log_features = ['population_density', 'infant_mortality_rate', 'Gross National Income Per Capita', 'new_displacement']
for col in log_features:
    event_data[f'log_{col}'] = np.log1p(event_data[col])

# Normalize numeric features (excluding categorical)
num_cols = ['log_population_density','log_infant_mortality_rate','log_Gross National Income Per Capita',
            'Gender Inequality Index','Income Inequality','Life Expectancy at Birth','employment_rate',
            'month','quarter','log_new_displacement']

scaler = StandardScaler()
event_data[num_cols] = scaler.fit_transform(event_data[num_cols])

# Encode categorical columns
cat_cols = ['iso3','Country','hazard_type_name']
for col in cat_cols:
    le = LabelEncoder()
    event_data[col] = le.fit_transform(event_data[col])

# Prepare final event-level dataset
event_data_ready = event_data[[
    'iso3','year','Country','hazard_type_name','log_new_displacement',
    'Gender Inequality Index','Income Inequality','Life Expectancy at Birth',
    'employment_rate','log_population_density','log_infant_mortality_rate','log_Gross National Income Per Capita',
    'month','quarter'
]]

event_data_ready.head()

Unnamed: 0,iso3,year,Country,hazard_type_name,log_new_displacement,Gender Inequality Index,Income Inequality,Life Expectancy at Birth,employment_rate,log_population_density,log_infant_mortality_rate,log_Gross National Income Per Capita,month,quarter
0,0,2013,0,4,1.280488,1.676355,0.713646,-0.623002,-0.636378,2.838393,1.120107,-1.534837,0.144401,0.453935
1,0,2013,0,4,1.45627,1.676355,0.713646,-0.623002,-0.636378,2.838393,1.120107,-1.534837,-0.156373,-0.473391
2,0,2013,0,6,1.773007,1.676355,0.713646,-0.623002,-0.636378,2.838393,1.120107,-1.534837,-1.058695,-1.400718
3,0,2013,0,6,2.895496,1.676355,0.713646,-0.623002,-0.636378,2.838393,1.120107,-1.534837,-0.457147,-0.473391
4,3,2013,4,5,0.043799,-1.300793,0.827858,1.082569,0.676215,-0.380525,-0.713485,0.139876,0.144401,0.453935


In [6]:
# Count events per hazard type per country-year
hazard_counts = disp_data.pivot_table(
    index=['iso3','year'],
    columns='hazard_type_name',
    values='new_displacement',
    aggfunc='count',
    fill_value=0
).reset_index()

# Total number of events
hazard_counts['total_events'] = hazard_counts.drop(columns=['iso3','year']).sum(axis=1)

# Total displacement per country-year
total_disp = disp_data.groupby(['iso3','year'])['new_displacement'].sum().reset_index()
total_disp.rename(columns={'new_displacement':'total_displacement'}, inplace=True)

# Merge socio-economic indicators
yearly_data = se_data.merge(total_disp, on=['iso3','year'], how='left')
yearly_data = yearly_data.merge(hazard_counts, on=['iso3','year'], how='left')

# Fill NaN for countries-years with zero events
yearly_data['total_displacement'] = yearly_data['total_displacement'].fillna(0)
yearly_data['total_events'] = yearly_data['total_events'].fillna(0)
hazard_types = disp_data['hazard_type_name'].unique()
for h in hazard_types:
    yearly_data[h] = yearly_data[h].fillna(0)

# Log-transform numeric features including total_displacement and some socio-economic variables
log_features_yearly = ['population_density','infant_mortality_rate','Gross National Income Per Capita','total_displacement']
for col in log_features_yearly:
    yearly_data[f'log_{col}'] = np.log1p(yearly_data[col])

# Normalize numeric features
num_cols_yearly = ['log_population_density','log_infant_mortality_rate','log_Gross National Income Per Capita',
                   'Gender Inequality Index','Income Inequality','Life Expectancy at Birth',
                   'employment_rate','log_total_displacement','total_events'] + list(hazard_types)

scaler_yearly = StandardScaler()
yearly_data[num_cols_yearly] = scaler_yearly.fit_transform(yearly_data[num_cols_yearly])

# Encode categorical columns
le_country = LabelEncoder()
yearly_data['iso3'] = le_country.fit_transform(yearly_data['iso3'])
yearly_data['Country'] = le_country.fit_transform(yearly_data['Country'])

# Final country-year dataset
yearly_data_ready = yearly_data

yearly_data_ready.head()

Unnamed: 0,iso3,year,Gender Inequality Index,Gross National Income Per Capita,Income Inequality,Life Expectancy at Birth,Country,population_density,infant_mortality_rate,employment_rate,...,Mass Movement,Storm,Volcanic activity,Wave action,Wildfire,total_events,log_population_density,log_infant_mortality_rate,log_Gross National Income Per Capita,log_total_displacement
0,0,2013,1.568405,4932.186405,0.637987,-0.723258,0,1202.520865,32.7,-0.633035,...,-0.331839,-0.319172,-0.246807,-0.140649,-0.161492,-0.402225,1.024953,0.946095,-1.129732,0.984449
1,0,2014,1.508829,5205.479375,0.637987,-0.632356,0,1213.527917,31.4,-0.668423,...,-0.331839,-0.374753,-0.246807,-0.140649,-0.161492,-0.441365,1.030437,0.899557,-1.073118,0.838259
2,0,2015,1.469111,5582.619597,0.637987,-0.541798,0,1224.423285,30.0,-0.703293,...,-0.331839,-0.319172,-0.246807,-0.140649,-0.161492,-0.382655,1.035816,0.847303,-0.99969,0.83399
3,0,2016,1.429393,5987.015851,1.296524,-0.450382,0,1235.399339,28.6,-0.736709,...,-0.331839,-0.374753,-0.246807,-0.140649,-0.161492,-0.421795,1.041187,0.792634,-0.926272,0.861839
4,0,2017,1.323478,6228.122571,1.296524,-0.359136,0,1245.956419,27.4,-0.515869,...,-0.135854,-0.319172,-0.246807,-0.140649,-0.161492,-0.343515,1.046308,0.743675,-0.884824,0.945218


## Event-based ML

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

# Use the preprocessed event dataset
data = event_data_ready.copy()

# Features to use (exclude target)
feature_cols = [
    'iso3','Country','hazard_type_name',
    'Gender Inequality Index','Income Inequality','Life Expectancy at Birth',
    'employment_rate','log_population_density','log_infant_mortality_rate','log_Gross National Income Per Capita',
    'month','quarter'
]

X = data[feature_cols]
y = data['log_new_displacement']  # use the log-transformed target

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define XGBoost regressor
xgb_event = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train
xgb_event.fit(X_train, y_train)

# Predict
y_pred_log = xgb_event.predict(X_test)

# Convert predictions back to original scale
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Event-based model (log + normalized features) RMSE: {rmse:.2f}, R2: {r2:.2f}")

Event-based model (log + normalized features) RMSE: 2.53, R2: 0.23


### Hyperparameter tuning

Tis code uses Bayesian optimization. Sample code from https://medium.com/@dicee/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d.

In [8]:
# !pip install hyperopt

In [9]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'n_estimators': hp.quniform('n_estimators', 200, 800, 50)
}

def objective(params):

    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])

    model = XGBRegressor(
        **params,
        random_state=42,
        objective='reg:squarederror'
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return {
        'loss': rmse,
        'status': STATUS_OK
    }

trials = Trials()

best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters for Event-based model:", best_params)

  import pkg_resources


100%|██████████| 100/100 [02:02<00:00,  1.22s/trial, best loss: 0.7213510377069919]
Best hyperparameters for Event-based model: {'colsample_bytree': np.float64(0.8956639128344293), 'learning_rate': np.float64(0.010187370341807344), 'max_depth': np.float64(7.0), 'n_estimators': np.float64(550.0), 'subsample': np.float64(0.760514269496543)}


In [10]:
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

xgb_tuned = XGBRegressor(
    **best_params,
    random_state=42,
    objective='reg:squarederror'
)

xgb_tuned.fit(X_train, y_train)

y_pred_log = xgb_tuned.predict(X_test)

# Back to original scale
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Event-based Tuned model RMSE: {rmse:.2f}, R2: {r2:.2f}")


Event-based Tuned model RMSE: 2.50, R2: 0.25


### Feature importance

In [11]:
importance = xgb_tuned.get_booster().get_score(importance_type='gain')
imp_df = pd.DataFrame(importance.items(), columns=['Feature','Gain']).sort_values(by='Gain', ascending=False)
print(imp_df.head())

                                Feature       Gain
1                               Country  16.656828
0                                  iso3  11.497835
7                log_population_density   5.066794
9  log_Gross National Income Per Capita   4.904979
2                      hazard_type_name   3.491461


## Year-Based ML

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

# Use the preprocessed country-year dataset
data = yearly_data_ready.copy()

# Encode categorical columns
le_country = LabelEncoder()
data['iso3'] = le_country.fit_transform(data['iso3'])
data['Country'] = le_country.fit_transform(data['Country'])

# Define target: use log-transformed total displacement
y_disp = data['log_total_displacement']

# Define features: exclude target, year, and optionally hazard type counts
drop_cols = ['total_displacement','log_total_displacement','Country','year'] + list(hazard_types)
X = data.drop(columns=drop_cols)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_disp, test_size=0.2, random_state=42
)

# XGBoost regressor
xgb_year = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train
xgb_year.fit(X_train, y_train)

# Predict
y_pred_log = xgb_year.predict(X_test)

# Convert predictions back to original scale
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Country-Year model (log + normalized features) RMSE: {rmse:.2f}, R2: {r2:.2f}")


Country-Year model (log + normalized features) RMSE: 0.59, R2: 0.73


### Hyperparameter tuning

In [13]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'n_estimators': hp.quniform('n_estimators', 200, 800, 50)
}

def objective(params):

    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])

    model = XGBRegressor(
        **params,
        random_state=42,
        objective='reg:squarederror'
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return {
        'loss': rmse,
        'status': STATUS_OK
    }

trials = Trials()

best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters for Country-Year model:", best_params)

100%|██████████| 100/100 [00:29<00:00,  3.39trial/s, best loss: 0.333228114811884] 
Best hyperparameters for Country-Year model: {'colsample_bytree': np.float64(0.9905593551885842), 'learning_rate': np.float64(0.010515948466573109), 'max_depth': np.float64(3.0), 'n_estimators': np.float64(650.0), 'subsample': np.float64(0.8568381356995739)}


In [14]:
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

xgb_tuned = XGBRegressor(
    **best_params,
    random_state=42,
    objective='reg:squarederror'
)

xgb_tuned.fit(X_train, y_train)

y_pred_log = xgb_tuned.predict(X_test)

# Back to original scale
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Country-Year Tuned model RMSE: {rmse:.2f}, R2: {r2:.2f}")


Country-Year Tuned model RMSE: 0.53, R2: 0.78


### Feature importance

In [15]:
importance = xgb_tuned.get_booster().get_score(importance_type='gain')
imp_df = pd.DataFrame(importance.items(), columns=['Feature','Gain']).sort_values(by='Gain', ascending=False)
print(imp_df.head())

                            Feature      Gain
8                      total_events  9.301151
5                population_density  0.996300
2  Gross National Income Per Capita  0.990911
0                              iso3  0.760193
3                 Income Inequality  0.746912
