#Crashout 2.0

Predicting whether or not an African country will have a banking crisis.

We start by importing necessary libraries

In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

from imblearn.over_sampling import SMOTE
from google.colab import drive

Let's import the data set from my drive

In [2]:
drive.mount('/content/drive')
path = "/content/drive/MyDrive/ALU/datasets/african_crises2.csv"
dataset = pd.read_csv(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Let's preview some of the info

In [3]:
dataset.shape

(1059, 14)

In [4]:
dataset.head(10)

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis
5,1,DZA,Algeria,1875,0,0.051546,0,0,0.0,-20.924178,0,0,0,no_crisis
6,1,DZA,Algeria,1876,0,0.051867,0,0,0.0,-1.769547,0,0,0,no_crisis
7,1,DZA,Algeria,1877,0,0.051867,0,0,0.0,29.116045,0,0,1,no_crisis
8,1,DZA,Algeria,1878,0,0.051948,0,0,0.0,-1.492537,0,0,0,no_crisis
9,1,DZA,Algeria,1879,0,0.052029,0,0,0.0,-16.831357,0,0,0,no_crisis


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

#Preprocessing



Handle missing values

In [6]:
print(dataset.isnull().mean() * 100)


country_number                     0.0
country_code                       0.0
country                            0.0
year                               0.0
systemic_crisis                    0.0
exch_usd                           0.0
domestic_debt_in_default           0.0
sovereign_external_debt_default    0.0
gdp_weighted_default               0.0
inflation_annual_cpi               0.0
independence                       0.0
currency_crises                    0.0
inflation_crises                   0.0
banking_crisis                     0.0
dtype: float64


We find no missing values here.

Convert the target (banking_crisis) column to binary. 1 for crisis 0 for no crisit.

In [7]:
dataset['banking_crisis'] = dataset['banking_crisis'].replace(['no_crisis','crisis'],[0,1])

  dataset['banking_crisis'] = dataset['banking_crisis'].replace(['no_crisis','crisis'],[0,1])


Since we don't have many countries, we can afford to perform on hot encoding. we also don't need 2 country columns.

In [8]:
le = LabelEncoder()
dataset['country'] = le.fit_transform(dataset['country'])
dataset.drop(columns=['country','country_code'],inplace=True)
dataset.head(1000)

Unnamed: 0,country_number,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,1870,1,5.226400e-02,0,0,0.0,3.441456,0,0,0,1
1,1,1871,0,5.279800e-02,0,0,0.0,14.149140,0,0,0,0
2,1,1872,0,5.227400e-02,0,0,0.0,-3.718593,0,0,0,0
3,1,1873,0,5.168000e-02,0,0,0.0,11.203897,0,0,0,0
4,1,1874,0,5.130800e-02,0,0,0.0,-3.848561,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,70,1950,0,7.140000e-27,0,0,0.0,7.692308,0,0,0,0
996,70,1951,0,7.140000e-27,0,0,0.0,6.964286,0,0,0,0
997,70,1952,0,7.140000e-27,0,0,0.0,8.180301,0,0,0,0
998,70,1953,0,7.140000e-27,0,0,0.0,2.777778,0,0,0,0


Now we need to scale features like exch_usd,inflaction_annual_cpi and gdp

In [9]:
numerical_columns = ['exch_usd','gdp_weighted_default','inflation_annual_cpi']

scaler = StandardScaler()
dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])

#Spit & Train

Set target and some feature selection

In [10]:
#setting a target, y
y = dataset['banking_crisis']

#setting features, x
X = dataset.drop(columns=['banking_crisis'])
print(X )
selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=8, step=1)
selector = selector.fit(X, y)
selected_features = X.columns[selector.support_]
X = dataset[selected_features]
#print(selected_features)


      country_number  year  systemic_crisis  exch_usd  \
0                  1  1870                1 -0.386713   
1                  1  1871                0 -0.386708   
2                  1  1872                0 -0.386712   
3                  1  1873                0 -0.386718   
4                  1  1874                0 -0.386721   
...              ...   ...              ...       ...   
1054              70  2009                1  2.797088   
1055              70  2010                0  3.007099   
1056              70  2011                0  2.860809   
1057              70  2012                0  2.860809   
1058              70  2013                0  2.860809   

      domestic_debt_in_default  sovereign_external_debt_default  \
0                            0                                0   
1                            0                                0   
2                            0                                0   
3                            0                 

split

In [11]:
X_train, X_test, y_train, y_test =  train_test_split(X,y,test_size=0.25,random_state=42)



Parameter tuning

In [12]:
param_grid = {
    'n_estimators':[50,100,200],
    'max_depth': [None, 10,20],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4]
}


grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5,scoring='f1')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


Let the training begin!

In [13]:
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=1,min_samples_split=2,class_weight='balanced',n_estimators= 50)
rf.fit(X_train,y_train)

Make Prediction

In [14]:
y_pred = rf.predict(X_test)

Evaluate prediction

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       244
           1       0.91      1.00      0.95        21

    accuracy                           0.99       265
   macro avg       0.96      1.00      0.98       265
weighted avg       0.99      0.99      0.99       265



In [18]:
from sklearn.metrics import roc_auc_score

y_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")


ROC AUC Score: 0.9991


In [20]:
import joblib
joblib.dump(rf, 'random_forest_model.pkl')


['random_forest_model.pkl']