In [39]:
import pandas as pd
import joblib

In [40]:
# Input artefacts, addressed relative to the notebook's working directory.
MODEL_PATH = '../outputs/model/RF/rf_fine_tuned_model.pkl'
CUSTOMERS_CSV = '../data/raw/ebi_exp_customers.csv'

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model artefacts from a trusted source.
model = joblib.load(MODEL_PATH)

# The file is semicolon-separated; the first physical line is skipped so the
# header is read from the second line.
df = pd.read_csv(CUSTOMERS_CSV, sep=';', skiprows=1)

In [41]:
# Display the customer table as loaded from the CSV (pandas truncates long frames).
df

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,15575430,579,France,Female,33,1,118392.75,1,1,1,157564.75
1,15793890,728,France,Female,59,4,0.00,1,1,1,163365.85
2,15701166,660,France,Male,40,5,131754.11,2,1,1,38761.61
3,15624677,543,Germany,Female,37,3,122304.65,2,0,0,33998.70
4,15586264,572,France,Male,43,2,140431.98,1,1,0,26450.57
...,...,...,...,...,...,...,...,...,...,...,...
995,15691624,820,France,Male,33,2,132150.26,2,1,0,23067.97
996,15799790,763,France,Male,35,9,0.00,1,1,1,31372.91
997,15788224,669,Germany,Male,45,1,123949.75,1,0,0,110881.56
998,15619016,660,Germany,Male,46,5,109019.65,2,1,1,33680.56


In [42]:
# Drop the customer_id column if it is present. errors='ignore' turns the call
# into a no-op (instead of raising KeyError) when the column is missing, e.g.
# when this cell is re-run after the column has already been removed.
df = df.drop(columns=['customer_id'], errors='ignore')

In [43]:
# Display the frame again to check its columns after the customer_id drop.
df

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,579,France,Female,33,1,118392.75,1,1,1,157564.75
1,728,France,Female,59,4,0.00,1,1,1,163365.85
2,660,France,Male,40,5,131754.11,2,1,1,38761.61
3,543,Germany,Female,37,3,122304.65,2,0,0,33998.70
4,572,France,Male,43,2,140431.98,1,1,0,26450.57
...,...,...,...,...,...,...,...,...,...,...
995,820,France,Male,33,2,132150.26,2,1,0,23067.97
996,763,France,Male,35,9,0.00,1,1,1,31372.91
997,669,Germany,Male,45,1,123949.75,1,0,0,110881.56
998,660,Germany,Male,46,5,109019.65,2,1,1,33680.56


In [44]:
# Encode the categorical features against a fixed, explicit vocabulary so the
# resulting columns do not depend on which values happen to occur in this
# particular batch of customers. This cell expects the raw (string-valued)
# 'gender' and 'country' columns.
map_gender = {
    'Male': 1,
    'Female': 0
}

# NOTE(review): category list taken from the country values / dummy columns
# visible in this notebook's outputs — confirm it matches the encoding that
# was used when the model was trained.
country_categories = ['France', 'Germany', 'Spain']

# Fail loudly on unseen labels instead of letting them silently become NaN
# (gender) or extra / missing dummy columns (country). Missing values are left
# alone, as before.
unknown_gender = set(df['gender'].dropna().unique()) - set(map_gender)
if unknown_gender:
    raise ValueError(f"Unexpected gender values: {sorted(map(str, unknown_gender))}")

unknown_country = set(df['country'].dropna().unique()) - set(country_categories)
if unknown_country:
    raise ValueError(f"Unexpected country values: {sorted(map(str, unknown_country))}")

df['gender'] = df['gender'].map(map_gender)

# With a fixed categorical dtype, get_dummies always emits country_Germany and
# country_Spain (the first category, France, is dropped as the baseline), even
# when one of the countries is absent from the data being scored.
df = pd.get_dummies(
    df.assign(country=pd.Categorical(df['country'], categories=country_categories)),
    columns=['country'],
    drop_first=True,
)


In [45]:
# Display the frame after mapping gender to 0/1 and one-hot encoding country.
df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_Germany,country_Spain
0,579,0,33,1,118392.75,1,1,1,157564.75,False,False
1,728,0,59,4,0.00,1,1,1,163365.85,False,False
2,660,1,40,5,131754.11,2,1,1,38761.61,False,False
3,543,0,37,3,122304.65,2,0,0,33998.70,True,False
4,572,1,43,2,140431.98,1,1,0,26450.57,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,820,1,33,2,132150.26,2,1,0,23067.97,False,False
996,763,1,35,9,0.00,1,1,1,31372.91,False,False
997,669,1,45,1,123949.75,1,0,0,110881.56,True,False
998,660,1,46,5,109019.65,2,1,1,33680.56,True,False


In [48]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# One scaler instance per scaling strategy; both start out unfitted and are
# fitted in the cell that applies them.
standard_scaler, robust_scaler = StandardScaler(), RobustScaler()


In [49]:
# NOTE(review): fit_transform re-estimates the scaling parameters from the
# data being scored. The loaded model was presumably trained on features
# scaled with training-set statistics; if so, the scalers fitted during
# training should be persisted and applied here with .transform() instead —
# confirm against the training notebook.
standard_scaled_cols = ['credit_score']
robust_scaled_cols = ['balance', 'estimated_salary', 'age']

# Standard scaling for credit_score, robust scaling for the remaining
# continuous columns; the scaled values overwrite the originals in df.
df[standard_scaled_cols] = standard_scaler.fit_transform(df[standard_scaled_cols])
df[robust_scaled_cols] = robust_scaler.fit_transform(df[robust_scaled_cols])
df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_Germany,country_Spain
0,-0.748157,0,-0.307692,1,0.168979,1,1,1,0.627520,False,False
1,0.782951,0,1.692308,4,-0.752413,1,1,1,0.687498,False,False
2,0.084190,1,0.230769,5,0.272964,2,1,1,-0.600779,False,False
3,-1.118089,0,0.000000,3,0.199424,2,0,0,-0.650022,True,False
4,-0.820088,1,0.461538,2,0.340500,1,1,0,-0.728062,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,1.728332,1,-0.307692,2,0.276047,2,1,0,-0.763035,False,False
996,1.142607,1,-0.153846,9,-0.752413,1,1,1,-0.677170,False,False
997,0.176673,1,0.615385,1,0.212227,1,0,0,0.144865,True,False
998,0.084190,1,0.692308,5,0.096033,2,1,1,-0.653312,True,False


In [50]:
# Build the feature matrix explicitly, leaving out the prediction columns that
# a previous run of the notebook may already have appended to df. Without
# this, re-running the cell would feed the model its own outputs as features.
features = df.drop(columns=['churn_prediction', 'churn_probability'], errors='ignore')

# Hard class label per customer.
prediction = model.predict(features)

# Column 1 of predict_proba — presumably the probability of label 1, given the
# 0/1 labels seen in the predictions; verify against model.classes_.
probability = model.predict_proba(features)[:, 1]

In [53]:
# Append the model outputs as two new columns (label first, then probability).
df = df.assign(
    churn_prediction=prediction,
    churn_probability=probability,
)

In [54]:
# Display the frame with the churn_prediction and churn_probability columns added.
df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_Germany,country_Spain,churn_prediction,churn_probability
0,-0.748157,0,-0.307692,1,0.168979,1,1,1,0.627520,False,False,0,0.284479
1,0.782951,0,1.692308,4,-0.752413,1,1,1,0.687498,False,False,1,0.609336
2,0.084190,1,0.230769,5,0.272964,2,1,1,-0.600779,False,False,0,0.264533
3,-1.118089,0,0.000000,3,0.199424,2,0,0,-0.650022,True,False,0,0.442120
4,-0.820088,1,0.461538,2,0.340500,1,1,0,-0.728062,False,False,1,0.706770
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.728332,1,-0.307692,2,0.276047,2,1,0,-0.763035,False,False,0,0.228524
996,1.142607,1,-0.153846,9,-0.752413,1,1,1,-0.677170,False,False,0,0.280597
997,0.176673,1,0.615385,1,0.212227,1,0,0,0.144865,True,False,1,0.808736
998,0.084190,1,0.692308,5,0.096033,2,1,1,-0.653312,True,False,1,0.538217


In [55]:
# Tally how many rows received each predicted label.
predicted_labels = df['churn_prediction']
predicted_labels.value_counts()

churn_prediction
0    693
1    307
Name: count, dtype: int64