In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("Churn_Modelling.csv")

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
X = df.iloc[:, 3:-1]
y = df.iloc[:, -1]

X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


## Encoding the categorical features

Let's create some functions that encode the categorical features using one-hot encoding

In [6]:
categorical_feature_names = ["Geography", "Gender"]

### Approach 1: using pandas

Problem: we are not **fitting** a one-hot encoder, but rather encoding the data we receive on the fly. As a consequence, if not all feature values are represented in the data we are passing to our `predict_proba`, we are going to get an error. Furthermore, there is no elegant way to handle unseen values. That's why using the one-hot encoding from sklearn is better.

In [50]:
pd.get_dummies(X, columns=categorical_feature_names)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.00,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.80,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.00,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,1,0,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,1,0,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,1,0,0,1,0
9998,772,42,3,75075.31,2,1,0,92888.52,0,1,0,0,1


In [40]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
x_train_one_hot = pd.get_dummies(x_train, columns=categorical_feature_names)
x_val_one_hot = pd.get_dummies(x_val, columns=categorical_feature_names)

In [41]:
x_train_one_hot

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
7389,667,34,5,0.00,2,1,0,163830.64,0,0,1,1,0
9275,427,42,1,75681.52,1,1,1,57098.00,0,1,0,0,1
2995,535,29,2,112367.34,1,1,0,185630.76,1,0,0,1,0
5316,654,40,5,105683.63,1,1,0,173617.09,0,0,1,0,1
356,850,57,8,126776.30,2,1,1,132298.49,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,594,32,4,120074.97,2,1,1,162961.79,0,1,0,1,0
4859,794,22,4,114440.24,1,1,1,107753.07,0,0,1,1,0
3264,738,35,5,161274.05,2,1,0,181429.87,1,0,0,0,1
9845,590,38,9,0.00,2,1,1,148750.16,0,0,1,1,0


In [42]:
sklearn_model = LogisticRegression(random_state=1300)
sklearn_model.fit(x_train_one_hot, y_train)

In [43]:
def predict_proba(model, input_features: np.ndarray, column_names, categorical_feature_names):
    df = pd.DataFrame(input_features, columns=column_names)
    encoded_df = pd.get_dummies(df, columns=categorical_feature_names)
    return model.predict_proba(encoded_df.to_numpy())

In [47]:
predict_proba(model=sklearn_model, 
              input_features=x_val.iloc[:3, :].to_numpy(), 
              column_names=list(X.columns), 
              categorical_feature_names=["Geography", "Gender"])



ValueError: X has 12 features, but LogisticRegression is expecting 13 features as input.

### Approach 2: using sklearn

In [73]:
def encode_one_hot(df, categorical_feature_names):
    from sklearn.preprocessing import OneHotEncoder
    
    df = df.copy(True)
    df.reset_index(drop=True, inplace=True)
    
    for categorical_feature in categorical_feature_names:
        print(f"Encoding {categorical_feature}")
        enc = OneHotEncoder(handle_unknown="ignore")
        enc.fit(df[categorical_feature].to_numpy().reshape(-1, 1))
        print(f"The values encountered are: {enc.categories_}")
        enc_column = enc.transform(df[categorical_feature].to_numpy().reshape(-1, 1)).toarray()
        print(f"Sample encoded features: {enc_column[:3, :]}")
        print(f"Joining the dfs")
        enc_df = pd.DataFrame(enc_column, columns=enc.get_feature_names_out())
        df = df.join(enc_df)
        df = df.drop(columns=categorical_feature)
        print(f"Successfully encoded {categorical_feature}")
        print("-------------------------------------------")
    
    return df

In [74]:
encode_one_hot(X, categorical_feature_names)

Encoding Geography
The values encountered are: [array(['France', 'Germany', 'Spain'], dtype=object)]
Sample encoded features: [[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
The values encountered are: [array(['Female', 'Male'], dtype=object)]
Sample encoded features: [[1. 0.]
 [1. 0.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,x0_France,x0_Germany,x0_Spain,x0_Female,x0_Male
0,619,42,2,0.00,1,1,1,101348.88,1.0,0.0,0.0,1.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0.0,0.0,1.0,1.0,0.0
2,502,42,8,159660.80,3,1,0,113931.57,1.0,0.0,0.0,1.0,0.0
3,699,39,1,0.00,2,0,0,93826.63,1.0,0.0,0.0,1.0,0.0
4,850,43,2,125510.82,1,1,1,79084.10,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,1.0,0.0,0.0,0.0,1.0
9996,516,35,10,57369.61,1,1,1,101699.77,1.0,0.0,0.0,0.0,1.0
9997,709,36,7,0.00,1,0,1,42085.58,1.0,0.0,0.0,1.0,0.0
9998,772,42,3,75075.31,2,1,0,92888.52,0.0,1.0,0.0,0.0,1.0


In [75]:
categorical_feature_names = ["Geography", "Gender"]
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
x_train_one_hot = encode_one_hot(x_train, categorical_feature_names)
x_val_one_hot = encode_one_hot(x_val, categorical_feature_names)

Encoding Geography
The values encountered are: [array(['France', 'Germany', 'Spain'], dtype=object)]
Sample encoded features: [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
The values encountered are: [array(['Female', 'Male'], dtype=object)]
Sample encoded features: [[1. 0.]
 [0. 1.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------
Encoding Geography
The values encountered are: [array(['France', 'Germany', 'Spain'], dtype=object)]
Sample encoded features: [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
The values encountered are: [array(['Female', 'Male'], dtype=object)]
Sample encoded features: [[1. 0.]
 [1. 0.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------


In [76]:
sklearn_model = LogisticRegression(random_state=1300)
sklearn_model.fit(x_train_one_hot, y_train)

In [77]:
def predict_proba(model, input_features: np.ndarray, column_names, categorical_feature_names, encode_one_hot):
    print(f"Feeding data to the model")
    df = pd.DataFrame(input_features, columns=column_names)
    
    encoded_df = encode_one_hot(df, categorical_feature_names)
    
    print(f"Tome!")
    return model.predict_proba(encoded_df.to_numpy())

In [79]:
predict_proba(model=sklearn_model, 
              input_features=x_val.iloc[:3, :].to_numpy(), 
              column_names=list(X.columns), 
              categorical_feature_names=["Geography", "Gender"],
             encode_one_hot=encode_one_hot)

Feeding data to the model
Encoding Geography
The values encountered are: [array(['France', 'Germany', 'Spain'], dtype=object)]
Sample encoded features: [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
The values encountered are: [array(['Female'], dtype=object)]
Sample encoded features: [[1.]
 [1.]
 [1.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------
Tome!




ValueError: X has 12 features, but LogisticRegression is expecting 13 features as input.