# set the imports 

In [7]:
# set my imports
# Importing the Pandas library for data manipulation and analysis.
import pandas as pd

# Importing RandomizedSearchCV for hyperparameter tuning and cross_val_score for evaluating model performance.
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Importing the RandomForestClassifier from sklearn's ensemble module for classification tasks using Random Forest algorithm.
from sklearn.ensemble import RandomForestClassifier

# Importing XGBClassifier from XGBoost library for gradient boosting on decision trees.
from xgboost import XGBClassifier

# Importing randint and uniform from scipy.stats for generating random integers and uniform distributions for hyperparameter tuning.
from scipy.stats import randint, uniform

# Importing the custom HelperFunctions class from the lib.utilities.help_functions module for data preprocessing utilities.
from lib.utilities.help_functions import HelperFunctions

# Importing joblib for saving and loading Python objects efficiently, especially large numpy arrays.
import joblib

# Importing pickle for serializing and deserializing Python objects, commonly used for saving trained models.
import pickle



# load  my dataset 

In [8]:
data = pd.read_csv("datasets/train.csv")

In [9]:
data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


# Perform Explorative Data  Analysis  

In [10]:
HelperFunctions().generate_profile_report(data , "ydata_settings_folder/ydata_profiling_settings.yaml" ,"eda_output_folder/eda_report.html" )


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# PreProcess My  Dataset 

In [12]:
# train_data = pd.read_csv("train.csv")
df = data.copy() 

preprocessed_data_set   =HelperFunctions().scale_dataframe(HelperFunctions().encode_features(df.drop(columns=['CustomerId', 'Surname']) ,global_encoding_method ="one-hot" ), method='minmax') 





# display my preprocessed dataset 

In [13]:
preprocessed_data_set.head()

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.0,0.636,0.202703,0.3,0.0,0.333333,1.0,0.0,0.907279,0.0,1.0,0.0,0.0,0.0,1.0
1,6e-06,0.554,0.202703,0.1,0.0,0.333333,1.0,1.0,0.247483,0.0,1.0,0.0,0.0,0.0,1.0
2,1.2e-05,0.656,0.297297,1.0,0.0,0.333333,1.0,0.0,0.924364,0.0,1.0,0.0,0.0,0.0,1.0
3,1.8e-05,0.462,0.216216,0.2,0.593398,0.0,1.0,1.0,0.422787,0.0,1.0,0.0,0.0,0.0,1.0
4,2.4e-05,0.732,0.202703,0.5,0.0,0.333333,1.0,1.0,0.075293,0.0,0.0,0.0,1.0,0.0,1.0


# split my dataset to train and validation 

In [14]:
X_train = preprocessed_data_set.drop(columns=['Exited'])
y_train = preprocessed_data_set['Exited']

# create my  random forest and xgb boost models  and   tune  the hyperparamters

In [15]:
param_dist_rf = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 20),
}

rf_model = RandomForestClassifier(random_state=42)


rf_random = RandomizedSearchCV(rf_model, param_distributions=param_dist_rf, n_iter=10, cv=5, scoring='accuracy', random_state=42)
rf_random.fit(X_train, y_train)
best_rf_model = rf_random.best_estimator_

In [16]:
param_dist_xgb = {
    'n_estimators': randint(50, 200),
    'max_depth': [3, 5, 7, 9],
    'learning_rate': uniform(0.01, 0.2),
}

xgb_model = XGBClassifier(random_state=42)


xgb_random = RandomizedSearchCV(xgb_model, param_distributions=param_dist_xgb, n_iter=10, cv=5, scoring='accuracy', random_state=42)
xgb_random.fit(X_train, y_train)
best_xgb_model = xgb_random.best_estimator_

In [17]:
models = [
    ('RandomForest', best_rf_model),
    ('XGBoost', best_xgb_model)
]

for name, model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f'{name} - Accuracy: {scores.mean()} (Std: {scores.std()})')

RandomForest - Accuracy: 0.8487218718738727 (Std: 0.019076120284831244)
XGBoost - Accuracy: 0.8632221172234915 (Std: 0.001616672491019577)


# save the best Model to use it later for my Bank chrun predicton App  

In [51]:
best_model = best_xgb_model  # Access the classifier part of the pipeline

best_model.fit(X_train, y_train)


# Save the trained model using joblib
joblib.dump(best_model, 'trained_models/trained_model.joblib')


# Saving the model
with open('trained_models/trained_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)


# import my test dataset and perform prediction on it using the trained model 

In [64]:
test_dataset = pd.read_csv("datasets/test.csv")

In [85]:
test_dataset.head().iloc[0 , : ]


id                    165034
CustomerId          15773898
Surname             Lucchese
CreditScore              586
Geography             France
Gender                Female
Age                     23.0
Tenure                     2
Balance                  0.0
NumOfProducts              2
HasCrCard                0.0
IsActiveMember           1.0
EstimatedSalary    160976.75
Name: 0, dtype: object

In [None]:
costumer_id = test_dataset["id"]

In [91]:
# Prepare the input data (this keeps it as a DataFrame)
processed_input = HelperFunctions().scale_dataframe(
    HelperFunctions().encode_features(
        test_dataset.copy().drop(columns=["CustomerId", "Surname"]),
        global_encoding_method="one-hot"
    ),
    method='minmax'
)

print(processed_input.iloc[[0]])
# Select the first row while keeping it as a DataFrame
prediction_prob = best_model.predict_proba(processed_input.iloc[[0]])  # Use [[0]] to keep it as DataFrame


    id  CreditScore       Age  Tenure  Balance  NumOfProducts  HasCrCard  \
0  0.0        0.472  0.067568     0.2      0.0       0.333333        0.0   

   IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany  \
0             1.0         0.804903               1.0                0.0   

   Geography_Spain  Gender_Female  Gender_Male  
0              0.0            1.0          0.0  


In [92]:
prediction_prob

array([[0.9746609 , 0.02533911]], dtype=float32)

In [88]:
HelperFunctions().scale_dataframe(HelperFunctions().encode_features(test_dataset.copy().drop(columns=["CustomerId","Surname"]) ,global_encoding_method ="one-hot" ), method='minmax').iloc[0 , : ]

id                   0.000000
CreditScore          0.472000
Age                  0.067568
Tenure               0.200000
Balance              0.000000
NumOfProducts        0.333333
HasCrCard            0.000000
IsActiveMember       1.000000
EstimatedSalary      0.804903
Geography_France     1.000000
Geography_Germany    0.000000
Geography_Spain      0.000000
Gender_Female        1.000000
Gender_Male          0.000000
Name: 0, dtype: float64

In [74]:
prediction_prob[: , 1]

array([0.02533911, 0.7787184 , 0.0429614 , ..., 0.0277605 , 0.14928049,
       0.2317597 ], dtype=float32)

In [75]:
test_result_df = pd.DataFrame({'id': costumer_id, 'Exited': prediction_prob[: , 1]})


In [77]:
test_result_df

Unnamed: 0,id,Exited
0,165034,0.025339
1,165035,0.778718
2,165036,0.042961
3,165037,0.280265
4,165038,0.340417
...,...,...
110018,275052,0.053609
110019,275053,0.103147
110020,275054,0.027760
110021,275055,0.149280
