In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer, precision_score, f1_score

In [None]:
file_id = '1nCbwq2PktId7IlJmlR5kGn7vvpgiE8vW'
file_url = f'https://drive.google.com/uc?id={file_id}'
data = pd.read_csv(file_url)

In [None]:
data=data.drop('ID',axis=1)

In [None]:
data.isna().sum()

Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [None]:
# Assuming 'data' is the DataFrame that contains both  lead_1_data and lead_0_data

# Fill missing values in 'Credit_Product' column with 'Yes' where Lead is 1
data.loc[data['Is_Lead']==1,'Credit_Product']=data.loc[data['Is_Lead'] == 1, 'Credit_Product'].fillna('Yes')

# Fill missing values in 'Credit_Product' column with 'No' where Lead is 0
data.loc[data['Is_Lead']==0,'Credit_Product']=data.loc[data['Is_Lead'] == 0, 'Credit_Product'].fillna('No')

In [None]:
data.isna().sum()

Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
Is_Lead                0
dtype: int64

In [None]:
data['Region_Code'] = data['Region_Code'].str.replace('RG', '').astype(int)

In [None]:
data['Channel_Code'] = data['Channel_Code'].str.replace('X', '').astype(int)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Select the categorical columns to encode
categorical_columns = ['Gender', 'Occupation', 'Is_Active', 'Credit_Product']

# Create a OneHotEncoder instance
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform the encoder on the selected categorical columns
encoded_columns = encoder.fit_transform(data[categorical_columns])

# Create a DataFrame from the encoded_columns array with column names
encoded_data = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns from 'data'
data.drop(categorical_columns, axis=1, inplace=True)

# Concatenate the original 'data' DataFrame with the encoded_data DataFrame
data = pd.concat([data, encoded_data], axis=1)


In [None]:
#transformation technique
#log transformation:

# Log transformation on the "avg_account_balance" column
data["Avg_Account_Balance"] = data["Avg_Account_Balance"].map(lambda i: np.log(i) if i > 0 else 0)

In [None]:
data

Unnamed: 0,Age,Region_Code,Channel_Code,Vintage,Avg_Account_Balance,Is_Lead,Gender_Male,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Is_Active_Yes,Credit_Product_Yes
0,73,268,3,43,13.860193,0,0.0,1.0,0.0,0.0,0.0,0.0
1,30,277,1,32,13.274205,0,0.0,0.0,1.0,0.0,0.0,0.0
2,56,268,3,26,14.210464,0,0.0,0.0,0.0,1.0,1.0,0.0
3,34,270,1,19,13.061453,0,1.0,0.0,1.0,0.0,0.0,0.0
4,30,282,1,33,13.695360,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
245720,51,284,3,109,14.470741,0,1.0,0.0,0.0,1.0,0.0,0.0
245721,27,268,1,15,13.668114,0,1.0,0.0,1.0,0.0,1.0,0.0
245722,26,281,1,13,13.416016,0,0.0,0.0,1.0,0.0,0.0,0.0
245723,28,273,1,31,12.917806,0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
#Assuming 'data' is your DataFrame, and 'Age' is the column representing ages
 # Define the bin edges and corresponding labels
age_bins = [0, 30, 50, float('inf')]  # Customize the age bins as per your preference
age_labels = ['Young', 'Middle-aged', 'Senior']

# Create the 'Age_Group' column based on the age bins
data['Age_Group'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=False)

In [None]:
data.drop(columns=['Age'], inplace=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Select the categorical columns to encode
categorical_columns = [ 'Age_Group']

# Create a OneHotEncoder instance
encoder = OneHotEncoder(drop='first', sparse=False)

# Fit and transform the encoder on the selected categorical columns
encoded_columns = encoder.fit_transform(data[categorical_columns])

# Create a DataFrame from the encoded_columns array with column names
encoded_data = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns from 'data'
data.drop(categorical_columns, axis=1, inplace=True)

# Concatenate the original 'data' DataFrame with the encoded_data DataFrame
data = pd.concat([data, encoded_data], axis=1)




In [None]:
data

Unnamed: 0,Region_Code,Channel_Code,Vintage,Avg_Account_Balance,Is_Lead,Gender_Male,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Is_Active_Yes,Credit_Product_Yes,Age_Group_Senior,Age_Group_Young
0,268,3,43,13.860193,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,277,1,32,13.274205,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,268,3,26,14.210464,0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,270,1,19,13.061453,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,282,1,33,13.695360,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,284,3,109,14.470741,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
245721,268,1,15,13.668114,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
245722,281,1,13,13.416016,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
245723,273,1,31,12.917806,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
x=data.drop('Is_Lead',axis=1)

In [None]:
y=data['Is_Lead']

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# ***HYPERPARAMETER TUNING ***

# GBM Using SMOTE

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

# Define the parameter grid for GBM
gbm_params = {
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': np.arange(50, 200, 10),
    'subsample': [0.8, 0.9, 1.0],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [None, 42],
}

# Initialize the GBM classifier
gbm_model = GradientBoostingClassifier()

# Initialize SMOTE
smote = SMOTE()

# Apply SMOTE to the training data
x_train_smote, y_train_smote = smote.fit_resample(x, y)

# Create a RandomizedSearchCV instance with cross-validation
cv_model_gbm = RandomizedSearchCV(estimator=gbm_model, param_distributions=gbm_params,
                                  scoring='f1', n_jobs=-1, return_train_score=True,
                                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))

# Fit the RandomizedSearchCV on the SMOTE-augmented training data
cv_model_gbm.fit(x_train_smote, y_train_smote)

# Get the best parameters from the search
best_params_gbm = cv_model_gbm.best_params_

# Print the best parameters
print("Best Parameters for GBM (F1 Score):", best_params_gbm)

Best Parameters for GBM (F1 Score): {'subsample': 1.0, 'random_state': None, 'n_estimators': 140, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 3, 'learning_rate': 0.2}


In [None]:
from sklearn.metrics import make_scorer, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
# Create a SMOTE instance
smote = SMOTE(random_state=42)

# Initialize StratifiedKFold for stratified cross-validation with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb_params = {'subsample': 1.0,
             'random_state': None,
             'n_estimators': 140,
             'min_samples_split': 10,
             'min_samples_leaf': 4,
             'max_features': None,
             'max_depth': 3,
             'learning_rate': 0.2
             }

# Initialize lists to store precision and F1 scores
precision_scores_gb = []
f1_scores_gb = []

# Perform stratified cross-validation
for train_index, test_index in stratified_kfold.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to the training data
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Initialize the Gradient Boosting classifier
    gb_classifier = GradientBoostingClassifier(**gb_params)

    # Fit the classifier on the SMOTE-augmented training data
    gb_classifier.fit(x_train_smote, y_train_smote)

    # Predict on the test data
    y_pred = gb_classifier.predict(x_test)

    # Calculate precision and F1 scores
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision_scores_gb.append(precision)
    f1_scores_gb.append(f1)

# Print cross-validation precision scores for Gradient Boosting
print("Cross-Validation Precision Scores (Gradient Boosting):", precision_scores_gb)
print("Mean Precision (Gradient Boosting):", sum(precision_scores_gb) / len(precision_scores_gb))
print("Standard Deviation (Precision) (Gradient Boosting):", np.std(precision_scores_gb))

# Print cross-validation F1 scores for Gradient Boosting
print("Cross-Validation F1 Scores (Gradient Boosting):", f1_scores_gb)
print("Mean F1 Score (Gradient Boosting):", sum(f1_scores_gb) / len(f1_scores_gb))
print("Standard Deviation (F1) (Gradient Boosting):", np.std(f1_scores_gb))

Cross-Validation Precision Scores (Gradient Boosting): [0.5959380378657487, 0.5950145007595636, 0.5963682254785109, 0.5972608453837598, 0.604588963963964]
Mean Precision (Gradient Boosting): 0.5978341146903093
Standard Deviation (Precision) (Gradient Boosting): 0.003454006011343435
Cross-Validation F1 Scores (Gradient Boosting): [0.6612176304331221, 0.6593213206320059, 0.6563018402067823, 0.6597803548114585, 0.6641923760921674]
Mean F1 Score (Gradient Boosting): 0.6601627044351073
Standard Deviation (F1) (Gradient Boosting): 0.0025733834946772565
