In [5]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, log_loss, classification_report
)
import warnings

In [6]:
%run 2_EDA.ipynb

  brizo_data = pd.read_csv(file_path, encoding="latin-1")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9216 entries, 0 to 9215
Columns: 121 entries, Brizo_ID to Retention_Propensity
dtypes: float64(10), int64(23), object(88)
memory usage: 8.5+ MB
   Restaurant_ID  Retention_Propensity Popmenu_Location_Segment  \
0          17705                 96.28          Single Location   
1          46892                 98.09          Single Location   
2          10302                 99.21          Single Location   
3           8691                 97.49          Single Location   
4          50959                 97.35          Single Location   

                Business_Type State/Province  Density_@5mi Price_Range  \
0                  Restaurant        Florida           324        $$$$   
1                  Restaurant        Arizona           771        $$$$   
2                  Restaurant       Illinois          6367        $$$$   
3                  Restaurant          Texas          4398          $$   
4  Drinking Place, Restaurant    

In [7]:
lead_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8948 entries, 0 to 9215
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Restaurant_ID                 8948 non-null   object 
 1   Retention_Propensity          8948 non-null   float64
 2   Popmenu_Location_Segment      8948 non-null   object 
 3   State/Province                8948 non-null   object 
 4   Density_@5mi                  8948 non-null   int64  
 5   Median_Price                  8948 non-null   int64  
 6   Cuisines_(Continental)        8948 non-null   object 
 7   State/Province_Encoded        8948 non-null   int64  
 8   Price_Range_Encoded           8948 non-null   int64  
 9   Instagram_Followers           8948 non-null   float64
 10  Facebook_Followers            8948 non-null   float64
 11  Google_Review_Score           8948 non-null   float64
 12  OpenTable_Review_Score        8948 non-null   float64
 13  TripAdvi

In [8]:
# Target Feature
lead_fit_score = (lead_data["Retention_Propensity"] >= 99).astype(int)

print(lead_fit_score.value_counts())

Retention_Propensity
0    7332
1    1616
Name: count, dtype: int64


In [9]:
# Drop columns if they exist in the DataFrame
columns_to_drop = [
    "Restaurant_ID",
    "Retention_Propensity",
    "Popmenu_Location_Segment",
    "State/Province",
    "Median_Price",
    "Cuisines_(Continental)"
]

lead_data = lead_data.drop(columns=[col for col in columns_to_drop if col in lead_data.columns], errors='ignore')

# Display the first 5 rows
lead_data.head(5)

Unnamed: 0,Density_@5mi,State/Province_Encoded,Price_Range_Encoded,Instagram_Followers,Facebook_Followers,Google_Review_Score,OpenTable_Review_Score,TripAdvisor_Review_Score,Facebook_Review_Score,DoorDash_Review_Score,...,Tech_Overlapping_Caviar,Tech_Overlapping_Paytronix,Cusine_North_American,Cusine_European,Cusine_Latin_American,Cusine_Asian,Business_Type_Restaurant,Business_Type_Drinking_Place,Business_Type_Quick_Service,Business_Type_Cafe
0,324,9,3,0.0,32000.0,4.5,4.7,0.0,0.0,0.0,...,0,0,0,0,0,1,1,0,0,0
1,771,2,3,1896.0,8443.0,4.7,4.8,4.5,4.7,0.0,...,0,0,1,0,0,0,1,0,0,0
2,6367,13,3,16983.0,14000.0,4.6,4.9,4.0,0.0,0.0,...,0,0,0,1,0,0,1,0,0,0
3,4398,44,1,2750.0,6500.0,4.4,4.8,4.0,0.0,5.0,...,1,0,1,0,0,0,1,0,0,0
4,1712,9,1,8357.0,25341.0,4.3,4.5,4.0,4.4,4.4,...,0,0,0,0,0,1,1,1,0,0


In [10]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    lead_data, lead_fit_score, test_size=0.2, stratify=lead_fit_score, random_state=42
)

# Train logistic regression model
model = LogisticRegression(class_weight='balanced', max_iter=500)
model.fit(X_train, y_train)

# Get predicted probabilities for the positive class (1)
y_prob = model.predict_proba(X_test)[:, 1]

# Print probabilities instead of binary predictions
print("Predicted probabilities:")
print(y_prob)

Predicted probabilities:
[0.47521222 0.49901435 0.48822967 ... 0.49055187 0.50714279 0.33921141]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train
model.fit(X_train, y_train)

# Evaluate
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Log Loss:", log_loss(y_test, y_prob))

print("\nClassification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)


Evaluation Metrics:
Accuracy: 0.6005586592178771
Precision: 0.23513513513513515
Recall: 0.5386996904024768
F1: 0.3273753527751646
ROC-AUC: 0.602275868909613
Log Loss: 0.6848878170792528

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.61      0.72      1467
           1       0.24      0.54      0.33       323

    accuracy                           0.60      1790
   macro avg       0.55      0.58      0.52      1790
weighted avg       0.75      0.60      0.65      1790


Confusion Matrix:
 [[901 566]
 [149 174]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Logistic Regression Balanced": LogisticRegression(class_weight='balanced', max_iter=1000),
   # "K-Nearest Neighbors": KNeighborsClassifier(),
   # "Decision Tree": DecisionTreeClassifier(),
   # "Random Forest": RandomForestClassifier(),
   # "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
   # "CatBoost": CatBoostClassifier(verbose=False),
   # "AdaBoost": AdaBoostClassifier()
}

# Evaluation function
def evaluate_classification(y_true, y_pred, y_prob=None):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    return acc, prec, rec, f1, auc

model_list = []
test_acc_list = []

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Probabilities for AUC (if supported)
    try:
        y_train_prob = model.predict_proba(X_train)[:, 1]
        y_test_prob = model.predict_proba(X_test)[:, 1]
    except:
        y_train_prob = y_test_prob = None

    # Evaluate
    train_acc, train_prec, train_rec, train_f1, train_auc = evaluate_classification(y_train, y_train_pred, y_train_prob)
    test_acc, test_prec, test_rec, test_f1, test_auc = evaluate_classification(y_test, y_test_pred, y_test_prob)

    # Store model results
    model_list.append(name)
    test_acc_list.append(test_acc)
    
    # Print results
    print(f"ðŸ“Š {name}")
    print("Training Performance:")
    print(f"  - Accuracy: {train_acc:.4f}")
    print(f"  - Precision: {train_prec:.4f}")
    print(f"  - Recall: {train_rec:.4f}")
    print(f"  - F1 Score: {train_f1:.4f}")
    if train_auc: print(f"  - ROC-AUC: {train_auc:.4f}")
    print("----------------------------------")
    print("Test Performance:")
    print(f"  - Accuracy: {test_acc:.4f}")
    print(f"  - Precision: {test_prec:.4f}")
    print(f"  - Recall: {test_rec:.4f}")
    print(f"  - F1 Score: {test_f1:.4f}")
    if test_auc: print(f"  - ROC-AUC: {test_auc:.4f}")
    print("="*40 + "\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ðŸ“Š Logistic Regression
Training Performance:
  - Accuracy: 0.8192
  - Precision: 0.3333
  - Recall: 0.0008
  - F1 Score: 0.0015
  - ROC-AUC: 0.6201
----------------------------------
Test Performance:
  - Accuracy: 0.8179
  - Precision: 0.0000
  - Recall: 0.0000
  - F1 Score: 0.0000
  - ROC-AUC: 0.6181

ðŸ“Š Logistic Regression Balanced
Training Performance:
  - Accuracy: 0.5536
  - Precision: 0.2442
  - Recall: 0.7022
  - F1 Score: 0.3624
  - ROC-AUC: 0.6368
----------------------------------
Test Performance:
  - Accuracy: 0.5453
  - Precision: 0.2323
  - Recall: 0.6594
  - F1 Score: 0.3435
  - ROC-AUC: 0.6157



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
