In [5]:
# Train and compare Logistic Regression vs. Decision Trees

In [11]:
!pip3 install joblib                           
from pkg_resources import Requirement
Requirement.parse("joblib")



Requirement.parse('joblib')

In [15]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.17.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
Using cached scipy-1.17.0-cp312-cp312-win_amd64.whl (36.3 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn

   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   -------------

In [19]:
#Imports and Load Data
from pathlib import Path

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Read the cleaned sales-leads dataset from disk.
df = pd.read_csv('../data/cleaned_sales_leads_dataset.csv')

# Target is the 'converted' column. Features are every remaining column
# except 'lead_id', which is an identifier rather than a predictive feature.
y = df['converted']
X = df.drop(['lead_id', 'converted'], axis='columns')

# Hold out 20% of the rows for testing (80% for training); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Setup Preprocessing Pipeline

# Columns treated as numeric features (standardised below).
numerical_cols = [
    'annual_revenue_lkr',
    'engagement_score',
    'website_visits',
    'email_opens',
    'days_since_first_contact',
    'budget_indicated_lkr',
]

# Columns treated as categorical features (one-hot encoded below).
categorical_cols = [
    'company_size',
    'industry',
    'location',
    'demo_requested',
    'contact_level',
    'competitor_using',
    'referral_source',
]

# One (name, transformer, columns) entry per column group.
# handle_unknown='ignore' lets the encoder accept categories at prediction
# time that were not present when it was fitted.
scaling_step = ('num', StandardScaler(), numerical_cols)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

In [21]:
# Build Both Models

# Each pipeline receives its own unfitted copy of the preprocessor via clone().
# Pipeline does not copy its steps, so passing the same ColumnTransformer
# object to both pipelines would make them share one transformer: fitting the
# second pipeline would refit the object the first pipeline (and any saved
# model) relies on.

# 1. Logistic Regression Model
log_reg_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# 2. Decision Tree Model (depth-limited, fixed seed for reproducibility)
dec_tree_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', DecisionTreeClassifier(max_depth=5, random_state=42))
])

# Train both on the same training split
log_reg_pipeline.fit(X_train, y_train)
dec_tree_pipeline.fit(X_train, y_train)

print("Both models have been trained successfully.")

Both models have been trained successfully.


In [22]:
# Comparison and Evaluation

# Predict on the held-out test split with both fitted pipelines
log_preds = log_reg_pipeline.predict(X_test)
tree_preds = dec_tree_pipeline.predict(X_test)

# Overall accuracy for each model
log_acc = accuracy_score(y_test, log_preds)
tree_acc = accuracy_score(y_test, tree_preds)

print(f"Logistic Regression Accuracy: {log_acc:.2%}")
print(f"Decision Tree Accuracy: {tree_acc:.2%}")

# Per-class precision/recall/F1 for BOTH models, so they can be compared on
# more than overall accuracy.
print("\nDetailed Decision Tree Report:")
print(classification_report(y_test, tree_preds))

print("\nDetailed Logistic Regression Report:")
print(classification_report(y_test, log_preds))

# Accuracy of always predicting the most frequent class in the test split.
# On an imbalanced target a model can reach high accuracy by mostly predicting
# the majority class, so both accuracies above should be read against this.
baseline_acc = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {baseline_acc:.2%}")

Logistic Regression Accuracy: 80.50%
Decision Tree Accuracy: 83.00%

Detailed Decision Tree Report:
              precision    recall  f1-score   support

           0       0.50      0.12      0.19        34
           1       0.84      0.98      0.91       166

    accuracy                           0.83       200
   macro avg       0.67      0.55      0.55       200
weighted avg       0.79      0.83      0.78       200



In [26]:
# Selection and Saving the Best Model

# Selection logic: the model with the strictly higher test accuracy wins;
# on a tie the Decision Tree is kept.
if log_acc > tree_acc:
    best_model = log_reg_pipeline
    model_name = "Logistic Regression"
else:
    best_model = dec_tree_pipeline
    model_name = "Decision Tree"

print(f"Winning Model: {model_name}")

# Save the best model for the scoring system.
model_path = '../models/best_lead_scoring_model.pkl'
# Create the output directory first so the dump does not fail when
# '../models' does not exist yet (e.g. on a fresh checkout).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"The {model_name} model has been saved as 'best_lead_scoring_model.pkl'")

Winning Model: Decision Tree
The Decision Tree model has been saved as 'best_lead_scoring_model.pkl'
