In [34]:
# Notebook title, framed above and below by a 70-character rule.
print("\n".join([
    "=" * 70,
    "Model Building with Proper Preprocessing and Feature Engineering",
    "=" * 70,
]))

Model Building with Proper Preprocessing and Feature Engineering


In [35]:
# Import Libraries
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')


In [36]:
# Section header for the data-loading step.
print("=" * 70)
print("\n Loading Data")
print("-" * 70)

# Read the cleaned lead table; the path is relative to the notebook's directory.
df = pd.read_csv('../data/cleaned_sales_leads_dataset.csv')

# Summarise the binary target once and reuse the numbers in the report below.
n_leads = len(df)
n_converted = df['converted'].sum()
conversion_rate = df['converted'].mean()

print(f" Loaded {n_leads} leads")
print(f" Converted: {n_converted} ({conversion_rate*100:.2f}%)")
print(f" Not Converted: {n_leads - n_converted} ({(1-conversion_rate)*100:.2f}%)")


 Loading Data
----------------------------------------------------------------------
 Loaded 1000 leads
 Converted: 818 (81.80%)
 Not Converted: 182 (18.20%)


In [37]:
# Preview the first five rows of the loaded frame as the cell's rich output.
df.head(n=5)

Unnamed: 0,lead_id,company_size,industry,annual_revenue_lkr,location,engagement_score,website_visits,email_opens,demo_requested,days_since_first_contact,contact_level,budget_indicated_lkr,competitor_using,referral_source,converted
0,LEAD_0001,Medium,Agriculture,116701433,Kandy,87,25,16,No,15,Manager,4724984,Yes,Website,1
1,LEAD_0002,Medium,Finance,142242154,Kurunegala,77,45,4,No,105,Employee,5301443,No,Cold Call,1
2,LEAD_0003,Medium,IT/Software,134980432,Kurunegala,100,27,7,Yes,160,Manager,4643608,Yes,Referral,1
3,LEAD_0004,Small,Tourism,7239690,Colombo,100,33,11,No,166,C-Level,196625,No,Referral,1
4,LEAD_0005,Medium,IT/Software,128031305,Jaffna,48,17,3,No,116,Employee,4773010,No,Referral,0


In [38]:
# Display Basic Information
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() appended a stray "None" line
# after the column summary.
print("\n Dataset Overview:")
df.info()


 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   lead_id                   1000 non-null   object
 1   company_size              1000 non-null   object
 2   industry                  1000 non-null   object
 3   annual_revenue_lkr        1000 non-null   int64 
 4   location                  1000 non-null   object
 5   engagement_score          1000 non-null   int64 
 6   website_visits            1000 non-null   int64 
 7   email_opens               1000 non-null   int64 
 8   demo_requested            1000 non-null   object
 9   days_since_first_contact  1000 non-null   int64 
 10  contact_level             1000 non-null   object
 11  budget_indicated_lkr      1000 non-null   int64 
 12  competitor_using          1000 non-null   object
 13  referral_source           1000 non-null   object
 14  converted                 1000 non-null   int64 

In [39]:
# Show the whole frame as this cell's output; the rendered table below elides
# the middle rows with a "..." row, so only the first and last rows appear.
df

Unnamed: 0,lead_id,company_size,industry,annual_revenue_lkr,location,engagement_score,website_visits,email_opens,demo_requested,days_since_first_contact,contact_level,budget_indicated_lkr,competitor_using,referral_source,converted
0,LEAD_0001,Medium,Agriculture,116701433,Kandy,87,25,16,No,15,Manager,4724984,Yes,Website,1
1,LEAD_0002,Medium,Finance,142242154,Kurunegala,77,45,4,No,105,Employee,5301443,No,Cold Call,1
2,LEAD_0003,Medium,IT/Software,134980432,Kurunegala,100,27,7,Yes,160,Manager,4643608,Yes,Referral,1
3,LEAD_0004,Small,Tourism,7239690,Colombo,100,33,11,No,166,C-Level,196625,No,Referral,1
4,LEAD_0005,Medium,IT/Software,128031305,Jaffna,48,17,3,No,116,Employee,4773010,No,Referral,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,LEAD_0996,Large,IT/Software,276962526,Jaffna,83,2,10,Yes,120,C-Level,20743136,No,Trade Show,1
996,LEAD_0997,Medium,Education,35128715,Jaffna,93,7,19,Yes,5,Employee,883258,Yes,Trade Show,1
997,LEAD_0998,Large,Education,680531071,Negombo,100,34,7,Yes,174,Manager,30069693,Yes,Cold Call,1
998,LEAD_0999,Medium,Healthcare,93502587,Kandy,94,11,17,Yes,86,Employee,3608823,Yes,Trade Show,1


In [43]:
# Feature Engineering
#
# Derives six extra columns on a copy of `df` (the loaded frame is left
# untouched): an engagement x demo interaction, a budget/revenue ratio, a
# weighted activity total, an ordinal contact score and two 0/1 flags.

print("\n" + "="*70)
print("Feature Engineering")
print("="*70)


def _map_strict(series, lookup):
    """Translate ``series`` through ``lookup`` and refuse to produce NaN.

    ``Series.map`` silently yields NaN for any value that has no key in
    ``lookup`` (including values that are already missing). Those NaNs would
    only surface later as an opaque error inside resampling or model fitting,
    so they are reported here with the offending values instead.

    Args:
        series: Column to translate.
        lookup: Dict from raw value to its numeric replacement.

    Returns:
        The mapped Series, guaranteed to contain no missing values.

    Raises:
        ValueError: If at least one value of ``series`` has no entry in ``lookup``.
    """
    mapped = series.map(lookup)
    unmapped = mapped.isna()
    if unmapped.any():
        offenders = sorted(series[unmapped].astype(str).unique())
        raise ValueError(
            f"Column '{series.name}' contains values with no mapping: {offenders}"
        )
    return mapped


df_processed = df.copy()

# Shared Yes/No -> 1/0 lookup for the two binary text columns.
yes_no_lookup = {'Yes': 1, 'No': 0}

# Mapped once and reused for both the interaction term (1) and the flag (5).
demo_flag_values = _map_strict(df_processed['demo_requested'], yes_no_lookup)

# 1. Engagement Intensity Score: engagement counts only when a demo was requested
df_processed['engagement_demo'] = df_processed['engagement_score'] * demo_flag_values
print("Created: engagement_demo (engagement × demo interaction)")

# 2. Budget to Revenue Ratio (+1 in the denominator guards against a zero revenue)
df_processed['budget_revenue_ratio'] = df_processed['budget_indicated_lkr'] / (df_processed['annual_revenue_lkr'] + 1)
print("Created: budget_revenue_ratio (budget/revenue)")

# 3. Total Engagement Score: email opens are weighted twice as heavily as visits
df_processed['total_engagement'] = df_processed['website_visits'] + (df_processed['email_opens'] * 2)
print("Created: total_engagement (visits + 2×emails)")

# 4. Contact Quality Score
contact_scores = {'Employee': 30, 'Manager': 60, 'C-Level': 100}
df_processed['contact_quality'] = _map_strict(df_processed['contact_level'], contact_scores)
print("Created: contact_quality (Employee:30, Manager:60, C-Level:100)")

# 5. Demo Requested Flag
df_processed['demo_flag'] = demo_flag_values
print("Created: demo_flag (Yes:1, No:0)")

# 6. Competitor Flag
df_processed['competitor_flag'] = _map_strict(df_processed['competitor_using'], yes_no_lookup)
print("Created: competitor_flag (Yes:1, No:0)")

# shape[1] counts every column of the frame (identifier and target included).
print(f"\n Total features after engineering: {df_processed.shape[1]}")



Feature Engineering
Created: engagement_demo (engagement × demo interaction)
Created: budget_revenue_ratio (budget/revenue)
Created: total_engagement (visits + 2×emails)
Created: contact_quality (Employee:30, Manager:60, C-Level:100)
Created: demo_flag (Yes:1, No:0)
Created: competitor_flag (Yes:1, No:0)

 Total features after engineering: 21


In [44]:
# Encoding Categorical Variables

print("\n" + "="*70)
print("Encoding Categorical Variables")
print("-"*70)

# Text columns that each receive an integer-coded "<name>_encoded" twin.
categorical_cols = ['company_size', 'industry', 'location', 'contact_level', 'referral_source']

# Fitted encoders, keyed by the source column name, kept so they can be saved
# alongside the model later in the notebook.
label_encoders = {}

for col in categorical_cols:
    # Learn the category -> integer mapping first, then apply it to the same column.
    le = LabelEncoder().fit(df_processed[col])
    df_processed[f"{col}_encoded"] = le.transform(df_processed[col])
    label_encoders[col] = le
    print(f"Encoded: {col} ({le.classes_.size} categories)")


Encoding Categorical Variables
----------------------------------------------------------------------
Encoded: company_size (3 categories)
Encoded: industry (8 categories)
Encoded: location (8 categories)
Encoded: contact_level (3 categories)
Encoded: referral_source (5 categories)


In [45]:
# Prepare Features for Modeling

print("\n" + "="*70)
print("Preparing Features")
print("-"*70)

# Columns fed to the models. The order matters: the list is saved later and
# defines the column order of the feature matrix.
feature_cols = [
    'company_size_encoded',
    'industry_encoded',
    'location_encoded',
    'engagement_score',
    'website_visits',
    'email_opens',
    'demo_flag',
    'days_since_first_contact',
    'contact_level_encoded',
    'budget_indicated_lkr',
    'annual_revenue_lkr',
    'competitor_flag',
    'referral_source_encoded',
    'engagement_demo',
    'budget_revenue_ratio',
    'total_engagement',
    'contact_quality',
]

# Feature matrix and binary target.
X = df_processed[feature_cols]
y = df_processed['converted']

# Target summary, computed once for the report below.
n_positive = y.sum()
positive_share = y.mean()

print(f" Features selected: {len(feature_cols)} columns")
print(" Target variable: converted")
print("\n Class Distribution:")
print(f"   Converted (1): {n_positive} ({positive_share*100:.2f}%)")
print(f"   Not Converted (0): {len(y) - n_positive} ({(1-positive_share)*100:.2f}%)")


Preparing Features
----------------------------------------------------------------------
 Features selected: 17 columns
 Target variable: converted

 Class Distribution:
   Converted (1): 818 (81.80%)
   Not Converted (0): 182 (18.20%)


In [46]:
# Train-Test Split

print("\n" + "="*70)
print("Creating Train-Test Split")
print("-"*70)

# 70/30 split with a fixed seed; stratifying on y keeps the class ratio the
# same in both partitions.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition and its share of the full data set.
for partition_label, partition in (("Training set", X_train), ("Test set", X_test)):
    print(f" {partition_label}: {len(partition)} samples ({len(partition)/len(X)*100:.1f}%)")


Creating Train-Test Split
----------------------------------------------------------------------
 Training set: 700 samples (70.0%)
 Test set: 300 samples (30.0%)


In [47]:
# Feature Scaling

print("\n" + "="*70)
print("Scaling Features")
print("-"*70)

# The scaler learns its statistics from the training partition only and then
# applies that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(" Features scaled using StandardScaler")
print("   Mean: ~0, Std: ~1")


Scaling Features
----------------------------------------------------------------------
 Features scaled using StandardScaler
   Mean: ~0, Std: ~1


In [48]:
!pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable


In [50]:
# Handle Class Imbalance with SMOTE

print("\n" + "="*70)
print("Balancing Classes with SMOTE")
print("-"*70)


def _show_class_counts(labels):
    """Print how many rows carry class 0 and class 1 in ``labels``."""
    for class_value in (0, 1):
        print(f"   Class {class_value}: {(labels == class_value).sum()}")


print("Before SMOTE:")
_show_class_counts(y_train)

# sampling_strategy=0.5 requests a minority class half the size of the majority
# class after resampling. Only the scaled training partition is resampled.
# NOTE(review): the label-encoded categorical columns are part of this matrix;
# confirm that resampling them as numeric values is intended.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print("\nAfter SMOTE:")
_show_class_counts(y_train_balanced)
print(f"✅ Total samples: {len(X_train_balanced)}")


Balancing Classes with SMOTE
----------------------------------------------------------------------
Before SMOTE:
   Class 0: 127
   Class 1: 573

After SMOTE:
   Class 0: 286
   Class 1: 573
✅ Total samples: 859


In [51]:
# Fit both candidate classifiers on the scaled, SMOTE-balanced training data.

print("\n" + "="*70)
print("Training Models")
print("-"*70)

# 1. Logistic Regression Model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# 2. Decision Tree Model, kept shallow with minimum split/leaf sizes.
# NOTE(review): class_weight='balanced' is applied on top of the resampled
# training set; confirm the double re-weighting is intended.
dt_model = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)

# Train in a fixed order, announcing each model before and after fitting.
for display_name, estimator in (
    ("Logistic Regression", log_reg_model),
    ("Decision Tree", dt_model),
):
    print(f"\n Training {display_name}...")
    estimator.fit(X_train_balanced, y_train_balanced)
    print(f" {display_name} trained")

print("Both models have been trained successfully.")


Training Models
----------------------------------------------------------------------

 Training Logistic Regression...
 Logistic Regression trained

 Training Decision Tree...
 Decision Tree trained
Both models have been trained successfully.


In [52]:
# Comparison and Evaluation

print("\n" + "="*70)
print("Model Evaluation")
print("-"*70)


def _evaluate_on_test(model, heading):
    """Score ``model`` on the scaled test partition and print its report.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        heading: Line printed above the metrics.

    Returns:
        Tuple ``(preds, proba, acc, auc)``: hard predictions, class-1
        probabilities, accuracy and ROC-AUC against ``y_test``.
    """
    preds = model.predict(X_test_scaled)
    proba = model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1
    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, proba)

    print(heading)
    print(f"   Accuracy: {acc:.2%}")
    print(f"   ROC-AUC: {auc:.4f}")
    print("\n" + classification_report(y_test, preds))
    return preds, proba, acc, auc


# Logistic Regression predictions
log_preds, log_proba, log_acc, log_auc = _evaluate_on_test(
    log_reg_model, "\n LOGISTIC REGRESSION RESULTS:"
)

# Decision Tree Predictions
tree_preds, tree_proba, tree_acc, tree_auc = _evaluate_on_test(
    dt_model, "\n DECISION TREE RESULTS:"
)


Model Evaluation
----------------------------------------------------------------------

 LOGISTIC REGRESSION RESULTS:
   Accuracy: 82.00%
   ROC-AUC: 0.8132

              precision    recall  f1-score   support

           0       0.51      0.47      0.49        55
           1       0.88      0.90      0.89       245

    accuracy                           0.82       300
   macro avg       0.70      0.69      0.69       300
weighted avg       0.82      0.82      0.82       300


 DECISION TREE RESULTS:
   Accuracy: 66.67%
   ROC-AUC: 0.7035

              precision    recall  f1-score   support

           0       0.31      0.69      0.43        55
           1       0.91      0.66      0.76       245

    accuracy                           0.67       300
   macro avg       0.61      0.68      0.60       300
weighted avg       0.80      0.67      0.70       300



In [53]:
# Pick the winning model by test-set ROC-AUC (saving happens in a later cell).

print("\n" + "="*70)
print("Selecting Best Model")
print("-"*70)

# The decision tree wins only on a strictly higher ROC-AUC; a tie goes to
# logistic regression.
best_model, model_name, best_acc, best_auc = (
    (dt_model, "Decision Tree", tree_acc, tree_auc)
    if tree_auc > log_auc
    else (log_reg_model, "Logistic Regression", log_acc, log_auc)
)

print(f"\n + Winning Model: {model_name}")
print(f"   Accuracy: {best_acc:.2%}")
print(f"   ROC-AUC: {best_auc:.4f}")



Selecting Best Model
----------------------------------------------------------------------

 + Winning Model: Logistic Regression
   Accuracy: 82.00%
   ROC-AUC: 0.8132


In [54]:
# Save Models and Preprocessing Objects
#
# Persists the selected model together with the fitted scaler, the label
# encoders and the ordered feature list.

# The opening rule was previously a bare expression and was never printed.
print("\n" + "="*70)
print("Saving Models and Preprocessing Objects")
print("-"*70)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# File name -> object to persist; written in this order.
artifacts = {
    'best_lead_scoring_model.pkl': best_model,   # winning classifier
    'scaler.pkl': scaler,                        # fitted StandardScaler
    'label_encoders.pkl': label_encoders,        # dict of fitted LabelEncoders
    'feature_columns.pkl': feature_cols,         # ordered feature-name list
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f"{models_dir}/{artifact_name}")
    print(f" Saved: {artifact_name}")

print("\n" + "="*70)
print(" MODEL BUILDING COMPLETED SUCCESSFULLY!")
print("="*70)

print("\n Files Created:")
print(f"   • models/best_lead_scoring_model.pkl ({model_name})")
print("   • models/scaler.pkl")
print("   • models/label_encoders.pkl")
print("   • models/feature_columns.pkl")

print("\n Next Steps:")
print("   1. Run notebook 03 for lead scoring")
print("   2. Feature engineering will be applied automatically")
print("   3. All preprocessing objects are saved and ready!")

Saving Models and Preprocessing Objects
----------------------------------------------------------------------
 Saved: best_lead_scoring_model.pkl
 Saved: scaler.pkl
 Saved: label_encoders.pkl
 Saved: feature_columns.pkl

 MODEL BUILDING COMPLETED SUCCESSFULLY!

 Files Created:
   • models/best_lead_scoring_model.pkl (Logistic Regression)
   • models/scaler.pkl
   • models/label_encoders.pkl
   • models/feature_columns.pkl

 Next Steps:
   1. Run notebook 03 for lead scoring
   2. Feature engineering will be applied automatically
   3. All preprocessing objects are saved and ready!
