In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import numpy as np
import joblib

# ✅ Load the synthetic data
file_path = 'lead_scoring_synthetic.csv'  # Adjust path if needed
data = pd.read_csv(file_path)

# ✅ Drop unnecessary columns
data_cleaned = data.copy()

# ✅ Fill missing values using .loc (Future-proof for pandas 3.0)
for col in data_cleaned.columns:
    if data_cleaned[col].dtype == 'object':
        data_cleaned.loc[:, col] = data_cleaned[col].fillna('Unknown')
    else:
        data_cleaned.loc[:, col] = data_cleaned[col].fillna(data_cleaned[col].mean())

# ✅ Encode categorical columns using Label Encoding
label_encoder = LabelEncoder()
for col in data_cleaned.columns:
    if data_cleaned[col].dtype == 'object':
        data_cleaned.loc[:, col] = label_encoder.fit_transform(data_cleaned[col])

# ✅ Split data into features and target
X = data_cleaned.drop(['Converted', 'User ID'], axis=1)
y = data_cleaned['Converted']
user_ids = data_cleaned['User ID']  # Save User ID for reference

# ✅ Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ✅ Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test, user_ids_train, user_ids_test = train_test_split(
    X_scaled, y, user_ids, test_size=0.2, random_state=42
)

# ✅ Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ✅ Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# ✅ Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print(f"\n✅ Accuracy: {accuracy * 100:.2f}%")
print(f"✅ ROC-AUC Score: {roc_auc:.2f}")
print("\n✅ Classification Report:\n", report)

# ✅ Extract Feature Importance
feature_importance = model.feature_importances_
features = X.columns

importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# ✅ Normalize importance to get feature weights
importance_df['Weight'] = (importance_df['Importance'] / importance_df['Importance'].sum()) * 100

# ✅ Define the Scoring Function
def calculate_score(probability, feature_values):
    base_score = probability * 100
    feature_contribution = np.dot(feature_values, importance_df['Weight'].values)
    final_score = min(100, max(0, base_score + feature_contribution))
    return final_score

# ✅ Generate Scores for Test Set
test_probabilities = model.predict_proba(X_test)[:, 1]
test_features = X_test
scores = [calculate_score(prob, features) for prob, features in zip(test_probabilities, test_features)]

# ✅ Attach Scores and User ID to Results
results = pd.DataFrame(X_test, columns=features)
results['User ID'] = user_ids_test.values
results['Probability'] = test_probabilities
results['Lead_Score'] = scores

# ✅ Define Lead Quality
def lead_quality(score):
    if score >= 70:
        return 'High'
    elif score >= 40:
        return 'Medium'
    else:
        return 'Low'

results['Lead_Quality'] = results['Lead_Score'].apply(lead_quality)

# ✅ Print Results in Terminal (ALL RECORDS)
print("\n🔍 LEAD SCORING RESULTS:")
print(results[['User ID', 'Lead_Score', 'Lead_Quality']].to_string(index=False))

# ✅ Print Top 5 Important Features
print("\n📌 TOP 5 IMPORTANT FEATURES:")
print(importance_df.head().to_string(index=False))

# ✅ Save the Model and Encoders
joblib.dump(model, 'lead_scoring_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# ✅ Save Full Results to CSV
results.to_csv('lead_scoring_results.csv', index=False)

print("\n✅ Lead scoring results saved to 'lead_scoring_results.csv'")



✅ Accuracy: 100.00%
✅ ROC-AUC Score: 1.00

✅ Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       1.00      1.00      1.00        33

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100


🔍 LEAD SCORING RESULTS:
User ID  Lead_Score Lead_Quality
    127    0.000000          Low
     59   17.659649          Low
     91   59.931364       Medium
    430   73.927262         High
    446    0.000000          Low
     24   53.934530       Medium
    121    0.000000          Low
    169    3.923319          Low
    124    0.000000          Low
    329   50.415906       Medium
     34    0.000000          Low
    429    0.000000          Low
     88   15.516322          Low
    173  100.000000         High
    286  100.000000         High
      1  100.000000         High
    338   91.9755