In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif,SequentialFeatureSelector
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter



In [2]:

data = pd.read_csv('../staging/data_set/master_dataset.csv')

In [3]:
data.shape

(13417, 58)

In [4]:
data['arr_change'] = data['future_arr'] - data['current_arr']
data['perc_change'] = (data['future_arr'] - data['current_arr']) / data['current_arr']
data['churn_flag'] = (data['future_arr'] < data['current_arr'] * 0.8).astype(int)

In [5]:
data['churn_flag'].value_counts()

churn_flag
0    12769
1      648
Name: count, dtype: int64

In [6]:
data.columns

Index(['id', 'score_date', 'segment_smb', 'segment_non_smb',
       'segment_commercial', 'segment_enterprise', 'segment_midmarket',
       'region_emea', 'region_apac', 'region_latam', 'region_amer',
       'customer_age_quarters', 'sales_assisted', 'legacy_flag',
       'product_counts', 'billing_dunning_not_ok', 'crm_industry_current',
       'crm_employee_range', 'crm_success_owner_change', 'current_arr',
       'future_arr', 'arr_change', 'fx_impact', 'seat_change_arr',
       'product_change_arr', 'discount_arr_usd', 'team_plus', 'pro_plus',
       'ent_plus', 'discount_arr_usd_percentage', 'product_counts_percentage',
       'total_pool_max_agents', 'max_seats', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'max_seats_M_before',
       'agent_utilization_increase', 'agent_utilization_decrease',
       'seat_utilization', 'seat_utilization_M_before',
       'seat_utilization_increase', 'seat_utilization_decrease', 'csat_score',
       'csat_response_rate', 'n

Label Encoding

In [7]:
data = pd.get_dummies(data, columns=['crm_industry_current'])
data.columns = data.columns.str.replace(' ', '_')
data.drop(columns=['crm_industry_current'], errors='ignore', inplace=True)



In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['crm_employee_range'] = le.fit_transform(data['crm_employee_range'])


In [9]:
data.columns

Index(['id', 'score_date', 'segment_smb', 'segment_non_smb',
       'segment_commercial', 'segment_enterprise', 'segment_midmarket',
       'region_emea', 'region_apac', 'region_latam', 'region_amer',
       'customer_age_quarters', 'sales_assisted', 'legacy_flag',
       'product_counts', 'billing_dunning_not_ok', 'crm_employee_range',
       'crm_success_owner_change', 'current_arr', 'future_arr', 'arr_change',
       'fx_impact', 'seat_change_arr', 'product_change_arr',
       'discount_arr_usd', 'team_plus', 'pro_plus', 'ent_plus',
       'discount_arr_usd_percentage', 'product_counts_percentage',
       'total_pool_max_agents', 'max_seats', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'max_seats_M_before',
       'agent_utilization_increase', 'agent_utilization_decrease',
       'seat_utilization', 'seat_utilization_M_before',
       'seat_utilization_increase', 'seat_utilization_decrease', 'csat_score',
       'csat_response_rate', 'nps_score', 'nps_score_al

In [10]:
data.shape

(13417, 73)

Feature Grouping

In [11]:
data['seat_utilization_change'] = data['seat_utilization'] - data['seat_utilization_M_before']
data['max_seats_change'] = data['max_seats'] - data['max_seats_M_before']
data = data.drop(columns=['seat_utilization','seat_utilization_M_before','max_seats','max_seats_M_before'])

Feature Bucketing

In [12]:
X1 = data.drop(columns=[
    'id', 'score_date', 'perc_change', 
    'total_pool_max_agents', 'crm_success_owner_change', 
    'legacy_flag','year_week','crm_industry_current_0',
    'discount_arr_usd_percentage', 'product_counts_percentage',
    'discount_arr_usd'
    
], errors='ignore')

In [13]:
X1.shape

(13417, 60)

In [14]:
X1.columns

Index(['segment_smb', 'segment_non_smb', 'segment_commercial',
       'segment_enterprise', 'segment_midmarket', 'region_emea', 'region_apac',
       'region_latam', 'region_amer', 'customer_age_quarters',
       'sales_assisted', 'product_counts', 'billing_dunning_not_ok',
       'crm_employee_range', 'current_arr', 'future_arr', 'arr_change',
       'fx_impact', 'seat_change_arr', 'product_change_arr', 'team_plus',
       'pro_plus', 'ent_plus', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'agent_utilization_increase',
       'agent_utilization_decrease', 'seat_utilization_increase',
       'seat_utilization_decrease', 'csat_score', 'csat_response_rate',
       'nps_score', 'nps_score_all_time', 'avg_nps_rating',
       'avg_nps_rating_all_time', 'outreach_call_90dayflag',
       'num_low_sev_outages', 'num_high_sev_outages', 'has_high_sev_outage',
       'months_since_high_sev_outage', 'num_tickets_deflected',
       'max_tickets_per_agent', 'churn_flag',
     

In [15]:

# Define the target variable
y = X1['churn_flag']
# Define the feature variables (all columns except 'product_churn')
X = X1.drop(columns=['churn_flag']) 


# Check columns in X
print("Columns in X:")
print(X.columns)

# Check the name of the target column (y)
print("\nTarget column (y):")
print(y.name)

Columns in X:
Index(['segment_smb', 'segment_non_smb', 'segment_commercial',
       'segment_enterprise', 'segment_midmarket', 'region_emea', 'region_apac',
       'region_latam', 'region_amer', 'customer_age_quarters',
       'sales_assisted', 'product_counts', 'billing_dunning_not_ok',
       'crm_employee_range', 'current_arr', 'future_arr', 'arr_change',
       'fx_impact', 'seat_change_arr', 'product_change_arr', 'team_plus',
       'pro_plus', 'ent_plus', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'agent_utilization_increase',
       'agent_utilization_decrease', 'seat_utilization_increase',
       'seat_utilization_decrease', 'csat_score', 'csat_response_rate',
       'nps_score', 'nps_score_all_time', 'avg_nps_rating',
       'avg_nps_rating_all_time', 'outreach_call_90dayflag',
       'num_low_sev_outages', 'num_high_sev_outages', 'has_high_sev_outage',
       'months_since_high_sev_outage', 'num_tickets_deflected',
       'max_tickets_per_agent', 'crm_

In [16]:
import pandas as pd

# Perform feature selection
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Get the selected feature indices
selected_features = selector.get_support(indices=True)

# Get the names of the selected features (if X is a DataFrame)
if isinstance(X, pd.DataFrame):
    feature_names = X.columns[selected_features]
    print("Selected Features:")
    print(feature_names)
else:
    print("Selected feature indices:")
    print(selected_features)

# Output the transformed feature matrix
print("Transformed Feature Matrix:")
print(X_new)

Selected Features:
Index(['sales_assisted', 'current_arr', 'arr_change', 'seat_change_arr',
       'product_change_arr', 'pro_plus', 'ent_plus',
       'is_provisioned_any_channel_M_before', 'seat_utilization_decrease',
       'csat_score'],
      dtype='object')
Transformed Feature Matrix:
[[0.000e+00 6.000e+01 0.000e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 6.000e+01 0.000e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 6.000e+01 0.000e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 ...
 [0.000e+00 1.062e+04 0.000e+00 ... 1.300e+01 0.000e+00 0.000e+00]
 [0.000e+00 1.062e+04 0.000e+00 ... 1.300e+01 0.000e+00 0.000e+00]
 [0.000e+00 1.062e+04 0.000e+00 ... 1.300e+01 0.000e+00 0.000e+00]]


In [23]:
# import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Assuming X is a DataFrame and y is the target variable
estimator = RandomForestClassifier()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)

# Get the ranking of features
ranking = selector.ranking_

# If X is a DataFrame, get the feature names
if isinstance(X, pd.DataFrame):
    feature_names = X.columns
    selected_features = feature_names[selector.support_]  # Features selected by RFE
    print("Selected Features:")
    print(selected_features)

    # Display feature rankings
    feature_ranking = pd.DataFrame({'Feature': feature_names, 'Ranking': ranking})
    print("\nFeature Rankings:")
    print(feature_ranking.sort_values(by='Ranking'))
else:
    print("Feature rankings (indices):")
    print(ranking)






Selected Features:
Index(['current_arr', 'future_arr', 'arr_change', 'seat_change_arr',
       'product_change_arr'],
      dtype='object')

Feature Rankings:
                                              Feature  Ranking
15                                         future_arr        1
14                                        current_arr        1
18                                    seat_change_arr        1
16                                         arr_change        1
19                                 product_change_arr        1
39                       months_since_high_sev_outage        2
24                is_provisioned_any_channel_M_before        3
36                                num_low_sev_outages        4
41                              max_tickets_per_agent        5
17                                          fx_impact        6
9                               customer_age_quarters        7
30                                 csat_response_rate        8
11                    

In [None]:


# Assuming `estimator` is already defined (e.g., RandomForestClassifier)
sfs = SequentialFeatureSelector(estimator, n_features_to_select=25, direction='forward')
sfs.fit(X, y)

# Get the selected feature mask
selected_features_mask = sfs.get_support()

# If X is a DataFrame, get the feature names
if isinstance(X, pd.DataFrame):
    feature_names = X.columns.to_numpy()  # Ensure it's a NumPy array
    selected_features = feature_names[selected_features_mask]  # Selected features
    non_selected_features = feature_names[~selected_features_mask]  # Non-selected features

    print("Selected Features:")
    print(list(selected_features))

    print("\nNon-Selected Features:")
    print(list(non_selected_features))
else:
    print("Selected feature indices:")
    print([i for i, selected in enumerate(selected_features_mask) if selected])


In [None]:
# Apply SMOTE to balance the dataset
from imblearn.over_sampling import SMOTE  # Import SMOTE
from collections import Counter



smote = SMOTE(sampling_strategy='auto', random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Check new class distribution
print("Balanced class distribution:", Counter(y_balanced))

Balanced class distribution: Counter({0: 12769, 1: 12769})


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced)
scaler = StandardScaler()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # ✅ Fixed: Apply StandardScaler correctly
X_test_scaled = scaler.transform(X_test) 

In [21]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predictions for Random Forest
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate Random Forest
print("\n🔹 Random Forest Model Evaluation 🔹")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


🔹 Random Forest Model Evaluation 🔹
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2554
           1       1.00      1.00      1.00      2554

    accuracy                           1.00      5108
   macro avg       1.00      1.00      1.00      5108
weighted avg       1.00      1.00      1.00      5108



In [22]:
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Predictions for Logistic Regression
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluate Logistic Regression
print("\n🔹 Logistic Regression Model Evaluation 🔹")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))


🔹 Logistic Regression Model Evaluation 🔹
Accuracy: 0.9363743148003132
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      2554
           1       0.95      0.92      0.94      2554

    accuracy                           0.94      5108
   macro avg       0.94      0.94      0.94      5108
weighted avg       0.94      0.94      0.94      5108

