In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif


In [2]:

data = pd.read_csv('../staging/data_set/master_dataset.csv')

In [3]:
data['arr_change'] = data['future_arr'] - data['current_arr']
data['perc_change'] = (data['future_arr'] - data['current_arr']) / data['current_arr']
data['churn_flag'] = (data['future_arr'] < data['current_arr'] * 0.8).astype(int)

In [4]:
data['churn_flag'].value_counts()

churn_flag
0    12769
1      648
Name: count, dtype: int64

In [5]:
data.columns

Index(['id', 'score_date', 'segment_smb', 'segment_non_smb',
       'segment_commercial', 'segment_enterprise', 'segment_midmarket',
       'region_emea', 'region_apac', 'region_latam', 'region_amer',
       'customer_age_quarters', 'sales_assisted', 'legacy_flag',
       'product_counts', 'billing_dunning_not_ok', 'crm_industry_current',
       'crm_employee_range', 'crm_success_owner_change', 'current_arr',
       'future_arr', 'arr_change', 'fx_impact', 'seat_change_arr',
       'product_change_arr', 'discount_arr_usd', 'team_plus', 'pro_plus',
       'ent_plus', 'discount_arr_usd_percentage', 'product_counts_percentage',
       'total_pool_max_agents', 'max_seats', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'max_seats_M_before',
       'agent_utilization_increase', 'agent_utilization_decrease',
       'seat_utilization', 'seat_utilization_M_before',
       'seat_utilization_increase', 'seat_utilization_decrease', 'csat_score',
       'csat_response_rate', 'n

Label Encoding

In [6]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
le_industry = LabelEncoder()
le_employee = LabelEncoder()

# Fit and transform the columns
data['crm_industry_current'] = le_industry.fit_transform(data['crm_industry_current'])
data['crm_employee_range'] = le_employee.fit_transform(data['crm_employee_range'])

# Print the mapping of values to integers for each column
print("Mapping for 'crm_industry_current':")
print(dict(zip(le_industry.classes_, range(len(le_industry.classes_)))))

print("\nMapping for 'crm_employee_range':")
print(dict(zip(le_employee.classes_, range(len(le_employee.classes_)))))

Mapping for 'crm_industry_current':
{'0': 0, 'Consumer Services': 1, 'Corporate Services': 2, 'Education': 3, 'Energy & Utilities': 4, 'Financial Services': 5, 'Government': 6, 'Healthcare': 7, 'Manufacturing & Resources': 8, 'Media & Entertainment': 9, 'Non-Profits & Associations': 10, 'Retail, Wholesale & Distribution': 11, 'Technology': 12, 'Telecommunications': 13, 'Transportation': 14, 'Travel & Hospitality': 15}

Mapping for 'crm_employee_range':
{'01-Sep': 0, '100 - 249': 1, '1000 - 4999': 2, '250 - 499': 3, '50 - 99': 4, '500 - 999': 5, '5000+': 6, 'Oct-49': 7}


Feature Grouping

In [None]:
data['seat_utilization_change'] = data['seat_utilization'] - data['seat_utilization_M_before']
data['max_seats_change'] = data['max_seats'] - data['max_seats_M_before']
data['arr_changes'] = data['future_arr'] - data['current_arr']
data = data.drop(columns=['seat_utilization','seat_utilization_M_before','max_seats','max_seats_M_before','future_arr','current_arr'])

Feature Bucketing

In [8]:
# Combine the columns into a single column
def combine_columns(row):
    if row['region_emea'] != 0:
        return 1
    elif row['region_apac'] != 0:
        return 2
    elif row['region_latam'] != 0:
        return 3
    elif row['region_amer'] != 0:
        return 4
    else:
        return 0  # Default value if all are 0

# Apply the function to create the 'regions' column
data['regions'] = data.apply(combine_columns, axis=1)

# Drop the original region columns if no longer needed
data = data.drop(columns=['region_emea', 'region_apac', 'region_latam', 'region_amer'])

# Display the updated DataFrame
print(data)

                  id  score_date  segment_smb  segment_non_smb  \
0      JMAN_10025258  2023-01-06          0.0              1.0   
1      JMAN_10025258  2023-01-13          0.0              1.0   
2      JMAN_10025258  2023-01-20          0.0              1.0   
3      JMAN_10025258  2023-01-27          0.0              1.0   
4      JMAN_10025258  2023-02-03          0.0              1.0   
...              ...         ...          ...              ...   
13412   JMAN_9999715  2023-03-03          0.0              1.0   
13413   JMAN_9999715  2023-03-10          0.0              1.0   
13414   JMAN_9999715  2023-03-17          0.0              1.0   
13415   JMAN_9999715  2023-03-24          0.0              1.0   
13416   JMAN_9999715  2023-03-31          0.0              1.0   

       segment_commercial  segment_enterprise  segment_midmarket  \
0                     0.0                 0.0                0.0   
1                     0.0                 0.0                0.0   
2  

In [None]:
# Remove the 'id' column from X
X1 = data.drop(columns=['id' , 'score_date' , 'perc_change','arr_change','total_pool_max_agents','crm_success_owner_change','legacy_flag','crm_success_owner_change'], errors='ignore')  # Use 'errors="ignore"' to avoid errors if 'id' is not present



In [13]:
X = X1.iloc[:, :-1]  
y = X1.iloc[:, -1] 
# Check columns in X
print("Columns in X:")
print(X.columns)

# Check the name of the target column (y)
print("\nTarget column (y):")
print(y.name)

Columns in X:
Index(['segment_smb', 'segment_non_smb', 'segment_commercial',
       'segment_enterprise', 'segment_midmarket', 'region_emea', 'region_apac',
       'region_latam', 'region_amer', 'customer_age_quarters',
       'sales_assisted', 'legacy_flag', 'product_counts',
       'billing_dunning_not_ok', 'crm_industry_current', 'crm_employee_range',
       'crm_success_owner_change', 'current_arr', 'future_arr', 'arr_change',
       'fx_impact', 'seat_change_arr', 'product_change_arr',
       'discount_arr_usd', 'team_plus', 'pro_plus', 'ent_plus',
       'discount_arr_usd_percentage', 'product_counts_percentage',
       'total_pool_max_agents', 'max_seats', 'agent_utilization',
       'is_provisioned_any_channel_M_before', 'max_seats_M_before',
       'agent_utilization_increase', 'agent_utilization_decrease',
       'seat_utilization', 'seat_utilization_M_before',
       'seat_utilization_increase', 'seat_utilization_decrease', 'csat_score',
       'csat_response_rate', 'nps_sco

In [14]:
import pandas as pd

# Perform feature selection
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Get the selected feature indices
selected_features = selector.get_support(indices=True)

# Get the names of the selected features (if X is a DataFrame)
if isinstance(X, pd.DataFrame):
    feature_names = X.columns[selected_features]
    print("Selected Features:")
    print(feature_names)
else:
    print("Selected feature indices:")
    print(selected_features)

# Output the transformed feature matrix
print("Transformed Feature Matrix:")
print(X_new)

Selected Features:
Index(['sales_assisted', 'current_arr', 'arr_change', 'seat_change_arr',
       'product_change_arr', 'pro_plus', 'max_seats',
       'is_provisioned_any_channel_M_before', 'max_seats_M_before',
       'seat_utilization_decrease'],
      dtype='object')
Transformed Feature Matrix:
[[1.00000000e+00 2.97600000e+03 0.00000000e+00 ... 2.00000000e+00
  2.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.07247200e+04 7.50120000e+02 ... 9.00000000e+00
  9.00000000e+00 0.00000000e+00]
 [0.00000000e+00 6.05760000e+02 2.12040000e+02 ... 1.00000000e+00
  1.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 2.28000000e+02 0.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 2.28000000e+02 0.00000000e+00 ... 8.78967286e+00
  1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.08000000e+03 0.00000000e+00 ... 8.00000000e+00
  1.00000000e+01 0.00000000e+00]]


  f = msb / msw


In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Assuming X is a DataFrame and y is the target variable
estimator = RandomForestClassifier()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)

# Get the ranking of features
ranking = selector.ranking_

# If X is a DataFrame, get the feature names
if isinstance(X, pd.DataFrame):
    feature_names = X.columns
    selected_features = feature_names[selector.support_]  # Features selected by RFE
    print("Selected Features:")
    print(selected_features)

    # Display feature rankings
    feature_ranking = pd.DataFrame({'Feature': feature_names, 'Ranking': ranking})
    print("\nFeature Rankings:")
    print(feature_ranking.sort_values(by='Ranking'))
else:
    print("Feature rankings (indices):")
    print(ranking)






Selected Features:
Index(['current_arr', 'future_arr', 'arr_change', 'seat_change_arr',
       'product_change_arr'],
      dtype='object')

Feature Rankings:
                                Feature  Ranking
21                      seat_change_arr        1
19                           arr_change        1
18                           future_arr        1
22                   product_change_arr        1
17                          current_arr        1
50         months_since_high_sev_outage        2
33                   max_seats_M_before        3
30                            max_seats        4
32  is_provisioned_any_channel_M_before        5
36                     seat_utilization        6
20                            fx_impact        7
52                max_tickets_per_agent        8
9                 customer_age_quarters        9
14                 crm_industry_current       10
47                  num_low_sev_outages       11
37            seat_utilization_M_before       12
15      

In [19]:
from sklearn.feature_selection import SequentialFeatureSelector
import pandas as pd

# Assuming `estimator` is already defined (e.g., RandomForestClassifier)
sfs = SequentialFeatureSelector(estimator, n_features_to_select=25, direction='forward')
sfs.fit(X, y)

# Get the selected feature mask
selected_features_mask = sfs.get_support()

# If X is a DataFrame, get the feature names
if isinstance(X, pd.DataFrame):
    feature_names = X.columns
    selected_features = feature_names[selected_features_mask]  # Selected features
    non_selected_features = feature_names[~selected_features_mask]  # Non-selected features

    print("Selected Features:")
    print(selected_features)

    print("\nNon-Selected Features:")
    print(non_selected_features)
else:
    print("Selected feature indices:")
    print([i for i, selected in enumerate(selected_features_mask) if selected])

Selected Features:
Index(['segment_smb', 'segment_non_smb', 'segment_commercial',
       'segment_enterprise', 'segment_midmarket', 'region_emea', 'region_apac',
       'region_latam', 'region_amer', 'customer_age_quarters',
       'sales_assisted', 'legacy_flag', 'product_counts',
       'billing_dunning_not_ok', 'crm_industry_current', 'crm_employee_range',
       'crm_success_owner_change', 'current_arr', 'future_arr', 'arr_change',
       'fx_impact', 'seat_change_arr', 'product_change_arr',
       'discount_arr_usd', 'max_seats_M_before'],
      dtype='object')

Non-Selected Features:
Index(['team_plus', 'pro_plus', 'ent_plus', 'discount_arr_usd_percentage',
       'product_counts_percentage', 'total_pool_max_agents', 'max_seats',
       'agent_utilization', 'is_provisioned_any_channel_M_before',
       'agent_utilization_increase', 'agent_utilization_decrease',
       'seat_utilization', 'seat_utilization_M_before',
       'seat_utilization_increase', 'seat_utilization_decrease',