In [55]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [72]:
# Read the processed dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer="expresso_processed.csv")
df

Unnamed: 0.1,Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,MRG,REGULARITY,CHURN
0,0,K > 24 month,3000.0,4.0,3002.0,1001.0,8.0,266.0,1.0,NO,32,1
1,1,K > 24 month,1500.0,3.0,1980.0,660.0,5.0,4341.0,1.0,NO,29,1
2,2,K > 24 month,9850.0,32.0,9639.0,3213.0,38.0,2700.0,4.0,NO,49,1
3,3,I 18-21 month,1000.0,2.0,999.0,333.0,2.0,763.0,8.0,NO,1,1
4,4,K > 24 month,1200.0,3.0,1180.0,393.0,3.0,2955.0,28.0,NO,21,1
...,...,...,...,...,...,...,...,...,...,...,...,...
64371,34995,K > 24 month,2300.0,10.0,2224.0,741.0,25.0,1441.0,3.0,NO,62,0
64372,34996,K > 24 month,10800.0,27.0,10801.0,3600.0,26.0,11460.0,25.0,NO,45,0
64373,34997,K > 24 month,4000.0,4.0,4007.0,1336.0,5.0,11873.0,3.0,NO,50,0
64374,34998,K > 24 month,13600.0,28.0,14600.0,4867.0,37.0,3676.0,291.0,NO,62,0


In [73]:
# Discard the leftover "Unnamed: 0" column if it exists; errors="ignore"
# keeps this a no-op when the column is already gone.
df = df.drop(labels=["Unnamed: 0"], axis=1, errors="ignore")

In [74]:
# Partition the column labels by dtype: object-typed columns versus everything else.
categorical_cols = df.select_dtypes(include="object").columns
numerical_cols = df.select_dtypes(exclude="object").columns

In [75]:
categorical_cols = ["TENURE", "MRG"]

# One LabelEncoder per categorical column. Each encoder is fitted on its
# column, the column is overwritten in place with the integer codes, and the
# fitted encoder is kept in `label_encoders` under the column name.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [76]:
# The categorical columns have already been label-encoded by the preceding
# cell. Fitting brand-new LabelEncoders here (as this cell previously did)
# would learn the integer codes rather than the original category strings and
# overwrite `label_encoders`, so the encoders later written to
# "label_encoders.pkl" could no longer translate raw category values.
# Keep the existing encoders and only verify that they are still usable.
for col in categorical_cols:
    encoder = label_encoders[col]

    # Encoders fitted on the raw text columns hold string/object classes; a
    # numeric dtype means the encoder was fitted on already-encoded values.
    if encoder.classes_.dtype.kind not in "OUS":
        raise ValueError(
            f"Encoder for {col!r} was fitted on already-encoded values; "
            "reload the CSV and encode the raw columns exactly once."
        )

    # Every value in the column must be one of the codes the encoder produced.
    if not df[col].isin(range(len(encoder.classes_))).all():
        raise ValueError(
            f"Column {col!r} contains values outside the code range of its encoder."
        )

In [77]:
# Features are every column except the CHURN label; the label is the target.
X = df.drop("CHURN", axis=1)
y = df.loc[:, "CHURN"]

In [78]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [79]:
# Base decision tree handed to the grid search; seeded for repeatable results.
dt_model = DecisionTreeClassifier(
    random_state=42,
)

In [80]:
# Hyperparameter values to try for the decision tree (4 x 3 x 2 combinations).
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    criterion=["gini", "entropy"],
)

In [81]:
# 5-fold cross-validated search over `param_grid`, scored by accuracy;
# n_jobs=-1 lets scikit-learn run the fits in parallel on all processors.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [82]:
# Pull the winning estimator and its hyperparameters out of the finished search.
best_model, best_params = (
    grid_search.best_estimator_,
    grid_search.best_params_,
)

In [83]:
# Score the selected model on the held-out test rows.
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)

In [84]:
# Report the chosen hyperparameters and the test-set accuracy, one per line.
print(
    f"Best Hyperparameters: {best_params}",
    f"Best Model Accuracy: {accuracy_best:.2f}",
    sep="\n",
)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
Best Model Accuracy: 0.77


In [85]:
# Persist the tuned decision tree to disk so it can be reloaded without
# re-running the grid search. `joblib` is imported in the notebook's import
# cell rather than here, so later cells do not depend on this one for it.
joblib.dump(best_model, "tuned_decision_tree.pkl")
print("Tuned model saved successfully!")


Tuned model saved successfully!


In [86]:
# Persist the per-column encoders next to the model; the cell displays the
# list of written file names returned by joblib.
joblib.dump(value=label_encoders, filename="label_encoders.pkl")

['label_encoders.pkl']

In [87]:
# Show the column order of the training features.
print("Training feature order:", X_train.columns.tolist())


Training feature order: ['TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'MRG', 'REGULARITY']
