In [380]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn, sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from typing import Tuple
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from joblib import dump, load

import warnings
warnings.filterwarnings("ignore")

In [381]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
churn = fetch_ucirepo(id=563) 
  
# data (as pandas dataframes) 
X = churn.data.features 
y = churn.data.targets 
  
# metadata 
print(churn.metadata) 
  
# variable information 
print(churn.variables) 

{'uci_id': 563, 'name': 'Iranian Churn', 'repository_url': 'https://archive.ics.uci.edu/dataset/563/iranian+churn+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/563/data.csv', 'abstract': "This dataset is randomly collected from an Iranian telecom company's database over a period of 12 months.", 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 3150, 'num_features': 13, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Churn'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Sat Mar 09 2024', 'dataset_doi': '10.24432/C5JW3Z', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is randomly collected from an Iranian telecom companyâ€™s database over a period of 12 months. A total of 3150 rows of data, each representing a customer, bear information for 13 columns. The attribu

In [382]:
df = pd.concat([X,y],axis=1)
print(df.isna().any())

Call  Failure              False
Complains                  False
Subscription  Length       False
Charge  Amount             False
Seconds of Use             False
Frequency of use           False
Frequency of SMS           False
Distinct Called Numbers    False
Age Group                  False
Tariff Plan                False
Status                     False
Age                        False
Customer Value             False
Churn                      False
dtype: bool


In [383]:
df

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.640,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.520,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.020,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.980,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.210,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.320,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.640,0


In [384]:
y["Churn"].unique()

array([0, 1])

In [385]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call  Failure            3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription  Length     3150 non-null   int64  
 3   Charge  Amount           3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

In [386]:
train_df, test_df = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=0)


In [387]:
# find best model with cross validation

models = []
best_model = None
best_model_score = 0

y_train = train_df["Churn"]
y_test = test_df["Churn"]
X_train = train_df.drop(columns=["Churn"])
X_test = test_df.drop(columns=["Churn"])
smote = SMOTE(random_state=0)
X_upsampled, y_upsampled = smote.fit_resample(X_train, y_train)

for n_estimators in range(1, 51):
    for i in range (1,6):
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=4) , n_estimators=n_estimators,random_state=i)
        # train best model
        model.fit(X_upsampled, y_upsampled)
        y_pred = model.predict(X_test)
        bac = balanced_accuracy_score(y_test,y_pred)
        if bac>best_model_score:
            print("best n_estimator so far: ",n_estimators)
            best_model_score = bac
            best_model = model
        
print(best_model_score)

best n_estimator so far:  1
best n_estimator so far:  2
best n_estimator so far:  4
best n_estimator so far:  5
best n_estimator so far:  6
0.9442233531141595


In [388]:
# save the model
dump(best_model, 'adaboost_model.joblib')

# ---------------------

# # load the model
# loaded_model = load('adaboost_model.joblib')

# # test the loaded model
# print(loaded_model.predict(X_test))

['adaboost_model.joblib']