In [1]:
import pandas as pd
import numpy as np

In [2]:
churn_data = pd.read_csv('train.csv')
print(churn_data.shape)

churn_data.head()

(4250, 20)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no


# The Target Feature

In [3]:
churn_data['churn'].value_counts() / len(churn_data)

no     0.859294
yes    0.140706
Name: churn, dtype: float64

# The Variable Types

In [4]:
numerical_features = [feature for feature in churn_data.columns if churn_data[feature].dtype != 'O']
print("Total number of numerical features: ", len(numerical_features))

print(numerical_features)

Total number of numerical features:  15
['account_length', 'number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_minutes', 'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'number_customer_service_calls']


In [5]:
categorical_features = [feature for feature in churn_data.columns if feature not in numerical_features and feature != 'churn']
print("Total number of categorical features: ", len(categorical_features))

print(categorical_features)

Total number of categorical features:  4
['state', 'area_code', 'international_plan', 'voice_mail_plan']


## Adding new col: area_code_num

In [6]:
churn_data['area_code_num'] = churn_data['area_code'].apply(lambda x: int(x[-3:]))

# train-test split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(churn_data.drop(['churn', 'state'], axis = 1),
                                                   churn_data['churn'],
                                                   test_size=0.3,
                                                   random_state=12)

# Categorical Features

In [8]:
from feature_engine.encoding import OneHotEncoder
categorical_features.remove('state')
encoder = OneHotEncoder(variables=categorical_features)
encoder.fit(X_train, y_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [9]:
print(X_train.shape)
X_train.head()

(2975, 23)


Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,...,total_intl_charge,number_customer_service_calls,area_code_num,area_code_area_code_510,area_code_area_code_415,area_code_area_code_408,international_plan_no,international_plan_yes,voice_mail_plan_no,voice_mail_plan_yes
4109,172,0,290.5,123,49.39,253.0,77,21.51,159.3,96,...,3.05,2,510,1,0,0,1,0,1,0
4170,114,23,186.2,95,31.65,258.3,129,21.96,183.9,93,...,2.11,0,415,0,1,0,1,0,0,1
2773,115,0,200.2,92,34.03,244.9,107,20.82,190.9,96,...,2.38,1,415,0,1,0,1,0,1,0
2007,101,0,217.7,118,37.01,231.7,128,19.69,185.3,128,...,0.0,3,415,0,1,0,1,0,1,0
366,135,27,273.4,141,46.48,154.0,99,13.09,245.8,112,...,3.32,1,510,1,0,0,1,0,0,1


# Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train, y_train)

# columns = X_train.columns.to_list()

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Encoding Target feature

In [11]:
def encode_target(value):
    target_dict = {'yes': 1, 'no': 0}
    
    return target_dict[value]

In [12]:
y_train.head()

4109    yes
4170     no
2773     no
2007     no
366      no
Name: churn, dtype: object

In [13]:
y_train = y_train.apply(lambda x: encode_target(x))
y_train = np.array(y_train)

In [14]:
y_test

1168    yes
3603     no
3894     no
3739     no
1653     no
       ... 
1066     no
3171     no
4187     no
2340     no
416      no
Name: churn, Length: 1275, dtype: object

In [15]:
y_test = y_test.apply(lambda x: encode_target(x))
y_test = np.array(y_test)


# Model Building

## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
logit = LogisticRegression()

logit.fit(X_train, y_train)

LogisticRegression()

In [18]:
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)

# Model Evaluation

In [19]:
from sklearn.metrics import classification_report, accuracy_score

In [20]:
print("Train Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test Accuracy: ", accuracy_score(y_test, y_test_pred))

Train Accuracy:  0.8702521008403361
Test Accuracy:  0.8525490196078431


In [22]:
print("Train report: \n", classification_report(y_train, y_train_pred))

Train report: 
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      2564
           1       0.58      0.22      0.32       411

    accuracy                           0.87      2975
   macro avg       0.73      0.60      0.62      2975
weighted avg       0.84      0.87      0.84      2975



In [23]:
print("Test report: \n", classification_report(y_test, y_test_pred))

Test report: 
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      1088
           1       0.49      0.19      0.28       187

    accuracy                           0.85      1275
   macro avg       0.68      0.58      0.60      1275
weighted avg       0.82      0.85      0.82      1275



## XGClassifier

In [25]:
import xgboost as xgb

classifier = xgb.XGBClassifier()
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [26]:
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

In [27]:
print("Train Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test Accuracy: ", accuracy_score(y_test, y_test_pred))

Train Accuracy:  1.0
Test Accuracy:  0.9615686274509804


In [28]:
print("Train report: \n", classification_report(y_train, y_train_pred))

Train report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2564
           1       1.00      1.00      1.00       411

    accuracy                           1.00      2975
   macro avg       1.00      1.00      1.00      2975
weighted avg       1.00      1.00      1.00      2975



In [29]:
print("Test report: \n", classification_report(y_test, y_test_pred))

Test report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1088
           1       0.97      0.76      0.85       187

    accuracy                           0.96      1275
   macro avg       0.96      0.88      0.92      1275
weighted avg       0.96      0.96      0.96      1275



# Score New Data

In [30]:
churn_data_test = pd.read_csv('test.csv')
print(churn_data.shape)

churn_data_test.head()

(4250, 21)


Unnamed: 0,id,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,1,KS,128,area_code_415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,2,AL,118,area_code_510,yes,no,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0
2,3,IA,62,area_code_415,no,no,0,120.7,70,20.52,307.2,76,26.11,203.0,99,9.14,13.1,6,3.54,4
3,4,VT,93,area_code_510,no,no,0,190.7,114,32.42,218.2,111,18.55,129.6,121,5.83,8.1,3,2.19,3
4,5,NE,174,area_code_415,no,no,0,124.3,76,21.13,277.1,112,23.55,250.7,115,11.28,15.5,5,4.19,3


## The Target Feature

In [32]:
churn_data_test = churn_data_test.drop(['id', 'state'], axis = 1)
churn_data_test['area_code_num'] = churn_data_test['area_code'].apply(lambda x: int(x[-3:]))

In [33]:
churn_data_test.head()

Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,area_code_num
0,128,area_code_415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,415
1,118,area_code_510,yes,no,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,510
2,62,area_code_415,no,no,0,120.7,70,20.52,307.2,76,26.11,203.0,99,9.14,13.1,6,3.54,4,415
3,93,area_code_510,no,no,0,190.7,114,32.42,218.2,111,18.55,129.6,121,5.83,8.1,3,2.19,3,510
4,174,area_code_415,no,no,0,124.3,76,21.13,277.1,112,23.55,250.7,115,11.28,15.5,5,4.19,3,415


In [34]:
churn_data_test = encoder.transform(churn_data_test)

In [35]:
churn_data_test.head()

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,...,total_intl_charge,number_customer_service_calls,area_code_num,area_code_area_code_510,area_code_area_code_415,area_code_area_code_408,international_plan_no,international_plan_yes,voice_mail_plan_no,voice_mail_plan_yes
0,128,25,265.1,110,45.07,197.4,99,16.78,244.7,91,...,2.7,1,415,0,1,0,1,0,0,1
1,118,0,223.4,98,37.98,220.6,101,18.75,203.9,118,...,1.7,0,510,1,0,0,0,1,1,0
2,62,0,120.7,70,20.52,307.2,76,26.11,203.0,99,...,3.54,4,415,0,1,0,1,0,1,0
3,93,0,190.7,114,32.42,218.2,111,18.55,129.6,121,...,2.19,3,510,1,0,0,1,0,1,0
4,174,0,124.3,76,21.13,277.1,112,23.55,250.7,115,...,4.19,3,415,0,1,0,1,0,1,0


In [36]:
churn_data_test = scaler.transform(churn_data_test)

In [38]:
churn_data_test

array([[ 0.71118121,  1.27341293,  1.57924352, ..., -0.32423186,
        -1.65736278,  1.65736278],
       [ 0.45539775, -0.57476907,  0.8038017 , ...,  3.08421262,
         0.6033682 , -0.6033682 ],
       [-0.97698968, -0.57476907, -1.10597945, ..., -0.32423186,
         0.6033682 , -0.6033682 ],
       ...,
       [ 1.32506154, -0.57476907,  0.07484921, ..., -0.32423186,
         0.6033682 , -0.6033682 ],
       [-1.00256802, -0.57476907, -0.73592449, ..., -0.32423186,
         0.6033682 , -0.6033682 ],
       [ 0.22519263, -0.57476907,  0.16038955, ..., -0.32423186,
         0.6033682 , -0.6033682 ]])

## Predictions

In [39]:
new_data_preds = classifier.predict(churn_data_test)

In [44]:
new_data_preds

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
new_data_preds = pd.Series(new_data_preds)

In [46]:
new_data_preds

0      0
1      0
2      1
3      0
4      0
      ..
745    0
746    0
747    0
748    0
749    0
Length: 750, dtype: int32

## Saving results

In [47]:
sample_submission = pd.read_csv('sampleSubmission.csv')
print(sample_submission.shape)
sample_submission.head()

(750, 2)


Unnamed: 0,id,churn
0,1,yes
1,2,no
2,3,no
3,4,yes
4,5,yes


In [64]:
def decode_target(value):
    target_dict = {1: 'yes', 0: 'no'}
    
    return target_dict[value]

In [65]:
new_data_preds = new_data_preds.apply(lambda x: str(decode_target(x)))

In [67]:
new_data_preds.head()

0     no
1     no
2    yes
3     no
4     no
dtype: object

In [68]:
sample_submission['churn'] = new_data_preds

In [69]:
sample_submission.head()

Unnamed: 0,id,churn
0,1,no
1,2,no
2,3,yes
3,4,no
4,5,no


In [70]:
sample_submission.to_csv('sampleSubmission.csv', index = False)