# Data Preparation

In [121]:
# import libraries

import pandas as pd
import numpy as np

In [161]:
# Parameters
C=1
max_iter = 500
n_splits = 3


output_file = f'model_C={C}.bin'

In [122]:
## Data Preparation
df = pd.read_excel('E Commerce Dataset.xlsx', 'E Comm')
print(df.head())

print(df.shape)

   CustomerID  Churn  Tenure PreferredLoginDevice  CityTier  WarehouseToHome  \
0       50001      1     4.0         Mobile Phone         3              6.0   
1       50002      1     NaN                Phone         1              8.0   
2       50003      1     NaN                Phone         1             30.0   
3       50004      1     0.0                Phone         3             15.0   
4       50005      1     0.0                Phone         1             12.0   

  PreferredPaymentMode  Gender  HourSpendOnApp  NumberOfDeviceRegistered  \
0           Debit Card  Female             3.0                         3   
1                  UPI    Male             3.0                         4   
2           Debit Card    Male             2.0                         4   
3           Debit Card    Male             2.0                         4   
4                   CC    Male             NaN                         3   

     PreferedOrderCat  SatisfactionScore MaritalStatus  Number

In [123]:
df.columns  = df.columns.str.lower()
print(df.isnull().sum())

customerid                       0
churn                            0
tenure                         264
preferredlogindevice             0
citytier                         0
warehousetohome                251
preferredpaymentmode             0
gender                           0
hourspendonapp                 255
numberofdeviceregistered         0
preferedordercat                 0
satisfactionscore                0
maritalstatus                    0
numberofaddress                  0
complain                         0
orderamounthikefromlastyear    265
couponused                     256
ordercount                     258
daysincelastorder              307
cashbackamount                   0
dtype: int64


In [124]:
# Calculate the percentage of missing values for each variable
missing_values = df.isnull().mean() * 100

# Print the results
print("Percentage of missing values:")
print(missing_values)

# Fill missing values with mean
df.fillna(df.mean(), inplace=True)

Percentage of missing values:
customerid                     0.000000
churn                          0.000000
tenure                         4.689165
preferredlogindevice           0.000000
citytier                       0.000000
warehousetohome                4.458259
preferredpaymentmode           0.000000
gender                         0.000000
hourspendonapp                 4.529307
numberofdeviceregistered       0.000000
preferedordercat               0.000000
satisfactionscore              0.000000
maritalstatus                  0.000000
numberofaddress                0.000000
complain                       0.000000
orderamounthikefromlastyear    4.706927
couponused                     4.547069
ordercount                     4.582593
daysincelastorder              5.452931
cashbackamount                 0.000000
dtype: float64


  df.fillna(df.mean(), inplace=True)


## Spliting data into train, val and test sets

In [147]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
full_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Split the train data into train and validation sets
train_data, val_data = train_test_split(full_data, test_size=0.25, random_state=42)

# Print the sizes of the train, validation, and test sets
print("Full data size:", len(full_data))
print("Train data size:", len(train_data))
print("Validation data size:", len(val_data))
print("Test data size:", len(test_data))


X_train = train_data.reset_index(drop = True)
X_val = val_data.reset_index(drop = True)
X_test = test_data.reset_index(drop = True)

# assign the target variable 
y_train = train_data.churn.values
y_val = val_data.churn.values
y_test = test_data.churn.values
print(len(y_train)), print(len(y_val)), print(len(y_test))

# delete churn variable from df_train, val and test dataframe
del X_train['churn']
del X_val['churn']
del X_test['churn']

Full data size: 4504
Train data size: 3378
Validation data size: 1126
Test data size: 1126
3378
1126
1126


## Modeling and Evaluation

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import roc_auc_score

In [127]:
# define the logistic regression model
model = LogisticRegression()

In [133]:
# Define the parameter grid for hyperparameter tuning
params = {'C': [0.01, 0.1, 0.5, 1, 10], 'max_iter': [50, 100, 500, 1000]}

In [137]:
# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [138]:
# Print the best hyperparameters and their scores on the validation set
print('Best hyperparameters:', grid_search.best_params_)
print('Validation set score:', grid_search.best_score_)

Best hyperparameters: {'C': 1, 'max_iter': 500}
Validation set score: 0.852873986412448


In [144]:
# fit the model
lr = LogisticRegression(**grid_search.best_params_)
lr.fit(X_train, y_train)

# Encoding data
dicts = X_val.to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)
X_val = dv.fit_transform(dicts)

# Evaluate the model on the validation set using the best hyperparameters
val_preds = lr.predict(X_val)
val_acc = roc_auc_score(y_val, val_preds)
print("Validation accuracy:", val_acc)

Validation accuracy: 0.8756660746003553


##### Cross validation

In [156]:
# Apply k-fold cross-validation to the model

kfold = KFold(n_splits=5)
fold_accs = []
for train_index, val_index in kfold.split(full_data):
    train_data = full_data.iloc[train_index]
    val_data = full_data.iloc[val_index]

    y_train = train_data.churn.values
    y_val = val_data.churn.values

    dicts = train_data.to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)
    X_train = dv.fit_transform(dicts)
    lr = LogisticRegression(**grid_search.best_params_)
    lr.fit(X_train, y_train)

    dicts = val_data.to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)
    X_val = dv.fit_transform(dicts)

    val_preds = lr.predict(X_val)
    val_acc = roc_auc_score(y_val, val_preds)
    fold_accs.append(val_acc)

    print("Validation accuracy:", val_acc)

    print('validation results:')
    print('C=%s %.3f +- %.3f' % (grid_search.best_params_, np.mean(fold_accs), np.std(fold_accs)))



Validation accuracy: 0.646508639154648
validation results:
C={'C': 1, 'max_iter': 500} 0.647 +- 0.000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation accuracy: 0.9402777777777778
validation results:
C={'C': 1, 'max_iter': 500} 0.793 +- 0.147


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation accuracy: 0.9896033112039998
validation results:
C={'C': 1, 'max_iter': 500} 0.859 +- 0.151


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation accuracy: 0.9482964983577352
validation results:
C={'C': 1, 'max_iter': 500} 0.881 +- 0.137
Validation accuracy: 0.9861804995970991
validation results:
C={'C': 1, 'max_iter': 500} 0.902 +- 0.129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [157]:
# training the model
print('Training the Final model')
dicts = train_data.to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(dicts)
lr = LogisticRegression(**grid_search.best_params_)
lr.fit(X_train, y_train)

Training the Final model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [159]:
# Evaluate the performance of the model on the test set
dicts = test_data.to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)
X_test = dv.fit_transform(dicts)

y_pred = lr.predict(X_test)
test_accuracy = roc_auc_score(y_test, y_pred)
print("Test accuracy:", test_accuracy)

Test accuracy: 0.9595054140218858


## Save model

In [173]:
import pickle

# Save the model to a file using pickle
with open(output_file, 'wb') as f_out:
  pickle.dump((lr), f_out)

print(f'the model is saved to {output_file}')

the model is saved to model_C=1.bin


## Make prediction on new data

In [174]:
import pickle
# Load the saved model from the file
model_file = 'model_C=1.bin'

with open(model_file, 'rb') as f_in:
  lr = pickle.load( f_in)

In [176]:
customer ={
    'customerid':'90010',
    'tenure':'0.0',
    'preferredlogindevice':'phone',
    'citytier':'5',
    'warehousetohome':'10.0',
    'preferredpaymentmode':'credit_card',
    'gender':'female',
    'hourspendonapp':'4.0',
    'numberofdeviceregistered':'2',
    'preferedordercat':'laptop_&_accessory',
    'satisfactionscore':'3',
    'maritalstatus':'single',
    'numberofaddress':'12',
    'complain':'0',
    'orderamounthikefromlastyear':'15.0',
    'couponused':'0.0',
    'ordercount':'0.0',
    'daysincelastorder':'0.0',
    'cashbackamount':'100.07'
}

In [182]:
# Use the model to make predictions on new data
x =dv.transform([customer])
y_pred = lr.predict_proba(x)[0, 1]
churn_pred = lr.predict(x)

# Print the prediction
print('input', customer)
print('churn probability', y_pred)
print('churn score', churn_pred)

input {'customerid': '90010', 'tenure': '0.0', 'preferredlogindevice': 'phone', 'citytier': '5', 'warehousetohome': '10.0', 'preferredpaymentmode': 'credit_card', 'gender': 'female', 'hourspendonapp': '4.0', 'numberofdeviceregistered': '2', 'preferedordercat': 'laptop_&_accessory', 'satisfactionscore': '3', 'maritalstatus': 'single', 'numberofaddress': '12', 'complain': '0', 'orderamounthikefromlastyear': '15.0', 'couponused': '0.0', 'ordercount': '0.0', 'daysincelastorder': '0.0', 'cashbackamount': '100.07'}
churn probability 0.5041751491924896
churn score [1]
