In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [6]:

def encode_columns(data, columns_to_encode):
    # Get a list of all column names except the ones to encode
    columns_to_keep = [col for col in data.columns if col not in columns_to_encode]
    # One-hot encode selected columns
    one_hot_encoded = pd.get_dummies(data, columns=columns_to_encode)
    return one_hot_encoded

def tenure(x):
    if x<=6:
        return 1
    elif x>6 and x<=12:
        return 2
    elif x>12 and x<=18:
        return 3
    elif x>18 and x<=24:
        return 4
    elif x>24 and x<=30:
        return 5
    else:
        return 6

In [7]:
def data_preparation(path):
    # read data using path
    data = pd.read_csv(path)
    # set all columns in lower capitals 
    data.columns = data.columns.str.lower()
    # drop the id column that isn't useful
    data.drop(["customerid"], inplace = True, axis = 1)
    # replace some null values in totalcharges
    data['totalcharges'] = pd.to_numeric(data['totalcharges'],errors='coerce', downcast='float')
    data['totalcharges'].fillna(0,inplace=True)
    return data

In [8]:
path = "data/data_telco.csv"
data = data_preparation(path)
data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.150002,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.649994,Yes


### Data split into train and test.

In [23]:

dataX = data.loc[:, data.columns != "churn"].values
dataY = data['churn']
# train is now 75% of the entire data set
df_full_train, df_test = train_test_split(data, test_size=0.2,random_state=1)



In [24]:
numerical = ['tenure','monthlycharges', 'totalcharges']

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod'
]

In [25]:
dicts = df_full_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts)
y_train = df_full_train['churn'].values

In [26]:

C = np.logspace(-4, 4, 50)
penalty = ['l1', 'l2']
logistic_Reg = LogisticRegression()
pipe = Pipeline(steps=[('logistic_Reg', logistic_Reg)])
parameters = dict(logistic_Reg__C=C, logistic_Reg__penalty=penalty)
logreg_cv=GridSearchCV(pipe,parameters,cv=10)
logreg_cv.fit(X_train,y_train)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [27]:
print('Best Penalty:', logreg_cv.best_estimator_.get_params()['logistic_Reg__penalty'])
print('Best C:', logreg_cv.best_estimator_.get_params()['logistic_Reg__C'])
print(logreg_cv.best_estimator_.get_params()['logistic_Reg'])
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Best Penalty: l2
Best C: 0.019306977288832496
LogisticRegression(C=0.019306977288832496)
tuned hpyerparameters :(best parameters)  {'logistic_Reg__C': 0.019306977288832496, 'logistic_Reg__penalty': 'l2'}
accuracy : 0.8045819004068882


In [28]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred, X

y_pred, X_test = predict(df_test, dv, logreg_cv)
y_test = df_test.churn.values
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_cv.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.81


In [29]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred)
auc

0.8567131961823048

The dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).

### Deploying the model

* Save the model :

In [16]:
import pickle
C = logreg_cv.best_estimator_.get_params()['logistic_Reg__C']
output_file = f"model_C={C}.bin"
output_file


'model_C=339.3221771895323.bin'

In [17]:
# write a binary file
f_out = open(output_file, "wb")
# save the model and the dictionary vectorizer (we need that in order to run the model)
pickle.dump((dv, logreg_cv), f_out)
# close the file
f_out.close()


In [18]:
# Load  the model to check ( kernel is restarted at this point )
import pickle
C = '339.3221771895323'
model_file = f"model_C={C}.bin"


In [19]:
with open(model_file, "rb") as f_in:
    dv, model = pickle.load(f_in)

In [39]:
customer = {
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "yes",
    "dependents": "no",
    "phoneservice": "no",
    "multiplelines": "no_phone_service",
    "internetservice": "dsl",
    "onlinesecurity": "no",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "no",
    "streamingmovies": "no",
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "electronic_check",
    "tenure": 0,
    "monthlycharges": 0,
    "totalcharges": 29.85
}

In [40]:


# turn this customer into a feature matrix
X = dv.transform([customer])



In [41]:
# probabilty that this customer churns
model.predict_proba(X)[0,1]

0.46795550876600445