In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Let's load and Prepare our data

In [None]:
## import all relevant libraries
import seaborn as sn
from matplotlib import pyplot as plt
%matplotlib inline 


In [None]:
##lets load our dataset
df = pd.read_csv('../input/telecom-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

In [None]:
##lets normalize all the column names in our dataset
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

In [None]:
##lets check for column types
df.dtypes

In [None]:
##lets normalize the values in the our dataset
##lets select all columns with string values
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    
df.head()

In [None]:
##lets now change the column type of totalcharges to a numeric type
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
##lets fill the emplty values with zeros
df['totalcharges'] = df['totalcharges'].fillna(0)

df.dtypes

In [None]:
## Encode the target as integers: 1 when churn == 'yes', 0 otherwise.
## The dtype guard keeps this cell idempotent: re-running it on the already
## encoded column would compare integers with 'yes' and silently turn every
## label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

### Split our data into train, validation, and test sets.

In [None]:
##lets import all relevarant libraries
from sklearn.model_selection import train_test_split

##lets alocate 20% of our data to the test set
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

## lets allocate 33 percent to our val set and the rest to train set
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state =11)

##lets select our target data
y_train = df_train.churn.values
y_val = df_val.churn.values

## lets delete the target data from our training data
del df_train['churn']
del df_val['churn']

In [None]:
## lets create a list of columns with categorical data
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod' ]

## lets create a list of columns with numerical data
numerical = ['tenure', 'monthlycharges',  'totalcharges']

In [None]:
## check for unique values in each columns 
df_train_full[categorical].nunique()

### Simple feature engineering: converting all our categorical values to numerical values

In [None]:
##lets import all relevant libraries
from sklearn.feature_extraction import DictVectorizer

## convert the training dataframe into a list of dictionaries, one per row
train_dict = df_train[categorical + numerical].to_dict(orient='records')

## lets instantiate the vectorizer (sparse=False: ask for a dense array rather than a sparse matrix)
dv = DictVectorizer(sparse=False)

## fit the vectorizer on the training records only
dv.fit(train_dict)

## transform the training records with the fitted vectorizer to get the training matrix
X_train = dv.transform(train_dict)

### Train our base model

In [None]:
##import all relevant libraries
from sklearn.linear_model import LogisticRegression

## create an instante
model = LogisticRegression(solver='liblinear', random_state=1)

##train our model
model.fit(X_train, y_train)

### Evaluate our model's performance

In [None]:
## turn the validation rows into a list of dictionaries (same columns as used for training)
val_dict = df_val[categorical + numerical].to_dict(orient='records')

## reuse the vectorizer that was fitted on the training records; it is not re-fitted here
X_val = dv.transform(val_dict)

## keep column 1 of predict_proba (presumably the probability of churn, i.e. class 1)
y_pred = model.predict_proba(X_val)[:,1]

### Small subset

In [None]:
small_subset = ['contract', 'tenure', 'totalcharges']

train_dict_small_subset = df_train[small_subset].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

dv.fit(train_dict_small_subset)

X_small_train = dv.transform(train_dict_small_subset)

In [None]:
##lets train
from sklearn.linear_model import LogisticRegression

model_small = LogisticRegression(solver='liblinear', random_state=1)

model_small.fit(X_small_train, y_train)

In [None]:
##lets evaluate our model
## turn the validation rows (small feature subset only) into a list of dictionaries
val_dict_small_subset = df_val[small_subset].to_dict(orient='records')

## `dv` is used as-is here; it is not re-fitted on the validation records
X_small_val = dv.transform(val_dict_small_subset)

## keep column 1 of predict_proba (presumably the probability of churn, i.e. class 1)
y_pred_small = model_small.predict_proba(X_small_val)[:,1]

### Accuracy

In [None]:
##get the predictions from our model
y_pred = model.predict_proba(X_val)[:,1]

## make hard predictions
churn = y_pred >= 0.5

## computes the accuracy
(y_val == churn).mean()


In [None]:
from sklearn.metrics import accuracy_score

##lets create an area with different thresholds
thresholds = np.linspace(0,1,11)

##lets loop over our threshold values
for t in thresholds:
    ##lets make the hard predictions
    churn = y_pred >= t
    ##lets use accuracy_score from scikit learn
    acc = accuracy_score(y_val, churn)
    ##lets print the thresholds and the accuracy values to std ouput
    print('%0.2f %0.3f' % (t, acc))

### Lets visualize our accuracy score with respect to each threshold value

In [None]:
## 21 evenly spaced thresholds from 0 to 1 (steps of 0.05)
thresholds = np.linspace(0,1,21)

## list collecting the accuracy obtained at each threshold
accuracies = []

## score the hard predictions (y_pred >= t) against y_val for every threshold
for t in thresholds:
    acc = accuracy_score(y_val, y_pred >= t)
    accuracies.append(acc)

## plot accuracy as a function of the threshold
plt.plot(thresholds, accuracies)

In [None]:
## lets also compute the accuracy on our validation set
val_dict_small = df_val[small_subset].to_dict(orient='records')

X_small_val = dv.transform(val_dict_small)

y_pred_small = model_small.predict_proba(X_small_val)[:,1]

churn_small = y_pred_small >= 0.5
accuracy_score(y_val, churn_small)

### Dummy baseline

In [None]:
size_val = len(y_val)
baseline = np.repeat(False, size_val)
baseline

In [None]:
accuracy_score(baseline, y_val)

### Confusion table
This refers to a table that concisely represents every possible outcome of our model's predictions,
namely: True Positive, True Negative, False Positive, and False Negative.

In [None]:
##lets set our predictions at the threshold value of 0.5
t = 0.5
predicted_churn = (y_pred >= t)
predicted_no_churn = (y_pred < t)

## lets get the actual targets as boolean masks
actual_churn = (y_val == 1)
actual_no_churn = (y_val == 0)

## predicted churn: count the correct ones (true positives) and the wrong ones (false positives)
true_positive = (predicted_churn & actual_churn).sum()
false_positive = (predicted_churn & actual_no_churn).sum()

## predicted no churn: count the correct ones (true negatives) and the wrong ones (false negatives)
true_negative = (predicted_no_churn & actual_no_churn).sum()
false_negative = (predicted_no_churn & actual_churn).sum()

In [None]:
##lets put all the values together
confusion_table = np.array(
    [[true_negative, false_positive],
    [false_negative, true_positive]]
)

confusion_table

In [None]:
confusion_table / confusion_table.sum()

In [None]:
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
precision, recall 

### Evaluating a model at multiple thresholds 

In [None]:
## lets create a list where we'll keep the results
scores = []

## lets create an array with 101 evenly spaced threshold values from 0 to 1
thresholds = np.linspace(0, 1, 101)

## lets loop through them and compute the confusion table counts at each threshold
for t in thresholds:
    tp = ((y_pred >= t) & (y_val == 1)).sum()
    fp = ((y_pred >= t) & (y_val == 0)).sum()
    fn = ((y_pred < t) & (y_val == 1)).sum()
    tn = ((y_pred < t) & (y_val == 0)).sum()

    ##lets append the results to the score list, one tuple per threshold
    scores.append((t, tp, fp, fn, tn))


## lets turn the list into a pandas dataframe (one row per threshold)
df_scores = pd.DataFrame(scores)

##assigns names to the columns of the dataframe, in the same order as the tuples above
df_scores.columns = ['threshold', 'tp', 'fp', 'fn', 'tn']

In [None]:
df_scores[::10]

In [None]:
##lets compute the TPR and FPR scores
df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

In [None]:
df_scores[::10]

In [None]:
plt.plot(df_scores.threshold, df_scores.tpr, linestyle='solid' , label='TPR')
plt.plot(df_scores.threshold, df_scores.fpr, linestyle='dashed',  label='FPR')
plt.legend()

### Random baseline model  
This model outputs a random score between 0 and 1, regardless of the input. 

In [None]:
##lets create a function to cal TRP and FPR at diff thresholds
def tpr_fpr_dataframe(y_val, y_pred):
    """
    Build a table of confusion counts, TPR and FPR for 101 thresholds in [0, 1].

    Parameters
    ----------
    y_val : actual labels; entries equal to 1 count as positive and entries
        equal to 0 as negative (assumes an array supporting elementwise
        comparison, e.g. a NumPy array).
    y_pred : predicted scores, compared elementwise against each threshold.

    Returns
    -------
    pandas.DataFrame with one row per threshold and the columns
    'threshold', 'tp', 'fp', 'fn', 'tn', 'tpr', 'fpr'.
    """
    rows = []

    ## one row of confusion counts per cutoff: 0.00, 0.01, ..., 1.00
    for cutoff in np.linspace(0, 1, 101):
        flagged = (y_pred >= cutoff)
        cleared = (y_pred < cutoff)
        rows.append((
            cutoff,
            (flagged & (y_val == 1)).sum(),
            (flagged & (y_val == 0)).sum(),
            (cleared & (y_val == 1)).sum(),
            (cleared & (y_val == 0)).sum(),
        ))

    ##lets collect the counts in a dataframe with named columns
    result = pd.DataFrame(rows)
    result.columns = ['threshold', 'tp', 'fp', 'fn', 'tn']

    ##TPR = tp / all actual positives, FPR = fp / all actual negatives
    actual_pos = result['tp'] + result['fn']
    actual_neg = result['fp'] + result['tn']
    result['tpr'] = result['tp'] / actual_pos
    result['fpr'] = result['fp'] / actual_neg

    return result

In [None]:
##lets fix the random seet 
np.random.seed(1)

##generate an array with random numbers b/n 0 and 1
y_rand = np.random.uniform(0, 1, size=len(y_val))

##lets use this function to cal the TPR and FPR
df_rand = tpr_fpr_dataframe(y_val, y_rand)

##display 
df_rand[::10]


In [None]:
plt.figure(figsize=(6,4))

plt.plot(df_rand.threshold, df_rand.tpr, label='TPR')
plt.plot(df_rand.threshold, df_rand.fpr, label='FPR')
plt.legend()

plt.xticks(np.linspace(0,1,11))
plt.yticks(np.linspace(0,1,11))

plt.xlabel('Thresholds')
plt.title('TPR and FPR for the random model')

plt.show()

## The Ideal model 
The ideal model always makes a correct prediction.


In [None]:
##lets calculates the number of neg and posi examples in the dataset. 
num_neg = (y_val == 0).sum()
num_pos = (y_val == 1).sum()

##lets generates an array that first repeats 0s num_neg num of times, 
##followed by 1s repeated num_pos number of times 
y_ideal = np.repeat([0,1], [num_neg, num_pos])
y_pred_ideal = np.linspace(0, 1, num_neg + num_pos)

##computes the TPR and FPR curves for classifier
df_ideal = tpr_fpr_dataframe(y_ideal, y_pred_ideal)



In [None]:
plt.figure(figsize=(6,4))

plt.plot(df_ideal.threshold, df_ideal.tpr, label='TPR')
plt.plot(df_ideal.threshold, df_ideal.fpr, label='FPR')
plt.legend()

plt.xticks(np.linspace(0,1,11))
plt.yticks(np.linspace(0, 1, 11))

plt.vlines(1 - y_val.mean(), -1, 2, linewidth=0.5, linestyle='dashed', color='grey')
plt.ylim(-0.03, 1.03)

plt.xlabel('Thresholds')
plt.title('TPR and FPR for the ideal model')
plt.show()

### ROC curve

In [None]:
##lets make the plot square
plt.figure(figsize=(5,5))

plt.plot(df_scores.fpr, df_scores.tpr, color='black', label='Model')
plt.plot(df_rand.fpr, df_rand.tpr, color='black', lw=1, linestyle='dashed', alpha=0.5, label='Random')
plt.plot(df_ideal.fpr, df_ideal.tpr, color='black', lw=0.5, linestyle='solid', alpha=0.5, label='Ideal')
plt.legend()

plt.xlim([-0.02, 1.02])
plt.ylim([-0.02, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.title('ROC curve')


In [None]:
plt.figure(figsize=(5, 5))

plt.plot(df_scores.fpr, df_scores.tpr, color='black')
plt.plot([0, 1], [0, 1], color='black', lw=0.7, linestyle='dashed', alpha=0.5)

plt.xlim([-0.02, 1.02])
plt.ylim([-0.02, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.title('ROC curve')

# plt.savefig('04_roc_curve.svg')

plt.show()

### Plotting ROC Curve using Scikit learn

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_val, y_pred)

plt.figure(figsize=(5,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1])

In [None]:
##Lets plot the ROC curve of small and large models 
fpr_large, tpr_large, _ = roc_curve(y_val, y_pred)
fpr_small, tpr_small, _ = roc_curve(y_val, y_pred_small)

plt.figure(figsize=(5,5))

plt.plot(fpr_large, tpr_large, label='Large')
plt.plot(fpr_small, tpr_small, label='Small')
plt.plot([0,1], [0,1])
plt.legend()

### Area under the ROC (AUC) 

In [None]:
from sklearn.metrics import auc

auc(df_scores.fpr, df_scores.tpr)

In [None]:
##lets cal auc for the small model, from the ROC points computed for it above
auc(fpr_small, tpr_small)


In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

### Let's put all the code for training into a train function, which first converts the data into a one-hot encoding representation and then trains the model.

In [None]:
def train(df, y):
    """
    Vectorize the feature columns of `df` and fit a logistic regression on them.

    Reads the module-level `categorical` and `numerical` column lists.

    Parameters
    ----------
    df : dataframe containing the columns listed in `categorical` and `numerical`.
    y : target values passed to the model's `fit`.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    ##one dictionary per row, restricted to the feature columns
    records = df[categorical + numerical].to_dict(orient='records')

    ##fit the vectorizer on these records, then encode them
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(records)
    features = vectorizer.transform(records)

    ##train the classifier on the encoded matrix
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(features, y)

    return vectorizer, classifier

Likewise, we also put the prediction logic into a predict function. 
This function takes in a dataframe with customers,
the vectorizer we “trained” previously — for doing onehot encoding — and the model. 
Then we apply the vectorizer to the dataframe, get a matrix, 
and finally apply the model to the matrix to get predictions

In [None]:
def predict(df, dv, model):
    """
    Score the rows of `df` with an already fitted vectorizer and model.

    Reads the module-level `categorical` and `numerical` column lists.

    Parameters
    ----------
    df : dataframe containing the columns listed in `categorical` and `numerical`.
    dv : fitted vectorizer; only its `transform` method is used.
    model : fitted classifier; only its `predict_proba` method is used.

    Returns
    -------
    Column 1 of `model.predict_proba(...)`, one value per row of `df`.
    """
    feature_columns = categorical + numerical
    records = df[feature_columns].to_dict(orient='records')
    return model.predict_proba(dv.transform(records))[:,1]

Let's use these functions for implementing K-fold cross-validation.


In [None]:
##lets import the KFold class
from sklearn.model_selection import KFold

## lets use it to split the data into 10 parts
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

## creates a list for storing the results (one AUC per fold)
aucs = []

##lets iterate over the 10 diff splits of the data
## Fold-specific names are used on purpose: the loop must not overwrite the
## notebook-level df_train / df_val / y_val / y_pred / dv / model that earlier
## cells rely on, nor the `auc` function imported from sklearn.metrics.
for train_idx, val_idx in kfold.split(df_train_full):
    ##lets splits the data into train and validation sets
    df_fold_train = df_train_full.iloc[train_idx]
    df_fold_val = df_train_full.iloc[val_idx]

    y_fold_train = df_fold_train.churn.values
    y_fold_val = df_fold_val.churn.values

    #trains the model and makes predictions
    fold_dv, fold_model = train(df_fold_train, y_fold_train)
    y_fold_pred = predict(df_fold_val, fold_dv, fold_model)

    ## lets evaluate the quality of the model on this fold's validation data using AUC
    fold_auc = roc_auc_score(y_fold_val, y_fold_pred)
    aucs.append(fold_auc)
    

In [None]:
print('auc = %0.3f ± %0.3f' % (np.mean(aucs), np.std(aucs)))

### Finding the best parameters
Let’s use our cross-validation procedure to select the best value of the parameter C. For
that, we first adjust the train function to take in an additional parameter.

In [None]:
def train(df, y, C=1.0):
    """
    Vectorize the feature columns of `df` and fit a logistic regression on them.

    Reads the module-level `categorical` and `numerical` column lists.

    Parameters
    ----------
    df : dataframe containing the columns listed in `categorical` and `numerical`.
    y : target values passed to the model's `fit`.
    C : value forwarded to `LogisticRegression(C=...)`. Defaults to 1.0
        (scikit-learn's own default) so that the two-argument call
        `train(df, y)` used by the earlier cross-validation cell keeps working
        after this redefinition.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    ##one dictionary per row, restricted to the feature columns
    cat = df[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(cat)

    X = dv.transform(cat)

    ##lets use the parameter during training
    model = LogisticRegression(solver='liblinear', C=C)
    model.fit(X, y)

    return dv, model

Now let’s find the best parameter C. The idea is simple:
loop over different values of C and,
for each C, run cross-validation and record the mean AUC across all folds as
well as the standard deviation.

In [None]:
##Tuning the model: selecting the best parameter C using cross-validation
nfolds = 5
kfolds = KFold(n_splits=nfolds, shuffle=True, random_state=1)

for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    ## AUC of every fold for the current value of C
    aucs = []

    ## iterate over the 5-fold splitter defined above (`kfolds`), not the
    ## 10-fold `kfold` object left over from the previous section
    for train_idx, val_idx in kfolds.split(df_train_full):
        ## fold-specific names: do not overwrite notebook-level variables
        df_fold_train = df_train_full.iloc[train_idx]
        df_fold_val = df_train_full.iloc[val_idx]

        y_fold_train = df_fold_train.churn.values
        y_fold_val = df_fold_val.churn.values

        fold_dv, fold_model = train(df_fold_train, y_fold_train, C=C)
        y_fold_pred = predict(df_fold_val, fold_dv, fold_model)

        ## named fold_auc so the `auc` function imported from sklearn.metrics is not shadowed
        fold_auc = roc_auc_score(y_fold_val, y_fold_pred)
        aucs.append(fold_auc)

    ## mean ± standard deviation of the AUC across the folds for this C
    print('C=%s, auc = %0.3f ± %0.3f' % (C, np.mean(aucs), np.std(aucs)))

Let's train the model on the entire train and validation
datasets and apply it to the test dataset to verify it indeed works well.
Let’s use our train and predict functions for that:

In [None]:
## targets for the full training set (train + validation) and for the held-out test set
y_train = df_train_full.churn.values
y_test = df_test.churn.values

##lets train the model on the full training dataset
dv, model = train(df_train_full, y_train, C=0.5)
##applies it to the test dataset
y_pred = predict(df_test, dv, model)

## evaluates the predictions on the test data
## (stored as test_auc so the `auc` function imported from sklearn.metrics is not shadowed)
test_auc = roc_auc_score(y_test, y_pred)
print('auc = %.3f' % test_auc)

In [None]:
customer = {
'customerid': '8879-zkjof',
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75,
}

In [None]:
## one-row dataframe for the example customer; a dedicated name is used so the
## dataset loaded into `df` at the top of the notebook is not overwritten
df_customer = pd.DataFrame([customer])
y_pred = predict(df_customer, dv, model)
## the single value returned for this customer
y_pred[0]

In [None]:
def predict(df, dv, model):
    """
    Score the rows of `df` with an already fitted vectorizer and model.

    Reads the module-level `categorical` and `numerical` column lists.

    Parameters
    ----------
    df : dataframe containing the columns listed in `categorical` and `numerical`.
    dv : fitted vectorizer; only its `transform` method is used.
    model : fitted classifier; only its `predict_proba` method is used.

    Returns
    -------
    Column 1 of `model.predict_proba(...)`, one value per row of `df`.
    """
    ## 'records' is the valid orient for a list of per-row dictionaries;
    ## the former 'rows' spelling is rejected with a ValueError by pandas >= 2.0
    cat = df[categorical + numerical].to_dict(orient='records')
    X = dv.transform(cat)
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred

In [None]:
## lets create a function to predict the prob of churn for a single customer only
def predict_single(customer, dv, model):
    """
    Score one customer given as a dictionary of feature values.

    Parameters
    ----------
    customer : dictionary describing a single customer.
    dv : fitted vectorizer; only its `transform` method is used.
    model : fitted classifier; only its `predict_proba` method is used.

    Returns
    -------
    The column-1 value of `model.predict_proba(...)` for this customer.
    """
    ## the vectorizer expects a list of records, so wrap the single customer in a list
    features = dv.transform([customer])
    ## row 0 is the only row; column 1 is the value to return
    return model.predict_proba(features)[0, 1]

In [None]:
##
predict_single(customer, dv, model)