In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Location of the churn CSV that get_data() reads; being a relative path, it is
# resolved against the working directory the notebook kernel runs in.
file_name = '../../data/Customer Churn Data.csv'

def pipeline(pickle = True):
    '''
    Runs the whole workflow: load and split the data, fit the model and
    optionally save it.

    Parameters
    ----------
    pickle: when truthy the fitted model is written to 'model.pickle'

    Returns
    -------
    The fitted model
    '''
    # Only the training half of the split is needed to fit the model.
    X_train, _, y_train, _ = get_train_and_test_data()
    fitted = make_model(X_train, y_train)
    if pickle:
        pickler(fitted, 'model.pickle')
    return fitted

    
def get_train_and_test_data():
    '''
    Loads the prepared churn data and hands it straight to split_data.

    Returns
    -------
    Whatever split_data returns for the frame produced by get_data
    '''
    return split_data(get_data())
    
    
def get_data(path = None):
    '''
    Gets data from the data file and does some pruning.

    The 'international plan' and 'voice mail plan' columns are turned into
    0/1 integers (1 where the value is 'yes'). The four charge columns are
    aggregated into a single 'total charge' column and then dropped, and the
    'area code', 'phone number' and 'state' columns are dropped as well.
    (Per the original author's notes, the dropped columns worsen the model and
    aggregating the charges helps it.)

    Parameters
    ----------
    path: optional location of the CSV file to read. When left as None the
        module-level file_name is used, which is the original behaviour.

    Returns
    -------
    Returns the data frame to be used in making the model
    '''
    df = pd.read_csv(file_name if path is None else path)

    # Encode the yes/no plan flags as integers so the estimators can use them.
    for plan_column in ('international plan', 'voice mail plan'):
        df[plan_column] = (df[plan_column] == 'yes').astype(int)

    # Collapse the per-period charges into one feature, then remove the parts.
    df['total charge'] = df['total day charge'] + df['total eve charge'] + df['total intl charge'] + df['total night charge']
    df = df.drop(['total day charge', 'total eve charge', 'total intl charge', 'total night charge'], axis = 1)

    df = df.drop(['area code', 'phone number', 'state'], axis = 1)
    return df
    
    
def split_data(data):
    '''
    Separates the 'churn' target from the predictors and performs a
    70/30 train test split with a fixed random_state of 42.

    Parameters
    ----------
    data: churn data to be split; it is not modified

    Returns
    -------
    Training predictors, test predictors, training target, test target
    '''
    labels = data['churn']
    # drop returns a new frame, so the caller's data keeps its 'churn' column
    predictors = data.drop(columns = ['churn'])
    return train_test_split(predictors, labels, test_size = 0.30, random_state = 42)


def make_model(X_train, y_train, random_state = 42):
    '''
    fits and returns a stacking model based on the data passed in

    The stack combines a random forest, a liblinear logistic regression and a
    gradient boosting classifier, with a logistic regression as the final
    estimator and 5-fold cross validation.

    Parameters
    ----------
    X_train: training predictors
    y_train: training target
    random_state: seed handed to the three base estimators so that repeated
        runs fit the same model. Defaults to 42, the same seed split_data
        uses. Pass None to get the previous, unseeded behaviour.

    Returns
    -------
    The fitted StackingClassifier
    '''
    # Seed every stochastic base learner; without this the fitted model (and
    # every metric computed from it) changes from run to run.
    estimators = [('rf', RandomForestClassifier(random_state = random_state)),
                  ('log', LogisticRegression(solver = 'liblinear', random_state = random_state)),
                  ('grad', GradientBoostingClassifier(random_state = random_state))]
    stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
    stack.fit(X_train, y_train)
    return stack
    

def metrics(y_true, y_pred):
    '''
    Scores predictions against the true labels.

    Parameters
    ----------
    y_true: true target values
    y_pred: predicted target values

    Returns
    -------
    Dictionary with 'Accuracy', 'Precision', 'Recall' and 'F1' as strings,
    plus the 'confusion_matrix' as returned by sklearn
    '''
    scorers = (('Accuracy', accuracy_score),
               ('Precision', precision_score),
               ('Recall', recall_score),
               ('F1', f1_score))
    # The scalar scores are stored as strings; the matrix is kept as is.
    report = {label: str(scorer(y_true, y_pred)) for label, scorer in scorers}
    report['confusion_matrix'] = confusion_matrix(y_true, y_pred)
    return report
    
    
def pickler(model, file_name):
    '''
    turns a model into a pickle file

    Parameters
    ----------
    model: the object to serialise
    file_name: path of the file to write; an existing file is overwritten

    Returns
    -------
    None
    '''
    # The with block closes the file even when pickle.dump raises.
    with open(file_name, 'wb') as output_file:
        pickle.dump(model, output_file)

    
def read_pickle(file_name):
    '''
    reads a pickle file

    Parameters
    ----------
    file_name: path of the pickle file to load

    Returns
    -------
    The object stored in the file
    '''
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by a trusted source (e.g. by pickler in this notebook).
    # The with block closes the file even when pickle.load raises.
    with open(file_name, "rb") as model_file:
        return pickle.load(model_file)

In [2]:
# Fit the stacking model; pickle = False means 'model.pickle' is not written.
model = pipeline(pickle = False)

In [3]:
# pipeline() does not return the split, so build it again here. split_data uses
# a fixed random_state, so for an unchanged CSV this is the same partition the
# model was trained on.
X_train, X_test, y_train, y_test = get_train_and_test_data()

In [4]:
# Score the fitted model on the held-out test split.
metrics(y_test, model.predict(X_test))

{'Accuracy': '0.984',
 'Precision': '1.0',
 'Recall': '0.8881118881118881',
 'F1': '0.9407407407407408',
 'confusion_matrix': array([[857,   0],
        [ 16, 127]])}

In [5]:
# No scaler is defined anywhere in this notebook and the stack was fitted on the
# unscaled training frame, so the fitted logistic-regression base estimator
# (index 1 in the estimators list) is given X_test directly.
model.estimators_[1].predict_proba(X_test)

In [6]:
# The model above was trained with pickle = False, so 'model.pickle' does not
# exist on a fresh run; save the in-memory model first so the load succeeds.
pickler(model, 'model.pickle')
model = read_pickle('model.pickle')

In [7]:
# Inspect the training target (the 'churn' column of the training split).
y_train

2016    False
1362    False
2670    False
2210     True
1846    False
        ...  
1095    False
1130    False
1294    False
860     False
3174    False
Name: churn, Length: 2333, dtype: bool

In [8]:
# Load the raw CSV without any of get_data()'s preprocessing.
df = pd.read_csv(file_name)

In [9]:
# Take the last of the first ten processed rows and remove the target column,
# leaving a single row of predictors.
(get_data().head(10).tail(1).drop(['churn'], axis = 1))

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,total charge
9,141,1,1,37,258.6,84,222.0,111,326.4,97,11.2,5,0,80.54


In [10]:
# Scratch check: build a one-row DataFrame from a nested list with named columns.
pd.DataFrame([[1,2,3]], columns = ['a','b','c'])

Unnamed: 0,a,b,c
0,1,2,3


In [11]:
# First row of the training predictors.
X_train.head(1)

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,total charge
2016,80,0,0,0,202.4,118,260.2,67,177.4,112,9.2,5,3,66.99


In [12]:
# dtype of the 'churn' column in the raw frame.
df.head(1)['churn'].dtype

dtype('bool')

Index(['account length', 'international plan', 'voice mail plan', 'number vmail messages', 'total day minutes', 'total day calls', 'total eve minutes', 'total eve calls', 'total night minutes', 'total night calls', 'total intl minutes', 'total intl calls', 'customer service calls', 'churn', 'total charge'], dtype='object')