In [106]:
# The usual modular suspects
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

# Of Mice & Machine Learning Mavericks
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, mutual_info_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

import env
import os
from acquire import get_telco_data
from prepare import prep_telco_data

import warnings
warnings.filterwarnings('ignore')

In [94]:
df = get_telco_data()

Using cached csv


In [95]:
train, validate, test = prep_telco_data(df)

In [80]:
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,enc_monthlycharges_group_76-100,enc_monthlycharges_group_>100,enc_totalcharges_group_0-2k,enc_totalcharges_group_2k-4k,enc_totalcharges_group_>6k,enc_tenure_months_1-6,enc_tenure_months_13-18,enc_tenure_months_7-12,monthly_charges,total_charges
5919,Female,0,No,No,58,Yes,Yes,Yes,Yes,Yes,...,0,0,0,0,0,0,0,0,-0.787739,-0.302765
1915,Male,0,No,Yes,71,Yes,Yes,Yes,Yes,No,...,1,0,0,0,1,0,0,0,-1.472492,-0.973414
5054,Female,0,Yes,Yes,35,Yes,Yes,No internet service,No internet service,No internet service,...,0,0,1,0,0,0,0,0,1.003921,0.583382
2355,Male,0,Yes,Yes,1,Yes,No,No internet service,No internet service,No internet service,...,0,0,1,0,0,1,0,0,1.306409,1.2582
6279,Male,1,No,No,20,Yes,No,Yes,No,No,...,0,0,1,0,0,0,0,0,1.475935,2.394832


In [82]:
train.churn.value_counts(), train.churn.value_counts(normalize = True)

(No     2891
 Yes    1046
 Name: churn, dtype: int64,
 No     0.734315
 Yes    0.265685
 Name: churn, dtype: float64)

In [83]:
train.churn.value_counts()
# just wanted to make sure the encoding went properly.
# At present, I have no idea whether or not keeping both versions of churn will prove problematic. 

No     2891
Yes    1046
Name: churn, dtype: int64

In [117]:
# Establish our baseline.  The rate at which the assumption of the majority class matches the real values.
# If a model does not perform better than this, it would not be wise to deploy.

#baseline = (y_train.value_counts().idxmax() == y_train).mean()
#baseline

baseline_accuracy = (train.churn == "No").mean()
round(baseline_accuracy, 2)
baseline = pd.Series(['No'] * train.shape[0])
baseline.value_counts()

No    3937
dtype: int64

In [85]:
train.dtypes

gender                              object
senior_citizen                       int64
partner                             object
dependents                          object
tenure                               int64
phone_service                       object
multiple_lines                      object
online_security                     object
online_backup                       object
device_protection                   object
tech_support                        object
streaming_tv                        object
streaming_movies                    object
paperless_billing                   object
churn                               object
contract_type                       object
internet_service_type               object
payment_type                        object
gender_encoded                       int64
partner_encoded                      int64
dependents_encoded                   int64
phone_service_encoded                int64
paperless_billing_encoded            int64
churn_encod

The vibe so far: 
The most likely drivers of churn are customers who lack a contract, pay via electronic check, utilize fiber-optic services, lack additional services as internet users, and are in their first year with telco. Let's test these assumptions statistically. 

In [86]:
def central_limit_theorem_test(*args, n_clt: int = 30) -> bool:
    '''
    Given two or more subgroups from a dataset, determines whether or not we have large enough sample
    sizes to use the central limit theorem to assume normal distribution.
    '''

    sample_sizes = [arg.size for arg in args]
    return min(sample_sizes) >= n_clt

def two_sample_ttest(
    sample1: pd.core.series.Series,
    sample2: pd.core.series.Series,
    alpha: float = 0.05,
    n_clt: int = 30,
    alternative: str = 'two-sided'
) -> None:
    '''
    Given two independent samples from a dataset, conducts a two sample t-test to compare means and outputs
    the relevant information to the console.
    Parameters
    ----------
    sample1 : Pandas Series
        A pandas series containing an independent sample from a dataset. This sample should be independent
        from sample2.
    sample2 : Pandas Series
        A pandas series containing an independent sample from a dataset. This sample should be independent
        from sample1.
    alpha : float, default 0.05
        The alpha value (derived from the confidence level) to use when determining whether or not to
        reject the null hypothesis.
    n_clt : int, default 3
        The minimum sample size required to use a parametric test. This parameter is used to determine
        if the central limit theorem can be used to assume a normal distribution of data. If the sample
        sizes are less than n_clt a non-parametric test will be used.
    alternative : str, default 'two-sided'
        The type of two sample t-test to perform. Possible values are 'two-sided', 'less', or 'greater',
        where 'less' and 'greater' are one tail t-tests and 'two-sided' is a two tail t-test.
    Returns
    -------
    None : Nothing is returned by this function. All relevant information is printed to the console.
    Examples
    --------
    >>> import stats_util as su
    >>> su.two_sample_ttest(sample1, sample2)
    >>> su.two_sample_ttest(sample1, sample2, alpha = 0.01, n_clt = 50)
    >>> su.two_sample_ttest(sample1, sample2, alternative = 'less')
    '''

    # Are the samples large enough to assume normal distribution?
    normal_dist = central_limit_theorem_test(sample1, sample2, n_clt = n_clt)
    print(f'Samples contain more than {n_clt} observations: {normal_dist}')

    # if our samples are normally distributed use a parametric test
    if normal_dist:
        # Do the subgroups have equal variance?
        equal_var = equal_var_test(sample1, sample2, alpha = alpha)
        print(f'Samples have equal variances: {equal_var}')
        print(f'Using parametric test...')
        f, p = stats.ttest_ind(sample1, sample2, equal_var = equal_var, alternative = alternative)

    # otherwise use a non-parametric test
    else:
        print(f'Using non-parametric test...')
        f, p = stats.mannwhitneyu(sample1, sample2, alternative = alternative)

    evaluate_hypothesis(p, alpha)
    
def equal_var_test(*args, alpha: float = 0.05) -> bool:
    '''
    Given two or more subgroups from a dataset, conducts a test of equal variance and returns whether or
    not p is less than alpha.
    '''

    f, p = stats.levene(*args)
    return evaluate_hypothesis(p, alpha, output = False)   

H0: Whether a customer is held on contract is not associated with their likelihood of churning.
Ha: There is a direct relationship between being on contract versus paying month to month.

In [87]:
def chi2_test(data_for_category1, data_for_category2, alpha=.05):

    '''
    Given two subgroups from a dataset, conducts a chi-squared test for independence and outputs 
    the relevant information to the console. 
    Utilizes the method provided in the Codeup curriculum for conducting chi-squared test using
    scipy and pandas. 
    '''
    
    # create dataframe of observed values
    observed = pd.crosstab(data_for_category1, data_for_category2)
    
    # conduct test using scipy.stats.chi2_contingency() test
    chi2, p, degf, expected = stats.chi2_contingency(observed)
    
    # round the expected values
    expected = expected.round(1)
    
    # output
    print('Observed\n')
    print(observed.values)
    print('---\nExpected\n')
    print(expected)
    print('---\n')
    print(f'chi^2 = {chi2:.4f}')
    print(f'p     = {p:.4f}')
    
    # evaluate the hypothesis against the established alpha value
    evaluate_hypothesis(p, alpha)
    
def evaluate_hypothesis(p: float, alpha: float = 0.05, output: bool = True) -> bool:
    '''
    Compare the p value to the established alpha value to determine if the null hypothesis
    should be rejected or not.
    '''

    if p < alpha:
        if output:
            print('\nReject H0')
        return False
    else: 
        if output:
            print('\nFail to Reject H0')
        return True

In [88]:
chi2_test(train.churn, train.contract_type)

Observed

[[1238  726  927]
 [ 927   89   30]]
---
Expected

[[1589.8  598.5  702.7]
 [ 575.2  216.5  254.3]]
---

chi^2 = 664.6572
p     = 0.0000

Reject H0


In [12]:
chi2_test(train.churn, train.payment_type)

Observed

[[713 750 708 720]
 [149 145 581 171]]
---
Expected

[[633.  657.2 946.5 654.3]
 [229.  237.8 342.5 236.7]]
---

chi^2 = 338.4852
p     = 0.0000

Reject H0


In [13]:
chi2_test(train.churn, train.internet_service_type)

Observed

[[1101  987  803]
 [ 247  735   64]]
---
Expected

[[ 989.9 1264.5  636.7]
 [ 358.1  457.5  230.3]]
---

chi^2 = 439.7661
p     = 0.0000

Reject H0


H0: Customers that churn do not have less tenure on average than customers that don't churn. 
Ha: Customers that churn have less tenure on average than customers that don't churn.

In [21]:
churned_customers = train[train.churn == 'Yes']
current_customers = train[train.churn == 'No']

two_sample_ttest(churned_customers.tenure, current_customers.tenure, alternative = 'less')

Samples contain more than 30 observations: True
Samples have equal variances: False
Using parametric test...

Reject H0


In [27]:
chi2_test(train.churn, train.online_security)

Observed

[[1099  803  989]
 [ 819   64  163]]
---
Expected

[[1408.4  636.7  845.9]
 [ 509.6  230.3  306.1]]
---

chi^2 = 510.5206
p     = 0.0000

Reject H0


In [28]:
chi2_test(train.churn, train.tech_support)

Observed

[[1126  803  962]
 [ 805   64  177]]
---
Expected

[[1418.   636.7  836.4]
 [ 513.   230.3  302.6]]
---

chi^2 = 460.8719
p     = 0.0000

Reject H0


In [29]:
chi2_test(train.churn, train.online_backup)

Observed

[[1023  803 1065]
 [ 674   64  308]]
---
Expected

[[1246.1  636.7 1008.2]
 [ 450.9  230.3  364.8]]
---

chi^2 = 326.0154
p     = 0.0000

Reject H0


In [91]:
train.dtypes

gender                              object
senior_citizen                       int64
partner                             object
dependents                          object
tenure                               int64
phone_service                       object
multiple_lines                      object
online_security                     object
online_backup                       object
device_protection                   object
tech_support                        object
streaming_tv                        object
streaming_movies                    object
paperless_billing                   object
churn                               object
contract_type                       object
internet_service_type               object
payment_type                        object
gender_encoded                       int64
partner_encoded                      int64
dependents_encoded                   int64
phone_service_encoded                int64
paperless_billing_encoded            int64
churn_encod

In [100]:
# Modeling
#switching to churn encoded from this point on. 
features = [
    'monthly', 'two_year_contract', 'fiber_optic', 'electronic_check',
    'enc_tenure_months_1-6', 'enc_tenure_months_7-12', 'enc_tenure_months_13-18',
    'auto_credit_card', 'auto_bank_transfer', 'no_internet',
    'online_security_No', 'online_security_Yes', 'online_backup_No', 'online_backup_Yes',
    'device_protection_No', 'device_protection_Yes', 'tech_support_No', 'tech_support_Yes'   
]

X_train, y_train = train[features], train.churn
X_validate, y_validate = validate[features], validate.churn

In [101]:
# Decision Tree

# We'll try a max_depth of 5 so we can get good results without overfitting
model_1 = DecisionTreeClassifier(max_depth = 5, random_state = 123)
model_1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=123)

In [130]:
model_1_2 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 6, min_samples_leaf = 1, min_samples_split = 30)
model_1_2.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_split=30)

In [None]:
DecisionTreeClassifier()

In [131]:
y_pred_model_1_2 = model_1_2.predict(X_train)

In [102]:
# Let's see the predictions our model makes
y_pred_model_1 = model_1.predict(X_train)
pd.Series(y_pred_model_1).value_counts()

No     3272
Yes     665
dtype: int64

In [132]:
measure_model_performance(y_train, baseline, y_pred_model_1_2, positive_label = 'Yes')

Unnamed: 0_level_0,accuracy,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.734315,0.0,0.0
1,0.806198,0.653304,0.576482


In [104]:
def measure_model_performance(y_true: pd.core.series.Series, *y_pred, positive_label = 1, labels = None) -> pd.core.frame.DataFrame:
    '''
        Returns a dataframe containing the accuracy, precision,
        and recall scores for the model predictions provided.
        Parameters
        ----------
        y_true: Series
            A pandas series containing the true values for the
            target variable being predicted.
        *y_pred: Series or Array
            One or more pandas series or numpy arrays containing
            the predictions for the target variable.
        positive_label: int or string, default 1
            The positive value for the target variable.
        labels: list of strings, default None
            The labels to use as the name for each model. If not
            provided the default will be a numeric index starting
            from 0.
        Returns
        -------
        DataFrame: A pandas dataframe containing the accuracy,
            precision, and recall scores for each set of predictions
            provided in y_pred.
    '''

    scores = []
    
    for index, predictions in enumerate(y_pred):
        scores.append({
            'model' : index if not labels else labels[index],
            'accuracy' : accuracy_score(y_true, predictions),
            'precision' : precision_score(y_true, predictions, pos_label = positive_label, zero_division = 0),
            'recall' : recall_score(y_true, predictions, pos_label = positive_label, zero_division = 0)
        })
        
    df = pd.DataFrame(scores)
    return df.set_index('model')

In [107]:
measure_model_performance(y_train, baseline, y_pred_model_1, positive_label = 'Yes')

Unnamed: 0_level_0,accuracy,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.734315,0.0,0.0
1,0.79807,0.688722,0.437859


In [109]:
# Random Forest

# We'll try a max_depth of five
model_2 = RandomForestClassifier(max_depth = 5, random_state = 123)
model_2.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=123)

In [110]:
# Let's see the predictions our model makes
y_pred_model_2 = model_2.predict(X_train)
pd.Series(y_pred_model_2).value_counts()

No     3207
Yes     730
dtype: int64

In [111]:
# Now let's measure the performance of this model
measure_model_performance(y_train, baseline, y_pred_model_1, y_pred_model_2, positive_label = 'Yes')

Unnamed: 0_level_0,accuracy,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.734315,0.0,0.0
1,0.79807,0.688722,0.437859
2,0.800356,0.678082,0.473231


In [112]:
# K Nearest Neighbors

model_3 = KNeighborsClassifier(n_neighbors = 10, weights = 'uniform')
model_3.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [113]:
y_pred_model_3 = model_3.predict(X_train)
pd.Series(y_pred_model_3).value_counts()

No     3167
Yes     770
dtype: int64

In [114]:
measure_model_performance(
    y_train,
    baseline,
    y_pred_model_1,
    y_pred_model_2,
    y_pred_model_3,
    positive_label = 'Yes'
)

Unnamed: 0_level_0,accuracy,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.734315,0.0,0.0
1,0.79807,0.688722,0.437859
2,0.800356,0.678082,0.473231
3,0.800864,0.67013,0.493308


In [125]:
# measure_model_performance(
#     y_validate,
#     baseline_prediction(y_validate),
#     model_1.predict(X_validate),
#     model_2.predict(X_validate),
#     model_3.predict(X_validate),
#     positive_label = 'Yes'
# )

In [120]:
# y_pred = baseline_pred = pd.Series([train['churn'].mode()[0]]).repeat(len(train))

In [153]:
knn1 = KNeighborsClassifier(n_neighbors = 7)
#Fir training set to our classifying object
knn1.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [154]:
#assign predictions to variables
y_pred_knn1 = knn1.predict(X_train)

measure_model_performance(y_train, baseline, y_pred_knn1, positive_label = 'Yes')

Unnamed: 0_level_0,accuracy,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.734315,0.0,0.0
1,0.810262,0.671642,0.559273


In [155]:
knn1.score(X_train, y_train)

0.810261620523241

In [162]:
# from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')
# Fit the model to our X and y training sets
logreg.fit(X_train, y_train)

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


TypeError: '<' not supported between instances of 'str' and 'int'

In [158]:
# Decision Tree
model_1 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 6, min_samples_leaf = 1, min_samples_split = 30)
model_1.fit(X_train, y_train) 

In [None]:
#training and validate score, I want them to be better than the baseline and close to each other so I'm not overfitting
print(f'training score: {model_1.score(X_train, y_train):.2%}')
print(f'validate score: {model_1.score(X_validate, y_validate):.2%}')

In [None]:
y_pred_model_1 = y_pred = model_1.predict(X_train)

In [None]:
# Random Forest
model_2 = RandomForestClassifier(max_depth = 5, random_state = 123)
model_2.fit(X_train, y_train)

y_pred_model_2 = y_pred2 = model_2.predict(X_train)
pd.Series(y_pred_model_2).value_counts()

In [None]:
#training and validate score, I want them to be better than the baseline and close to each other so I'm not overfitting
print(f'training score: {model_2.score(X_train, y_train):.2%}')
print(f'validate score: {model_2.score(X_validate, y_validate):.2%}')

In [None]:
print(f'{model_metrics(X_train, y_train, y_pred2)}')
print(f'accuracy score is {round(accuracy_score(y_train, y_pred2),2)}')

In [None]:
# K-Nearest Neighbors
model_3 = KNeighborsClassifier(n_neighbors = 7, weights = 'uniform')
model_3.fit(X_train, y_train)

In [None]:
y_pred_model_3 = y_pred3 = model_3.predict(X_train)
pd.Series(y_pred_model_3).value_counts()

In [None]:
print(f'{model_metrics(X_train, y_train, y_pred3)}')
print(f'accuracy score is {round(accuracy_score(y_train, y_pred3),2)}')