In [14]:
from sklearn.model_selection import train_test_split #train, test, split
from sklearn.impute import SimpleImputer # impute
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Acquire process:

In [15]:
#import needed libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from pydataset import data


In [3]:
# acquire
from env import host, user, password
import acquire

In [4]:
# Create helper function to get the necessary connection url.

def get_connection(db_name):
    '''
    Build the connection url string for a database on the Codeup db server.

    The host, user and password are imported from the local env file each
    time the function is called; db_name is appended as the database name.
    Returns the url as a str.
    '''
    from env import host, user, password

    credentials = f'{user}:{password}'
    location = f'{host}/{db_name}'
    return 'mysql+pymysql://' + credentials + '@' + location


In [5]:
#acquire function
def get_telco_churn_data():
    '''
    Read the telco churn customer data from the Codeup db.

    Every customers row is joined to its contract type, payment type and
    internet service type. Because the query selects all columns, the join
    key columns appear once per joined table in the result.
    Returns a pandas DataFrame with all columns.
    '''
    query = '''
    SELECT *
    FROM customers
    JOIN contract_types ON customers.contract_type_id = contract_types.contract_type_id
    JOIN payment_types ON customers.payment_type_id = payment_types.payment_type_id
    JOIN internet_service_types ON customers.internet_service_type_id = internet_service_types.internet_service_type_id
    '''
    connection_url = get_connection('telco_churn')
    return pd.read_sql(query, connection_url)

_________

## Prepare

In [6]:
#import prepare functions
import prepare

In [7]:
def prep_telco_churn(df):
    '''
    Clean the telco_churn df acquired by get_telco_churn_data.

    The work is done on a copy, so the frame passed in is left unchanged:
    - drop the repeated columns produced by the SQL joins
    - convert total_charges to float (blank entries become 0.0)
    - encode churn as 0 ('No') / 1 ('Yes'); other values are kept as they are
    - drop duplicate rows
    - append drop-first dummy columns for internet_service_type_id and
      contract_type_id (the original id columns are kept)

    Returns the prepared telco_churn df.
    '''
    # drop duplicate columns from join; copy so the assignments below never
    # write into a slice of the caller's frame
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # change data types
    total_charges = df['total_charges']
    if not pd.api.types.is_numeric_dtype(total_charges):
        # the column holds text; blank entries are treated as 0
        total_charges = total_charges.astype(str).str.strip().replace('', '0')
    df['total_charges'] = total_charges.astype(float)

    # the churn column lives directly on df (there is no `telco` attribute)
    churn_codes = {'No': 0, 'Yes': 1}
    df['churn'] = df['churn'].map(lambda value: churn_codes.get(value, value))

    # drop duplicates
    df = df.drop_duplicates()

    # create dummies; `columns=` is required because get_dummies only encodes
    # object/string/category columns by default and these ids are integers
    dummy_cols = ['internet_service_type_id', 'contract_type_id']
    dummy_df = pd.get_dummies(df[dummy_cols], columns=dummy_cols, dummy_na=False, drop_first=True)

    # concat dummies with original df
    df = pd.concat([df, dummy_df], axis=1)

    return df

In [8]:
#the following cells will be included in the prepare.py

In [9]:
#get rid of nulls - there are currently 11
#telco.is_null.sum()
#telco = telco.replace(' ', np.nan)
#telco.isnull().any()
#telco.dropna(inplace=True)

In [10]:
# total_charges is stored as an object column: swap each space for '0' and
# cast the result to float (this step belongs in PREPARE)
telco['total_charges'] = telco['total_charges'].str.replace(' ', '0').astype(float)

In [11]:
## convert churn from 'No'/'Yes' to 0/1
# replace(..., inplace=True) returns None, so the converted frame is taken
# from a regular (non-inplace) call and then bound back to telco.
# Running this cell more than once is harmless.
telco_change = telco.replace({'churn': {'No': 0, 'Yes': 1}})
telco = telco_change

NameError: name 'telco' is not defined

In [None]:
# remove the customer_id column, which is not needed
telco = telco.drop(columns=['customer_id'])

In [None]:
# preview the first rows of telco
telco.head()

_________________

## Explore the data

In [None]:
# read the telco churn data from the database and bind it to `telco`
telco = get_telco_churn_data()

In [None]:
# look at the first rows of the data
telco.head()

In [None]:
# summarize the columns, their dtypes and their non-null counts
# - this data appears to have no nulls
# - data has int and object data types
telco.info()

### data summary:
- 18 object data types
- 8 integer data types
- 1 float data type
- 0 null values


In [None]:
# descriptive statistics for telco
telco.describe()

_______

### get curious about the data

In [None]:
# churn is what we are trying to determine, so count customers per churn value
telco['churn'].value_counts()

#this shows 26.54% of customers churn (1869 out of 7043)

In [None]:
# find out how many customers are senior citizens and how many are not
telco['senior_citizen'].value_counts()

#this shows that 16.21% of all customers are seniors (1142 out of 7043)

In [None]:
# tenure: (average, maximum, minimum)
telco['tenure'].mean(), telco['tenure'].max(), telco['tenure'].min()

In [None]:
# monthly charges: (average, maximum, minimum)
telco['monthly_charges'].mean(), telco['monthly_charges'].max(), telco['monthly_charges'].min()

In [None]:
#group by churn and senior_citizen 
#senior_churn= telco.groupby(['churn','senior_citizen'])

_________

In [None]:
# value counts for churn encoded as int (1 = 'Yes', 0 = anything else) rather than object
telco['churn'].eq('Yes').astype(int).value_counts()

In [None]:
# preview the first rows of telco again
telco.head()

In [None]:
# NOTE(review): repeats the preview shown by the previous cell -- likely redundant
telco.head()

In [None]:
# show all column names, to pick the ones used in the value-count loop below
telco.columns

In [None]:
# columns to summarize in the value-count loop.
# Each name is listed once: contract_type_id, payment_type_id and
# internet_service_type_id were previously repeated, which made the loop
# print their counts twice.
columns = ['gender', 'senior_citizen', 'partner', 'dependents', 'tenure',
       'phone_service', 'multiple_lines', 'internet_service_type_id',
       'online_security', 'online_backup', 'device_protection', 'tech_support',
       'streaming_tv', 'streaming_movies', 'contract_type_id',
       'paperless_billing', 'payment_type_id', 'monthly_charges',
       'total_charges', 'churn', 'contract_type', 'payment_type',
       'internet_service_type']

In [None]:
# for every selected column, print its name, the raw value counts,
# a divider, the normalized value counts (shares), and a two-line separator
for col in columns:
    print(col, telco[col].value_counts(), sep='\n')
    print('----------------------------------------')
    print(telco[col].value_counts(normalize=True))
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~',
          '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~',
          sep='\n')

### Data Prep

In [None]:
#find correlation- I think age correlates the most to churn
# Only numeric columns can be correlated. They are selected explicitly because
# DataFrame.corr() raises on text columns in pandas >= 2.0 instead of skipping them.
telco_correlation = telco.select_dtypes(include='number').corr()
telco_correlation
#this shows that senior_citizen and monthly_charges have the highest pos corr with churn

In [None]:
# keep only the churn column of the correlation table:
# the correlation of every column with churn
telco_corr_churn = telco_correlation.loc[:, 'churn']
telco_corr_churn

## again, senior_citizen is the highest
## THIS will indicate my DRIVER of churn!!

In [None]:
# bars are grouped by churn on the x axis (1 = churned, 0 = did not churn)
# and colored by senior_citizen (1 = senior, 0 = non-senior)
sns.countplot(data=telco, x='churn', hue='senior_citizen')

In [None]:
#heatmap to show correlation of all numeric data
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# text columns in pandas >= 2.0 instead of skipping them.
plt.figure(figsize=(16,9))

sns.heatmap(telco.select_dtypes(include='number').corr(), cmap='YlGnBu', center=0, annot=True)

plt.title('Correlation of Telco Data')

plt.show()

_______

## Modeling/ Testing

### Find the appropriate model to use
- churn (categorical) and senior_citizen (categorical)
- 2 discrete variables
- use chi2 testing

In [None]:
# contingency table of observed counts: churn (rows) by senior_citizen (columns)
observed = pd.crosstab(index=telco['churn'], columns=telco['senior_citizen'])
observed

## Hypothesis:

- **$H_{0}$**: there is no relationship between churn and customer age
<br>

- **$H_{a}$**: there is a relationship between churn and customer age
<br>

- **True Positive**: Predict there is a relationship and there is a relationship
<br>

- **True Negative**: Predict there is no relationship and there is no relationship
<br>

- **False Positive**: Predict there is a relationship and there is no relationship
<br>

- **False Negative**: Predict there is no relationship and there is a relationship

In [None]:
# set alpha, the significance level: the null hypothesis is rejected when p < alpha
alpha = 0.05

In [None]:
# chi2_contingency returns 4 different values, unpacked here as the chi^2
# statistic, the p-value, the degrees of freedom and the expected frequencies
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

In [None]:
## readable report: observed counts, expected counts (cast to int), then chi^2 and p
print('Observed\n', observed.values, sep='\n')
print('---------------------\nExpected\n', expected.astype(int), sep='\n')
print('---------------------\n')
print(f'chi^2 = {chi2:.4f}', f'p     = {p:.4f}', sep='\n')

In [None]:
# compare the p-value with alpha and state the outcome of the test
print('We reject the null hypothesis' if p < alpha else 'We fail to reject the null hypothesis')

### We reject $H_0$. Therefore, we believe that there is a relationship between churn and customer age

_________________________________

### Train Split

In [None]:
import prepare
import acquire

In [13]:
# hold out 20% of telco as test, then 30% of the remainder as validate
train, test = train_test_split(telco, random_state=123, test_size=0.2)
train, validate = train_test_split(train, random_state=123, test_size=0.3)

NameError: name 'telco' is not defined

In [None]:
## Explore and graph with JUST train set

In [None]:
## crosstab, confusion matrix

In [None]:
# cross-tabulate churn against the values of the `model5` column in train
# NOTE(review): no visible cell creates `train.model5` -- confirm it exists before running
pd.crosstab(train.churn, train.model5)

In [None]:
# make baseline

In [None]:
# X_col: feature columns; y_col: target column -- used to build the X/y sets
X_col= ['senior_citizen','tenure', 'internet_service_type_id', 'contract_type_id']
y_col= 'churn'

In [None]:
# split each data set into its feature frame (X_*) and its target series (y_*)
X_train, y_train = train[X_col], train[y_col]
X_validate, y_validate = validate[X_col], validate[y_col]
X_test, y_test = test[X_col], test[y_col]