In [1]:
#imports used in this notebook and by its supporting modules
import pandas as pd
import numpy as np

#import model classifier
from sklearn.ensemble import RandomForestClassifier

#import my personal modules created for this report
import wrangle as w
import explore as e
import model as m

In [2]:
# Load the raw (not yet cleaned) Telco customer data through the project's
# wrangle helper. Per the original note, the data comes from the Codeup MySQL
# server; the helper itself is defined in wrangle.py, not in this notebook.
df = w.get_telco_data()

In [3]:
# Clean the raw frame in a single, non-mutating method chain:
#   1. drop exact duplicate rows
#   2. drop rows whose total_charges is the single-space placeholder ' '
#   3. cast total_charges to float and encode churn as 1 ('Yes') / 0 ('No')
#   4. drop the *_id lookup columns and the original text churn column
#
# Every step returns a new DataFrame, so no column is ever assigned onto a
# filtered slice (the pattern that triggers pandas' SettingWithCopyWarning)
# and the object returned by the loader is not modified in place.
#
# NOTE: the result is bound back to `df` because later cells read that name.
# Once this cell has run, `churn` no longer exists on `df`, so re-run the
# load cell before re-running this one.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame.total_charges != ' ']
    .assign(
        total_charges=lambda frame: frame.total_charges.astype(float),
        churn_encoded=lambda frame: frame.churn.map({'Yes': 1, 'No': 0}),
    )
    .drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'churn'])
)

In [4]:
# Split the cleaned frame into train / validate / test partitions using the
# project's wrangle helper. "churn_encoded" is passed as the second argument,
# presumably the column to stratify on; confirm in wrangle.py.
train, validate, test = w.train_validate_test_split(df, "churn_encoded")

In [5]:
# Preview the first rows of the training partition as a quick sanity check.
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,contract_type,internet_service_type,payment_type,churn_encoded
22,0027-KWYKW,Female,0,Yes,Yes,23,Yes,Yes,No,No,...,No,Yes,No,Yes,83.75,1849.95,Month-to-month,Fiber optic,Electronic check,0
4999,7025-WCBNE,Male,1,No,No,47,Yes,Yes,No,Yes,...,Yes,No,No,No,59.6,2754.0,Two year,DSL,Bank transfer (automatic),0
2563,3665-JATSN,Female,0,No,No,19,No,No phone service,No,Yes,...,No,No,Yes,Yes,39.7,710.05,Month-to-month,DSL,Electronic check,0
5501,7746-AWNQW,Female,0,No,No,28,No,No phone service,No,No,...,Yes,No,No,Yes,35.75,961.4,Month-to-month,DSL,Mailed check,0
4344,6128-AQBMT,Male,1,Yes,No,41,No,No phone service,No,Yes,...,No,Yes,Yes,Yes,53.95,2215.4,Month-to-month,DSL,Electronic check,0


In [6]:
# Keep the test partition's customer ids aside so they can be attached to the
# model outputs when the predictions file is written.
customer_id = test['customer_id']

In [7]:
# Columns handed to the modeling prep step. 'churn_encoded' is also the name
# passed separately to the split/prep helpers (presumably as the target); the
# remaining entries are the selected drivers.
drivers = [
    'churn_encoded',
    'tenure',
    'monthly_charges',
    'contract_type',
    'tech_support',
]

In [8]:
# Build the model inputs for each partition via the project's model helper.
# It receives the three partitions, the name "churn_encoded" and the `drivers`
# list, and returns six objects unpacked here as X/y pairs per partition.
# NOTE(review): presumably it restricts each frame to `drivers`, encodes the
# categorical columns and separates the target; confirm in model.py.
X_train, y_train, X_validate, y_validate, X_test, y_test = m.prep_for_model(train, validate, test, "churn_encoded", drivers)

In [9]:
# Random forest settings: trees capped at depth 4, with a fixed seed so the
# fit is repeatable from run to run.
rf_params = {'max_depth': 4, 'random_state': 27}
rf = RandomForestClassifier(**rf_params)

# Train on the training partition. As the cell's last expression, this also
# displays the fitted estimator.
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, random_state=27)

In [10]:
# Predict a class label for every row of the test partition with the fitted
# random forest.
predictions = rf.predict(X_test)

In [11]:
# Predicted class probabilities for the test partition, keeping only column 1
# of predict_proba's output (columns follow the order of rf.classes_).
# NOTE(review): with the Yes -> 1 / No -> 0 encoding applied to churn earlier
# in this notebook, column 1 is presumably the churn class; confirm that
# rf.classes_ is [0, 1].
probabilities = rf.predict_proba(X_test)[:,1]

In [12]:
# Display both arrays (as a tuple) to eyeball the model outputs before export.
probabilities, predictions

(array([0.54563406, 0.35325739, 0.31711266, ..., 0.6261496 , 0.32259533,
        0.01307474]),
 array([1, 0, 0, ..., 1, 0, 0]))

In [13]:
# Assemble the deliverable: one row per test customer with the customer id,
# the column-1 probability from the model and the predicted label.
submission_columns = {
    'customer_id': customer_id,
    'probability_of_churn': probabilities,
    'predictions_of_churn': predictions,
}
final_csv = pd.DataFrame(submission_columns)

# Write to disk without the row index so the file holds only the three columns.
final_csv.to_csv('predictions.csv', index=False)