# Churn Analysis

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("user_satisfaction_survey.csv")

In [3]:
data.head()

Unnamed: 0,ID,Classes_per_week,Happy_with_instructors,Happy_with_class_duration,Happy_with_class_timings,Happy_with_class_size,Happy_with_facilities,Happy_with_price,Churn
0,100,3,Yes,Yes,Yes,Yes,Yes,Yes,No
1,101,4,Yes,No,Yes,No,No,No,No
2,102,3,No,Yes,No,Yes,Yes,Yes,Yes
3,103,2,No,Yes,No,Yes,No,Yes,Yes
4,104,4,Yes,Yes,Yes,Yes,Yes,Yes,No


In [4]:
X = data.drop(['Churn','ID'], axis=1) #Dropping the ID column also - its not needed
y = data['Churn']

In [5]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()



OneHotEncoder will be used to turn the categorical (yes, no) to numerical

In [6]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

Column_transformer will be used to apply the OneHotEncoder to all the columns except Classes_per_week (that's already numerical)

In [34]:
from sklearn.compose import make_column_transformer

column_trans = make_column_transformer((OneHotEncoder(),['Happy_with_instructors', 'Happy_with_class_duration',\
                                                       'Happy_with_class_timings', 'Happy_with_class_size',\
                                                       'Happy_with_facilities', 'Happy_with_price']), \
                                       remainder='passthrough')

Uncomment code below to see the column_transformer applying the OneHotEncoder to data 

In [40]:
column_trans.fit_transform(X)


array([[0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 4.],
       [1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 3.],
       [1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 2.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 4.],
       [1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 3.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 2.],
       [0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 3.],
       [1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 4.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 2.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 3.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 4.],
       [0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 2.],
       [0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1.],
       [0., 1., 1., 0., 

Use pipeline to chain steps together. Applies 'column_transform' to data and passes it through decision tree classifier

In [9]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(column_trans, dtree)

Apply 5-fold cross validation to pipeline and getting the mean accuracy 

In [24]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.96

Taking a sample from our data to apply it to the model, since we know already the result!

In [25]:
X_new = X.sample(1)
X_new

Unnamed: 0,Classes_per_week,Happy_with_instructors,Happy_with_class_duration,Happy_with_class_timings,Happy_with_class_size,Happy_with_facilities,Happy_with_price
5,3,No,Yes,Yes,No,Yes,No


Train our pipleline with the data

In [26]:
pipe.fit(X,y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['Happy_with_instructors'...
                

Use our pipeline to make a prediction

In [27]:
pipe.predict(X_new)

array(['No'], dtype=object)

Use random samples and make preditions of it

In [48]:
x_new_random = X.sample(10, random_state=99) #first number indicates how many new samples
x_new_random

Unnamed: 0,Classes_per_week,Happy_with_instructors,Happy_with_class_duration,Happy_with_class_timings,Happy_with_class_size,Happy_with_facilities,Happy_with_price
17,2,Yes,Yes,Yes,Yes,Yes,Yes
14,2,Yes,Yes,Yes,Yes,Yes,Yes
2,3,No,Yes,No,Yes,Yes,Yes
10,2,Yes,Yes,Yes,Yes,Yes,Yes
6,1,Yes,Yes,Yes,Yes,Yes,No
11,3,Yes,Yes,Yes,Yes,Yes,Yes
15,1,Yes,Yes,Yes,Yes,No,Yes
0,3,Yes,Yes,Yes,Yes,Yes,Yes
18,3,No,No,No,Yes,Yes,Yes
13,4,Yes,Yes,Yes,Yes,Yes,Yes


In [49]:
pipe.predict(x_new_random)

array(['No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No'],
      dtype=object)

Add some data by hand to predict a 'No'

In [28]:
sample1 = [3,'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No']
sample1 = pd.DataFrame([sample1], columns = X.columns)
sample1

Unnamed: 0,Classes_per_week,Happy_with_instructors,Happy_with_class_duration,Happy_with_class_timings,Happy_with_class_size,Happy_with_facilities,Happy_with_price
0,3,Yes,Yes,Yes,Yes,No,No


In [29]:
pipe.predict(sample1)

array(['No'], dtype=object)

Add some more data to predict a 'Yes'

In [30]:
sample2 = [2,'No', 'Yes', 'No', 'No', 'Yes', 'No']
sample2 = pd.DataFrame([sample2], columns = X.columns)
sample2

Unnamed: 0,Classes_per_week,Happy_with_instructors,Happy_with_class_duration,Happy_with_class_timings,Happy_with_class_size,Happy_with_facilities,Happy_with_price
0,2,No,Yes,No,No,Yes,No


In [31]:
pipe.predict(sample2)

array(['Yes'], dtype=object)

Source: https://www.youtube.com/watch?v=irHhDMbw3xo