In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv('/content/drive/MyDrive/7_semestr/telecom_churn.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [4]:
data.drop(['state', 'voice mail plan', 'phone number'], axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,account length,area code,international plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,128,415,no,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,no,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,yes,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,yes,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
data['international plan'] = data['international plan'].map( lambda x: 0 if x == 'no' else 1 )

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   account length          3333 non-null   int64  
 1   area code               3333 non-null   int64  
 2   international plan      3333 non-null   int64  
 3   number vmail messages   3333 non-null   int64  
 4   total day minutes       3333 non-null   float64
 5   total day calls         3333 non-null   int64  
 6   total day charge        3333 non-null   float64
 7   total eve minutes       3333 non-null   float64
 8   total eve calls         3333 non-null   int64  
 9   total eve charge        3333 non-null   float64
 10  total night minutes     3333 non-null   float64
 11  total night calls       3333 non-null   int64  
 12  total night charge      3333 non-null   float64
 13  total intl minutes      3333 non-null   float64
 14  total intl calls        3333 non-null   

In [8]:
y = data['churn'].astype('int')

In [9]:
x = data.drop('churn', axis=1)

In [10]:
x.shape, y.shape

((3333, 17), (3333,))

In [11]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import cross_val_score

In [12]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.3, random_state=47)

In [13]:
x_train.shape, x_valid.shape

((2333, 17), (1000, 17))

In [14]:
decisionTreeClassifier = DecisionTreeClassifier(criterion='entropy', random_state=47)

In [15]:
np.mean(cross_val_score(decisionTreeClassifier, x_train, y_train, cv=10))

0.9181431348813323

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
knn = KNeighborsClassifier()

In [18]:
np.mean(cross_val_score(knn, x_train, y_train, cv=10))

0.8667198562048348

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
tree_params = {'max_depth': np.arange(1, 11), 'max_features': [0.7, 0.1, 1]}

In [21]:
gridTree = GridSearchCV(decisionTreeClassifier, tree_params, cv=10, n_jobs=-1)

In [22]:
%%time
gridTree.fit(x_train, y_train)

CPU times: user 532 ms, sys: 87.3 ms, total: 619 ms
Wall time: 11.1 s


In [23]:
gridTree.best_score_, gridTree.best_params_

(0.9421426213271706, {'max_depth': 8, 'max_features': 0.7})

In [24]:
np.arange?

In [25]:
knn_params = {'n_neighbors': list(range(1, 7)) + list(range(50, 100, 5)) }

In [26]:
gridKnn = GridSearchCV(knn, knn_params, cv=10, n_jobs=-1)

In [27]:
%%time
gridKnn.fit(x_train, y_train)

CPU times: user 346 ms, sys: 36.7 ms, total: 382 ms
Wall time: 7.02 s


In [28]:
gridKnn.best_score_, gridKnn.best_params_

(0.8722790800044018, {'n_neighbors': 6})

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
gridTree.predict(x_valid)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [31]:
gridTree.score(x_valid, y_valid)

0.941

In [32]:
accuracy_score(y_valid, gridTree.predict(x_valid))

0.941

In [34]:
1 - np.mean(y)

0.8550855085508551

In [36]:
from sklearn.tree import export_graphviz

In [38]:
export_graphviz(gridTree.best_estimator_, out_file='telecom_tree', feature_names=x.columns, filled=True)

In [39]:
!ls -l


total 20
drwx------ 5 root root  4096 Jan 23 13:12 drive
drwxr-xr-x 1 root root  4096 Dec 11 14:25 sample_data
-rw-r--r-- 1 root root 11866 Jan 23 14:03 telecom_tree


In [41]:
!dot -Tpng telecom_tree -o telecom_tree.png

<img src='telecom_tree.png'>
