In [92]:
import pandas as pd
import numpy as np

import zgulde.extend_pandas

from pydataset import data

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [49]:
# Load the 'swiss' dataset through pydataset's data() helper.
df = data("swiss")

In [50]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [51]:
# Summary statistics of the Catholic column.
df['Catholic'].describe()

count     47.00000
mean      41.14383
std       41.70485
min        2.15000
25%        5.19500
50%       15.14000
75%       93.12500
max      100.00000
Name: Catholic, dtype: float64

In [52]:
# Derive a two-class text label from the numeric Catholic column.
# Rows strictly above the cutoff become 'Catholic'; rows at or below it become
# 'Not Catholic'. Using `<=` for the second mask ensures a row whose value is
# exactly equal to the cutoff is labelled rather than left as NaN.
CATHOLIC_CUTOFF = 40
df.loc[df['Catholic'] > CATHOLIC_CUTOFF, 'is_catholic'] = 'Catholic'
df.loc[df['Catholic'] <= CATHOLIC_CUTOFF, 'is_catholic'] = 'Not Catholic'


In [54]:
# Remove the numeric Catholic column now that is_catholic has been derived
# from it. Rebinding (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-missing column.
df = df.drop(columns='Catholic', errors='ignore')

In [56]:
# Preview the frame again after the label column was added and Catholic dropped.
df.head(n=5)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,22.2,Not Catholic
Delemont,83.1,45.1,6,9,22.2,Catholic
Franches-Mnt,92.5,39.7,5,5,20.2,Catholic
Moutier,85.8,36.5,12,7,20.3,Not Catholic
Neuveville,76.9,43.5,17,15,20.6,Not Catholic


In [142]:
# Decision tree inputs: two predictor columns and the is_catholic target.
X = df[['Education', 'Fertility']]
y = df[['is_catholic']]

# 70/30 train/test split, seeded and stratified on the target label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
    stratify=df['is_catholic'],
)

X_train.head()

Unnamed: 0,Education,Fertility
Moutier,7,85.8
La Vallee,20,54.3
Yverdon,8,65.4
Neuveville,15,76.9
La Chauxdfnd,11,65.7


In [143]:
# Entropy-criterion tree limited to depth 3, with a fixed seed.
clf = DecisionTreeClassifier(
    max_depth=3,
    criterion='entropy',
    random_state=123,
)

In [144]:
# Fit the tree on the training split; the fitted estimator is the cell output.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [145]:
# Predicted labels for the training rows; show the first five.
y_pred = clf.predict(X=X_train)
y_pred[:5]

array(['Not Catholic', 'Not Catholic', 'Not Catholic', 'Not Catholic',
       'Not Catholic'], dtype=object)

In [146]:
# Per-class probabilities from the tree for every training row.
y_pred_proba = clf.predict_proba(X=X_train)
y_pred_proba

array([[0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.44444444, 0.55555556],
       [1.        , 0.        ],
       [0.44444444, 0.55555556],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.44444444, 0.55555556],
       [0.        , 1.        ],
       [0.44444444, 0.55555556],
       [0.44444444, 0.55555556],
       [0.44444444, 0.55555556],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.44444444, 0.55555556],
       [1.        , 0.        ],
       [0.44444444, 0.55555556],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.44444444, 0.55555556],
       [1.

In [147]:
# Score the tree on the same rows it was trained on and report it to 2 decimals.
tree_train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(tree_train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.88


In [148]:
# Confusion matrix of training labels against the tree's predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[ 9,  4],
       [ 0, 19]])

In [149]:
# Precision / recall / f1 per class for the tree on the training split.
tree_report = classification_report(y_true=y_train, y_pred=y_pred)
print(tree_report)

              precision    recall  f1-score   support

    Catholic       1.00      0.69      0.82        13
Not Catholic       0.83      1.00      0.90        19

   micro avg       0.88      0.88      0.88        32
   macro avg       0.91      0.85      0.86        32
weighted avg       0.90      0.88      0.87        32



In [124]:
# Logistic regression inputs: two predictor columns and the is_catholic target.
X = df[['Agriculture', 'Examination']]
# Select the target as a 1-D Series (single brackets). A one-column DataFrame
# is a column vector, which makes LogisticRegression.fit emit a
# DataConversionWarning (hidden by the notebook's global warnings filter).
y = df['is_catholic']

# 70/30 train/test split, seeded and stratified on the target label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.30, random_state=123, stratify=y)

X_train.head()

Unnamed: 0,Agriculture,Examination
Moutier,36.5,12
La Vallee,15.2,31
Yverdon,49.5,15
Neuveville,43.5,17
La Chauxdfnd,7.7,29


In [125]:
# Logistic regression with library defaults apart from a fixed seed.
logit = LogisticRegression(
    random_state=123,
)

In [126]:
# Fit on the training split; the fitted estimator is the cell output.
logit.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [127]:
# Show the fitted coefficients and the intercept.
coefficients = logit.coef_
intercept = logit.intercept_
print('Coefficient: \n', coefficients)
print('Intercept: \n', intercept)

Coefficient: 
 [[-0.01903912  0.10283509]]
Intercept: 
 [-0.17445896]


In [128]:
# Predicted labels for the training rows (overwrites the earlier y_pred).
y_pred = logit.predict(X=X_train)

In [129]:
# Per-class probabilities from the logistic model for the training rows.
y_pred_proba = logit.predict_proba(X=X_train)

In [130]:
# Score the logistic model on its training rows and report it to 2 decimals.
logit_train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit_train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.84


In [122]:
# Confusion matrix of training labels against the logistic model's predictions.
logit_confusion = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(logit_confusion)

[[ 9  4]
 [ 1 18]]


In [123]:
# Precision / recall / f1 per class for the logistic model on the training split.
logit_report = classification_report(y_true=y_train, y_pred=y_pred)
print(logit_report)

              precision    recall  f1-score   support

    Catholic       0.90      0.69      0.78        13
Not Catholic       0.82      0.95      0.88        19

   micro avg       0.84      0.84      0.84        32
   macro avg       0.86      0.82      0.83        32
weighted avg       0.85      0.84      0.84        32



In [137]:
# KNN inputs: two predictor columns and the is_catholic target.
X = df[['Fertility', 'Infant.Mortality']]
# Select the target as a 1-D Series (single brackets). A one-column DataFrame
# is a column vector, which makes KNeighborsClassifier.fit emit a
# DataConversionWarning (hidden by the notebook's global warnings filter).
y = df['is_catholic']

# 70/30 train/test split, seeded and stratified on the target label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.30, random_state=123, stratify=y)

X_train.head()

Unnamed: 0,Fertility,Infant.Mortality
Moutier,85.8,20.3
La Vallee,54.3,10.8
Yverdon,65.4,22.5
Neuveville,76.9,20.6
La Chauxdfnd,65.7,20.5


In [138]:
# 5-nearest-neighbours classifier with uniformly weighted votes.
knn = KNeighborsClassifier(
    weights='uniform',
    n_neighbors=5,
)

In [139]:
# Fit on the training split; the fitted estimator is the cell output.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [140]:
# Predicted labels and per-class probabilities for the training rows
# (both names overwrite the values left by the previous model).
y_pred = knn.predict(X=X_train)
y_pred_proba = knn.predict_proba(X=X_train)

In [141]:
# Score the KNN model on its training rows and report it to 2 decimals.
knn_train_accuracy = knn.score(X_train, y_train)
print('Accuracy of KNN classifier on training set: {:.2f}'.format(knn_train_accuracy))

Accuracy of KNN classifier on training set: 0.78


In [98]:
# Confusion matrix of training labels against the KNN predictions.
# NOTE(review): the saved output under this cell (row totals 12 and 20,
# 24 of 32 correct) does not agree with the 0.78 accuracy printed by the
# preceding cell, and the cell's execution count is older -- the output looks
# stale; re-run to refresh it.
knn_confusion = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(knn_confusion)

[[ 7  5]
 [ 3 17]]


In [99]:
# Precision / recall / f1 per class for the KNN model on the training split.
# NOTE(review): the saved report under this cell shows support 12/20 and 0.75
# accuracy, which differs from the 0.78 accuracy printed earlier for KNN --
# the output appears to come from an older run; re-run to refresh it.
knn_report = classification_report(y_true=y_train, y_pred=y_pred)
print(knn_report)

              precision    recall  f1-score   support

    Catholic       0.70      0.58      0.64        12
Not Catholic       0.77      0.85      0.81        20

   micro avg       0.75      0.75      0.75        32
   macro avg       0.74      0.72      0.72        32
weighted avg       0.75      0.75      0.74        32



In [150]:
# Evaluate the decision tree on ITS OWN hold-out rows.
# By this point in the notebook X_test / y_test have been overwritten by the
# later model cells (last written by the KNN split on 'Fertility' and
# 'Infant.Mortality'), while clf was trained on 'Education' and 'Fertility'.
# Rebuild the tree's split with the same features, seed, test size and
# stratification so the score is computed on matching columns no matter
# which cells ran last.
X_tree = df[['Education', 'Fertility']]
y_tree = df[['is_catholic']]
_, X_test_tree, _, y_test_tree = train_test_split(
    X_tree, y_tree, test_size=.30, random_state=123, stratify=df['is_catholic'])

print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test_tree, y_test_tree)))

Accuracy of Decision Tree classifier on test set: 0.73


In [153]:
# NOTE(review): load_iris and Graph are not used in this cell; the imports are
# kept in case other cells rely on them.
from sklearn.datasets import load_iris

import graphviz

from graphviz import Graph

# Export the fitted decision tree as DOT source. Passing feature_names and
# class_names labels each node with the real column / class names instead of
# the positional X[0], X[1] placeholders. The feature list must match the
# columns clf was trained on; clf.classes_ is already in the sorted order
# export_graphviz expects.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['Education', 'Fertility'],
    class_names=[str(label) for label in clf.classes_],
    rounded=True,
)
graph = graphviz.Source(dot_data)

# Render to 'swiss_decision_tree.pdf', open it in the default viewer, and
# remove the intermediate DOT file afterwards.
graph.render('swiss_decision_tree', view=True, cleanup=True)

'swiss_decision_tree.pdf'