<a href="https://colab.research.google.com/github/palanipsb/Python-Learning/blob/main/Classifier_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

In [5]:
churndf = pd.read_csv('/content/Churn_Modelling.csv')
churndf.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [6]:
churndf = churndf.dropna()

In [7]:
churndf = pd.get_dummies(churndf,drop_first=True,dtype=int)

In [9]:
x = churndf.drop(columns='Exited')
y = churndf['Exited']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)

In [11]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

In [12]:
from sklearn.model_selection import cross_val_score
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 80.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 76.0 % accuracy score
Classifiers:  SVC Has a training score of 80.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 82.0 % accuracy score


In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
grid_log_reg.fit(X_train, y_train)
log_reg = grid_log_reg.best_estimator_
print(log_reg)

In [16]:
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
knears_neighbors = grid_knears.best_estimator_
print(knears_neighbors)

KNeighborsClassifier(n_neighbors=4)


In [17]:
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
svc = grid_svc.best_estimator_
print(svc)

SVC(C=0.5)


In [19]:
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)),
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)
tree_clf = grid_tree.best_estimator_
print(tree_clf)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)


In [20]:
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
print('Logistic Regression Cross Validation Score: ', round(log_reg_score.mean() * 100, 2).astype(str) + '%')
knears_score = cross_val_score(knears_neighbors, X_train, y_train, cv=5)
print('Knears Neighbors Cross Validation Score', round(knears_score.mean() * 100, 2).astype(str) + '%')
svc_score = cross_val_score(svc, X_train, y_train, cv=5)
print('Support Vector Classifier Cross Validation Score', round(svc_score.mean() * 100, 2).astype(str) + '%')
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

Logistic Regression Cross Validation Score:  79.8%
Knears Neighbors Cross Validation Score 78.22%
Support Vector Classifier Cross Validation Score 79.8%
DecisionTree Classifier Cross Validation Score 84.0%


In [21]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
                             method="decision_function")
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5)
svc_pred = cross_val_predict(svc, X_train, y_train, cv=5,
                             method="decision_function")
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5)

In [22]:
print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
print('KNears Neighbors: ', roc_auc_score(y_train, knears_pred))
print('Support Vector Classifier: ', roc_auc_score(y_train, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train, tree_pred))

Logistic Regression:  0.5781477242431764
KNears Neighbors:  0.5032563562762961
Support Vector Classifier:  0.5564417723109358
Decision Tree Classifier:  0.6273748126901826


In [45]:
print(log_reg)
print(knears_neighbors)
print(svc)
print(tree_clf)

LogisticRegression(C=0.001)
KNeighborsClassifier(n_neighbors=4)
SVC(C=0.5)
DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)


In [60]:
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,precision_score,recall_score
output = pd.DataFrame(columns = ['Model','Accuracy','Precision','Recall','F1 Score'])
clf_lst= [log_reg,knears_neighbors,svc,tree_clf]
for i in clf_lst:
  y_pred = i.predict(X_test)
  acc = accuracy_score(y_test,y_pred)
  prec = precision_score(y_test,y_pred)
  rec = recall_score(y_test,y_pred)
  f1 = f1_score(y_test,y_pred)
  output1 = pd.DataFrame([[type(i).__name__,acc,prec,rec,f1]],
                      columns = ['Model','Accuracy','Precision','Recall','F1 Score'])
  output = pd.concat([output,output1],ignore_index=True)

print(output)

                    Model  Accuracy  Precision    Recall  F1 Score
0      LogisticRegression     0.780   0.000000  0.000000  0.000000
1    KNeighborsClassifier     0.771   0.354839  0.050000  0.087649
2                     SVC     0.780   0.000000  0.000000  0.000000
3  DecisionTreeClassifier     0.834   0.950000  0.259091  0.407143
