# Import Libraries

In [None]:
# Preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Model Validation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection  import KFold

# Load Data

In [None]:
# Load the bank customer spreadsheet into a dataframe.
# NOTE(review): the path is absolute and machine-specific (a Windows user
# folder), so this cell only runs where that file exists.
data =pd.read_excel('C:\\Users\\Acer\\OneDrive\\Desktop\\Bank\\yes_bank.xlsx')

In [None]:
# Shape of dataset: (number of rows, number of columns)
data.shape

In [None]:
# Column labels together with the number of rows
data.columns,len(data)

In [None]:
#### Description about columns ####

# RowNumber       - Row number in dataset, 
# CustomerId      - CustomerId given by bank, 
# Surname         - Name of account holder,
# CreditScore     - Credit score given by the bank based on the usage of credit card and other factors,
# Geography       - Main branch of the account,
# Gender          - Gender of the customer,
# Age             - Age of the customer,
# Tenure          - How long the person has been a customer of the bank,
# Balance         - Balance in the a/c,
# NumOfProducts   - No: of products the customer opted,
# HasCrCard       - Whether the customer is having yes bank's credit card or not,
# IsActiveMember  - Has made any transaction in last month,
# EstimatedSalary - Salary of the customer,
# Exited          - Whether the person is still a customer of yes bank or not

In [None]:
# Head of Dataset (Top 5 Rows)
data.head()

# Delete Unwanted columns from dataframe

In [None]:
# Deleting unwanted columns which are not correlated to target variable
# Delete Row Number, CustomerId & Surname
# The slice is positional: it keeps the columns at positions 3..13, so it
# relies on the column order of the loaded spreadsheet.
data=data.iloc[:,3:14]
print(data.columns)
print(data.head(5))

### 1. Outlier Treatment

In [None]:
# Checking datatypes to identify continuous variables
print(data.dtypes)

In [None]:
# Separate the continuous variables into their own dataframe; the outlier
# plots and the outlier treatment below operate on this subset.
columns=['CreditScore','Age', 'Balance', 'EstimatedSalary']
df=data.loc[:, columns]

In [None]:
# Function to Plot (boxplot & histogram)
def box_hist_fun(datacolumn):
    """Draw a box plot above a histogram of one column on a shared x-axis.

    Parameters
    ----------
    datacolumn
        Column of values to plot. It must support ``.dropna()`` (for example
        a pandas Series); missing values are dropped for the histogram.
    """
    # Two stacked axes that share x, so the box lines up with the histogram.
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True)
    # Pass the column as ``x`` explicitly: recent seaborn releases treat the
    # first positional argument as ``data``, which draws the box vertically
    # and breaks the alignment with the shared x-axis.
    sns.boxplot(x=datacolumn, ax=ax_box)
    # NOTE(review): distplot is deprecated in newer seaborn releases; it is
    # kept here so the histogram binning stays unchanged.
    sns.distplot(datacolumn.dropna(), ax=ax_hist, kde=False)
    # Remove x axis name for the boxplot (the histogram below carries it).
    ax_box.set(xlabel='')

In [None]:
# Function to plot graph for all variables    
def plot_graph(df):
    """Draw the box-plot/histogram pair for every column of ``df``.

    Parameters
    ----------
    df
        Dataframe whose columns are each handed to ``box_hist_fun``.
    """
    # Iterate the columns directly. The earlier index guard compared the
    # column position with the number of rows, which could only ever skip
    # columns by accident.
    for column_name in df.columns:
        box_hist_fun(df[column_name])

In [None]:
# Plot graph to identify outliers
plot_graph(df)

In [None]:
# Outlier treatment for 'Age': ages above 57 are replaced by the median age
# of the continuous-variable frame, in both dataframes.
agemedian = df['Age'].median(skipna=True)
for frame in (df, data):
    frame['Age'] = np.where(frame['Age'] > 57, agemedian, frame['Age'])

In [None]:
# Outlier treatment for 'CreditScore': discard rows scoring below 383 and
# renumber the remaining rows from zero, for both dataframes.
low_score_rows = df.index[df['CreditScore'] < 383]
df = df.drop(index=low_score_rows).reset_index(drop=True)
low_score_rows = data.index[data['CreditScore'] < 383]
data = data.drop(index=low_score_rows).reset_index(drop=True)

In [None]:
# Convert Gender (Categorical) into a numeric list: 1 for "Male", 0 otherwise
gender = data.Gender
cat_gen = [1 if label == "Male" else 0 for label in gender]

In [None]:
# Visualizing categorical variables
%matplotlib inline
fig=plt.figure(figsize=(20,5))

plt.subplot(2,5,1)
plt.title('NumOfProducts')
plt.hist(list(data['NumOfProducts']))
plt.subplot(2,5,2)
plt.title('HasCrCard')
plt.hist(list(data['HasCrCard']))
plt.subplot(2,5,3)
plt.title('IsActiveMember')
plt.hist(list(data['IsActiveMember']))
plt.subplot(2,5,4)
plt.title('Tenure')
plt.hist(list(data['Tenure']))
plt.subplot(2,5,5)
plt.title('Gender')
plt.hist(list(cat_gen))
plt.show()

In [None]:
# Ploting to visualize outlier after treatment
plot_graph(df)

In [None]:
# Function for Outlier Percentage 
def outlier_percent(df):
    """Print, per column, the percentage of rows lying outside the whiskers.

    For every column the missing values are dropped, the quartiles and the
    inter-quartile range (IQR) are computed, and the whiskers are set to the
    most extreme values within 1.5 * IQR of the quartiles. Values beyond the
    whiskers are counted as outliers and reported as a percentage of
    ``len(df)``, rounded to two decimals.

    Parameters
    ----------
    df
        Dataframe with numeric columns.

    Returns
    -------
    None
        The result is printed, one line per column.
    """
    total_rows = len(df)
    for column_name in df.columns:
        values = df[column_name].dropna().to_numpy()
        if total_rows == 0 or values.size == 0:
            # Nothing to measure (no rows, or the column is entirely missing).
            print(column_name, '\t', 0.0, '%')
            continue

        upper_quartile = np.percentile(values, 75)
        lower_quartile = np.percentile(values, 25)
        iqr = upper_quartile - lower_quartile

        upper_whisker = values[values <= upper_quartile + 1.5 * iqr].max()
        lower_whisker = values[values >= lower_quartile - 1.5 * iqr].min()

        # Count the positions that satisfy the condition, not the non-zero
        # values found there; otherwise an outlier equal to 0 is missed.
        outlier_count = (np.count_nonzero(values > upper_whisker)
                         + np.count_nonzero(values < lower_whisker))
        outlierpercent = round(outlier_count / total_rows * 100, 2)

        print(column_name, '\t', outlierpercent, '%')

In [None]:
# Final checking of outlier percentage
outlier_percent(df)

### 2. Null Value Treatment 

In [None]:
# Check Null Values
data.isnull().sum()

In [None]:
# Function to find the percent of null or nan values in data
def null_percent(df):
    """Return, per column, the percentage of entries that are null/NaN.

    Parameters
    ----------
    df
        Dataframe to inspect.

    Returns
    -------
    Per-column count of nulls multiplied by 100 and divided by ``len(df)``.
    """
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    return missing_counts * 100 / total_rows

In [None]:
# Checking the percentage of null values
null_percent(data)

### 3. Dealing with Data types

In [None]:
# Cast the integer-like columns to float64 so every numeric column shares
# one dtype, then show the resulting dtypes.
float_columns = ['CreditScore', 'Tenure', 'NumOfProducts',
                 'HasCrCard', 'IsActiveMember', 'Exited']
for column_name in float_columns:
    data[column_name] = data[column_name].astype('float64')
print(data.dtypes)

In [None]:
# Store all numerical variables (including the target 'Exited') in the
# dataframe 'df_numeric'
numericlist=['CreditScore', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Age', 'Balance', 'EstimatedSalary', 'Exited']
df_numeric=data.loc[:, numericlist]

In [None]:
# Store all Object variables into dataframe 'df_dummies'
# if variable is Ordered Categorical use - Label encoding 
# if variable is UnOrdered use - get_dummies()
# Both columns selected here are encoded with get_dummies() in the next cell.
columns=['Geography', 'Gender']
df_dummies=data.loc[:, columns]

In [None]:
# Usage of get_dummies(): one-hot encode the categorical columns, dropping
# the first level of each so no redundant indicator column is produced.
# dtype is pinned to uint8 (the historical default). pandas >= 2.0 returns
# boolean dummies otherwise, and mixing bool with float columns makes
# DataFrame.values an object array.
data_cat_dummies=pd.get_dummies(df_dummies,drop_first=True,dtype='uint8')

In [None]:
# Concatenate the encoded categorical columns and the numerical columns
# side by side (axis=1); rows are matched on the dataframe index.
prepro_data=pd.concat([data_cat_dummies,df_numeric],axis=1)
print(prepro_data.dtypes)
print(prepro_data.columns)

In [None]:
data1 = prepro_data

In [None]:
# Describe Data
prepro_data.describe()

# Splitting into Train and Test

In [None]:
# Target vector ('Exited') and feature matrix (every other column) as arrays
feature_mask = prepro_data.columns != 'Exited'
y = prepro_data.loc[:, 'Exited'].values
X = prepro_data.loc[:, feature_mask].values

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
for split_label, features_part, target_part in (
    ("Training set: ", X_train, y_train),
    ("Test set: ", X_test, y_test),
):
    print(split_label, features_part.shape, target_part.shape)

# Feature Engineering 

In [None]:
#Feature scaling
# The scaler is fitted on the training split only and then applied to the
# test split, so test rows do not influence the fitted statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model Building & Evaluation

## Getting the learning curves

In [None]:
from sklearn.model_selection import learning_curve
def get_learning_curve(model, name, features=None, target=None, cv=10):
    """Plot mean training and cross-validation scores against training size.

    Parameters
    ----------
    model
        Estimator handed to ``learning_curve``.
    name
        Text appended to the plot title.
    features, target
        Optional arrays to evaluate on. When omitted, the notebook-level
        ``X`` and ``y`` are used, which keeps existing calls unchanged. Pass
        scaled arrays here for scale-sensitive models.
    cv
        Cross-validation setting forwarded to ``learning_curve``.
    """
    if features is None:
        features = X
    if target is None:
        target = y
    train_size, train_score, test_score = learning_curve(
        estimator=model, X=features, y=target, cv=cv
    )
    # Average over the cross-validation folds: one value per training size.
    train_score_m = np.mean(train_score, axis=1)
    test_score_m = np.mean(test_score, axis=1)
    plt.plot(train_size, train_score_m, 'o-', color="b")
    plt.plot(train_size, test_score_m, 'o-', color="r")
    plt.legend(('Training score', 'Test score'), loc='best')
    plt.xlabel("Training Samples")
    plt.ylabel("Score")
    title_text = "Learning curve for " + name
    plt.title(title_text)
    plt.grid()
    plt.show()

## Model Results compilation Data Frame

In [None]:
# One row per model. The column order matches the order in which the metric
# values are written into each row: accuracy, precision, recall, F1.
col_names = ['Model','Accuracy','Precision','Recall','F1 Score']
compare = pd.DataFrame(columns = col_names)
compare.head()

## User Defined Scoring function

In [None]:
def get_scores(arr):
    """Derive accuracy, precision, recall and F1 from a 2x2 confusion matrix.

    Parameters
    ----------
    arr
        2x2 matrix in scikit-learn's ``confusion_matrix`` layout for labels
        ordered 0, 1: rows are actual classes, columns are predicted classes,
        i.e. ``[[TN, FP], [FN, TP]]``. Class 1 is the positive class.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. A ratio whose denominator is
        zero is reported as 0.0 instead of raising or producing NaN.
    """
    TN, FP = arr[0][0], arr[0][1]
    FN, TP = arr[1][0], arr[1][1]
    total = TP + FP + TN + FN
    acc = (TP + TN) / total if total else 0.0
    pre = TP / (TP + FP) if (TP + FP) else 0.0
    rec = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = 2 * pre * rec / (pre + rec) if (pre + rec) else 0.0
    return acc, pre, rec, f1

## Print Accuracy

In [None]:
def get_results(cmatrix, scores):
    """Print a confusion matrix with accuracy statistics; return it as an array.

    Parameters
    ----------
    cmatrix
        Confusion matrix object providing ``to_numpy()`` (e.g. a DataFrame).
    scores
        Per-fold accuracy values.

    Returns
    -------
    The result of ``cmatrix.to_numpy()``.
    """
    print(cmatrix)
    mean_accuracy = np.mean(scores)
    print("Mean Accuracy is                     :", mean_accuracy)
    accuracy_spread = np.std(scores)
    print("Standard Deviation of accuracies is  :", accuracy_spread)
    return cmatrix.to_numpy()

## 1. Logistic Regression

In [None]:
#Without KFold Cross validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with default settings on the hold-out split.
model = LogisticRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
#Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# C is searched on a log scale from 10**0.1 to 10**1.0 (5 values).
grid={"C":np.logspace(0.1,1.0,5), "penalty":["l1","l2"]}# l1 lasso l2 ridge
# liblinear supports both the l1 and l2 penalties; the default solver does
# not accept l1, so those grid candidates could not be fitted.
model=LogisticRegression(solver="liblinear")
model_cv=GridSearchCV(model,grid,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

In [None]:
# After Optimization
model = LogisticRegression(C=1.0, penalty= "l2")
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"Logistic Regression")

In [None]:
# Stratified K Fold CV

from sklearn.model_selection import StratifiedKFold
model = LogisticRegression(C=1.0, penalty= "l2")
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["Logistic"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head()

## 2. Decision Tree

In [None]:
#Without KFold Cross validation
from sklearn.tree import DecisionTreeClassifier
# Baseline: decision tree with default settings on the hold-out split.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
#Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# Candidate depths 1..20 and leaf-node caps: 20 x 5 = 100 combinations, each
# scored with 10-fold cross-validation on the training split.
depths = np.arange(1, 21)
num_leafs = [5, 10, 20, 50, 100]
parameters={'max_depth': depths, 'max_leaf_nodes': num_leafs}
model = DecisionTreeClassifier()
model_cv=GridSearchCV(model, param_grid=parameters,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

### How gridsearch works
1. Try every combination of your parameter grid
2. For each of them it will do a K-fold cross validation - the cells in this notebook pass cv=10 (scikit-learn's default is 5-fold since version 0.22, 3-fold before that)
3. Select the best available.

In [None]:
# After Optimization
model = DecisionTreeClassifier(max_depth=8, max_leaf_nodes=50)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"Decision Tree")

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
model = DecisionTreeClassifier(max_depth=8, max_leaf_nodes=50)
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["Decision Tree"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head()

## 3. Naive Bayes

In [None]:
# Before Hyper-parameter optimization
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Gaussian naive Bayes with default settings on the hold-out split.
model = GaussianNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"Naive Bayes")

In [None]:
#GaussianNB has essentially one tunable hyper-parameter (var_smoothing); it is left at its default here, so no grid search is run.

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
model = GaussianNB()
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["Naive Bayes"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head()

## 4. K-Nearest Neighbour classifier (kNN)

In [None]:
#Without KFold Cross validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Baseline: k-nearest neighbours with default settings on the hold-out split.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
error_rate = []

# Try every k from 1 to 49 and record the share of misclassified test rows.
# NOTE(review): the error is measured on X_test/y_test, so a k picked from
# this curve has been chosen using the evaluation data.
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

In [None]:
import matplotlib.pyplot as plt
# A single figure for the error-rate curve (one point per k from 1 to 49).
# A bare plt.figure() before this call would only leave an extra empty figure.
plt.figure(figsize=(15,20))
plt.plot(range(1,50), error_rate, color='blue', linestyle='dashed',marker='o', markerfacecolor='red',markersize=10)
plt.title('Error_rate vs k-value')
plt.xlabel('k')
plt.ylabel('Error rate')

In [None]:
#Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# 30 neighbour counts x 3 leaf sizes x 2 algorithms x 2 weightings = 360
# combinations, each scored with 10-fold cross-validation.
k_range = list(range(1,31))
parameters = {'n_neighbors':k_range,
              'leaf_size':[1,3,5],
              'algorithm':['auto', 'kd_tree'],
              'weights': ['uniform', 'distance']}

model= KNeighborsClassifier()
model_cv=GridSearchCV(model, param_grid=parameters,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

In [None]:
# After Optimization
model = KNeighborsClassifier(algorithm="auto", leaf_size=1, weights='uniform', n_neighbors= 30)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"KNN")

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
model = KNeighborsClassifier(algorithm="auto", leaf_size=1, weights='uniform', n_neighbors= 30)
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["KNN"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head()

## 5. XGBoost

In [None]:
# Without KFold Cross validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Baseline: XGBoost classifier with default settings on the hold-out split.
model = XGBClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
# Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# Candidate values searched for each XGBoost hyper-parameter.
params = {
        'min_child_weight': [1, 2, 3, 5, 10],
        'gamma': [0.5, 1, 2.2, 2.3, 5],
        'max_depth': [3, 4, 5],
        'reg_lambda': [0.1,0.3,0.5]
        }

model= XGBClassifier()
# Search the XGBoost grid defined in this cell. ``parameters`` still holds
# the grid of an earlier model and does not apply to XGBClassifier.
model_cv=GridSearchCV(model, param_grid=params,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

In [None]:
# After Optimization
# ``algorithm`` is not an XGBClassifier parameter (it belongs to the KNN
# estimator), so only the tuned XGBoost settings are passed.
model = XGBClassifier(min_child_weight= 1, gamma=2.3 , max_depth=5 , reg_lambda=0.5)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"XG Boost")

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
# ``algorithm`` is not an XGBClassifier parameter, so it is not passed.
model = XGBClassifier(min_child_weight= 1, gamma=2.3 , max_depth=5 , reg_lambda=0.5)
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["XG Boost"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head()

## 6. Random Forest

In [None]:
#Without KFold Cross validation
from sklearn.ensemble import RandomForestClassifier
# Baseline: random forest with default settings on the hold-out split.
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
#Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# 'auto' is omitted from max_features: for classifiers it was an alias of
# 'sqrt', and it has been removed from recent scikit-learn releases.
parameters = { 
    'n_estimators': [10, 20, 40, 60, 80, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'bootstrap': [True],
    'criterion' :['gini', 'entropy'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12]
}
model = RandomForestClassifier()
model_cv=GridSearchCV(model, param_grid=parameters,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

In [None]:
# After Optimization
model = RandomForestClassifier(criterion="entropy", max_depth = 8, max_features="sqrt", n_estimators=40, bootstrap= True, min_samples_leaf= 4, min_samples_split= 8)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"Random Forest")

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
model = RandomForestClassifier(criterion="entropy", max_depth = 8, max_features="sqrt", n_estimators=40, bootstrap= True, min_samples_leaf= 4, min_samples_split= 8)
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["Random Forest"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare.head(6)

## 7. Support Vector Machine (SVM)

In [None]:
# Before Hyper-parameter optimization
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
# Baseline: support vector classifier with default settings.
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))

In [None]:
#Without KFold Cross validation with hyper-parameter optimization
from sklearn.model_selection import GridSearchCV
# 2 kernels x 4 C values x 4 gamma values x 2 decision-function shapes x
# 2 shrinking settings = 128 combinations, each scored with 10-fold
# cross-validation.
parameters= {'kernel':['linear','rbf'], 'C':[1,0.25,0.5,0.75],'gamma':[1,2,3,'auto'], 
            'decision_function_shape':('ovo','ovr'),
            'shrinking':(True,False)}
model = SVC()
model_cv=GridSearchCV(model, param_grid= parameters,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

In [None]:
# The grid search above takes a long time to run.

In [None]:
# After Hyper-parameter optimization
model = SVC(C=1.0 , gamma= 'auto', kernel='rbf', decision_function_shape='ovr', shrinking=True)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Reuse the predictions instead of running predict() a second time.
print(confusion_matrix(y_test, pred))
print(" Model Accuracy",model.score(X_test, y_test))
get_learning_curve(model,"SVM")

In [None]:
# Stratified K Fold CV
## Go through uniform and stratified k fold.
from sklearn.model_selection import StratifiedKFold
model = SVC(C=1.0 , gamma= 'auto', kernel='rbf', decision_function_shape='ovr', shrinking=True)
# Running total of the per-fold confusion matrices, and per-fold accuracies.
array = np.zeros((2, 2), dtype=int)
scores = []
# scikit-learn only accepts random_state together with shuffle=True.
cv = StratifiedKFold(n_splits = 10, random_state=42, shuffle = True)
for train_index, test_index in cv.split(X, y):
    # Fold-local names keep the hold-out split (X_train/X_test/y_train/y_test)
    # intact for later cells. The scaler is refitted on each fold's training
    # part because X itself is unscaled.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X[train_index])
    X_fold_test = fold_scaler.transform(X[test_index])
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    pred = model.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, pred))
    array = array + confusion_matrix(y_fold_test, pred)
# confusion_matrix orders rows (actual) and columns (predicted) as 0, 1.
cm = pd.DataFrame(array, index = ['0', '1'], columns = ['0', '1'])

In [None]:
# Print Results and Put the results into the dataframe
cm = get_results(cm, scores)
acc, pre, rec, f1 = get_scores(cm)
# Append one row: the model label followed by the four metrics, each rounded
# to two decimals.
summary_row = ["SVM"] + [round(value, 2) for value in (acc, pre, rec, f1)]
compare.loc[len(compare)] = summary_row
compare

# Deployment of the model

## Step 1
# 1) app.py - for generating REST APIs - go through
# 2) FLASK / DJango 
# 3) Index HTML - Form 
# 4) request file
# 5) model.py --> model.pkl
# 6) requirements file -> update with the versions of the Python packages

#Step 2: Code compilation - python app.py ==> URL Local host

#Step 3: Upload the files on github

#Step 4: Connect github to Heroku and Deploy to get global URL