# Assignment


## Nithin Reddy Muduganti

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score


np.random.seed(1)

### 2.0 Loading data 

In [2]:
df = pd.read_csv("UniversalBank.csv")

### Overview of data

In [3]:

df.head(3)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0


### Summary of the data

In [4]:
# Check the missing values by summing the total na's for each variable
df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

## 3.0 Process the data

We dont have any categorical variables and we dont have any missing values so we need not do any covertion into numeric or we need not impute values 

### Dropping unessasary columns

Dropping unuseful data can help us to process the model quickly

In [5]:
 
df = df.drop(columns=['ID', 'ZIP Code'])

### Splitting the data into train and test sets

Lets split the data into training data and the test data with the ratio of 70-30

In [6]:
# split the data into validation and training set
train_df, test_df = train_test_split(df, test_size=0.3, random_state=1)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'CD Account'
predictors = list(df.columns)
predictors.remove(target)

In [7]:
# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[predictors])

# Transform the predictors of training and test sets
X_train = scaler.transform(train_df[predictors]) 
y_train = train_df[target] 

X_test = scaler.transform(test_df[predictors])
y_test = test_df[target] 

##  Modeling


###  Logistic Regression using RandomSearch and Grid Search

In [8]:
score_measure = "recall"
kfolds = 3

param_grid = {'C':[0.001,0.01,0.1,1,5],
               'penalty':['l1', 'l2','elasticnet','none'],
              'solver':['saga','liblinear'],
              'max_iter': np.arange(500,1000)
                  
}

logistic_regression = LogisticRegression()
rand_search = RandomizedSearchCV(estimator =logistic_regression, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1  
                                )

_ = rand_search.fit(X_train, y_train)



bestlogestic = rand_search.best_estimator_

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


546 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
168 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kanna\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kanna\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Kanna\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

-----------------------

In [9]:
score_measure = "recall"
kfolds = 3
best_penality = rand_search.best_params_['penalty']
best_solver = rand_search.best_params_['solver']
min_regulization_strength=rand_search.best_params_['C']
min_iter = rand_search.best_params_['max_iter']

param_grid = {
    
    'C':np.arange(min_regulization_strength-0.05,min_regulization_strength+0.05), 
               'penalty':[best_penality],
              'solver':[best_solver],
              'max_iter': np.arange(min_iter-200,min_iter+200)
}

logistic_gridsearch =  LogisticRegression()
grid_search = GridSearchCV(estimator = logistic_gridsearch, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1 # n_jobs=-1 will utilize all available CPUs 
                )

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestlgr = grid_search.best_estimator_

Fitting 3 folds for each of 400 candidates, totalling 1200 fits
The best recall score is 0.6666666666666666
... with parameters: {'C': 0.05, 'max_iter': 732, 'penalty': 'l1', 'solver': 'saga'}


In [10]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")
Recall_lr={TP/(TP+FN)}

Accuracy=0.9780000 Precision=1.0000000 Recall=0.6024096 F1=0.7518797


In [11]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

In [12]:
performance = pd.concat([performance, pd.DataFrame({'model':"logistic using random & grid search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

###  Modeling the data using individual logestic regression models

####  Fit and test a Logistic Regression model

In [13]:
log_reg_model = LogisticRegression(penalty='none', max_iter=900)
_ = log_reg_model.fit(X_train, np.ravel(y_train))

In [14]:
model_preds = log_reg_model.predict(X_test)
c_matrix_1 = confusion_matrix(y_test, model_preds)
TP = c_matrix_1[1][1]
TN = c_matrix_1[0][0]
FP = c_matrix_1[0][1]
FN = c_matrix_1[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"default logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188


#### Change to liblinear solver

In [15]:
log_reg_liblin_model = LogisticRegression(solver='liblinear').fit(X_train, np.ravel(y_train))

In [16]:
model_preds = log_reg_liblin_model.predict(X_test)
c_matrix_2 = confusion_matrix(y_test, model_preds)
TP = c_matrix_2[1][1]
TN = c_matrix_2[0][0]
FP = c_matrix_2[0][1]
FN = c_matrix_2[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"liblinear logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188
0,liblinear logistic,0.978,1.0,0.60241,0.75188


####  L2 Regularization

In [17]:
log_reg_L2_model = LogisticRegression(penalty='l2', max_iter=1000)
_ = log_reg_L2_model.fit(X_train, np.ravel(y_train))

In [18]:
model_preds_3 = log_reg_L2_model.predict(X_test)
c_matrix_3 = confusion_matrix(y_test, model_preds)
TP = c_matrix_3[1][1]
TN = c_matrix_3[0][0]
FP = c_matrix_3[0][1]
FN = c_matrix_3[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"L2 logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance


Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188
0,liblinear logistic,0.978,1.0,0.60241,0.75188
0,L2 logistic,0.978,1.0,0.60241,0.75188


####  L1 Regularization

In [19]:
log_reg_L1_model = LogisticRegression(solver='liblinear', penalty='l1')
_ = log_reg_L1_model.fit(X_train, np.ravel(y_train))

In [20]:
model_preds = log_reg_L1_model.predict(X_test)
c_matrix_4 = confusion_matrix(y_test, model_preds)
TP = c_matrix_4[1][1]
TN = c_matrix_4[0][0]
FP = c_matrix_4[0][1]
FN = c_matrix_4[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"L1 logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188
0,liblinear logistic,0.978,1.0,0.60241,0.75188
0,L2 logistic,0.978,1.0,0.60241,0.75188
0,L1 logistic,0.978,1.0,0.60241,0.75188


####  Elastic Net Regularization

In [21]:
log_reg_elastic_model = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, max_iter=1000)
_ = log_reg_elastic_model.fit(X_train, np.ravel(y_train))

In [22]:
model_preds = log_reg_elastic_model.predict(X_test)
c_matrix_5 = confusion_matrix(y_test, model_preds)
TP = c_matrix_5[1][1]
TN = c_matrix_5[0][0]
FP = c_matrix_5[0][1]
FN = c_matrix_5[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Elestic logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188
0,liblinear logistic,0.978,1.0,0.60241,0.75188
0,L2 logistic,0.978,1.0,0.60241,0.75188
0,L1 logistic,0.978,1.0,0.60241,0.75188
0,Elestic logistic,0.978,1.0,0.60241,0.75188


####  Summary for logistic model

In [23]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,logistic using random & grid search,0.978,1.0,0.60241,0.75188
0,default logistic,0.978,1.0,0.60241,0.75188
0,liblinear logistic,0.978,1.0,0.60241,0.75188
0,L2 logistic,0.978,1.0,0.60241,0.75188
0,L1 logistic,0.978,1.0,0.60241,0.75188
0,Elestic logistic,0.978,1.0,0.60241,0.75188


###  Model the data using the SVM models

###  SVM using RandomSearch and Grid Search

In [24]:
score_measure = "recall"
kfolds = 3

param_grid = {'C':np.arange(0.1,50,10),  
               'kernel':['linear', 'rbf','poly'],
              'gamma':['scale','auto'],
              'degree':np.arange(1,10), 
              'coef0':np.arange(1,10) 
                  
}

svc = SVC()
rand_search = RandomizedSearchCV(estimator =svc, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1  
                                )

_ = rand_search.fit(X_train, y_train)

)

bestsvc = rand_search.best_estimator_

SyntaxError: unmatched ')' (2821398556.py, line 19)

In [None]:
score_measure = "recall"
kfolds = 3
best_kernel = rand_search.best_params_['kernel']
best_gamma = rand_search.best_params_['gamma']
min_regulization=rand_search.best_params_['C']
best_degree = rand_search.best_params_['degree']
best_coef0=rand_search.best_params_['coef0']

param_grid = {
    
    'C':np.arange(min_regulization-3,min_regulization+3), 
               'kernel':[best_kernel],
              'gamma':[best_gamma],
              'degree': np.arange(best_degree-1,best_degree+1),
            'coef0': np.arange(best_coef0-3,best_coef0+3)
}

svm_grid =  SVC()
grid_search = GridSearchCV(estimator = svm_grid, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1 # n_jobs=-1 will utilize all available CPUs 
                )

_ = grid_search.fit(X_train, y_train)



best_svm = grid_search.best_estimator_

In [None]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")
Recall_svm={TP/(TP+FN)}

In [None]:
performance_svm = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

In [None]:
performance_svm = pd.concat([performance_svm, pd.DataFrame({'model':"svm using Random & Grid search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])


###  Modeling the Data using indivdual SVM models

###  Fit a SVM classification model using linear kernal

In [None]:
svm_lin_model = SVC(kernel="linear")
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [None]:
model_preds = svm_lin_model.predict(X_test)
c_matrix_6 = confusion_matrix(y_test, model_preds)
TP = c_matrix_6[1][1]
TN = c_matrix_6[0][0]
FP = c_matrix_6[0][1]
FN = c_matrix_6[1][0]
performance_svm = pd.concat([performance_svm, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance_svm

###  Fit a SVM classification model using rbf kernal

In [None]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale')
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [None]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix_7 = confusion_matrix(y_test, model_preds)
TP = c_matrix_7[1][1]
TN = c_matrix_7[0][0]
FP = c_matrix_7[0][1]
FN = c_matrix_7[1][0]
performance_svm = pd.concat([performance_svm, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance_svm

###  Fit a SVM classification model using polynomial kernal¶

In [None]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [None]:
model_preds = svm_poly_model.predict(X_test)
c_matrix_8 = confusion_matrix(y_test, model_preds)
TP = c_matrix_8[1][1]
TN = c_matrix_8[0][0]
FP = c_matrix_8[0][1]
FN = c_matrix_8[1][0]
performance_svm = pd.concat([performance_svm, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance_svm

### Summary of the SVM models

In [None]:
performance_svm.sort_values(by=['Recall'])

###  Decision Trees using RandomSearchCV combined with GridSearchCV

In [None]:
score_measure = "recall"
kfolds = 3
param_grid = {
    'min_samples_split': np.arange(1,50),  
    'min_samples_leaf': np.arange(1,50),
    'min_impurity_decrease': np.arange(0.0001, 0.0005),
    'max_leaf_nodes': np.arange(5, 50), 
    'max_depth': np.arange(1,25), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)



bestRecallTree = rand_search.best_estimator_

In [None]:
score_measure = "recall"
kfolds = 5
min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']
param_grid = {
    'min_samples_split': np.arange(min_samples_split-2,min_samples_split+2),  
    'min_samples_leaf': np.arange(min_samples_leaf-2,min_samples_leaf+2),
    'min_impurity_decrease': np.arange(min_impurity_decrease-0.0001, min_impurity_decrease+0.0001, 0.00005),
    'max_leaf_nodes': np.arange(max_leaf_nodes-2,max_leaf_nodes+2), 
    'max_depth': np.arange(max_depth-2,max_depth+2), 
    'criterion': [criterion]
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)



bestRecallTree = grid_search.best_estimator_

In [None]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

## Conclusion

From the above performed models here by we can conclude that decision tree has the highest recall value when compared to other models so the decision tree model is thr best fit for thr data provided .