In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [97]:
from sklearn.datasets import make_classification

In [98]:
##create data set
X,y=make_classification(n_samples=10000,n_features=9,n_classes=2,random_state=10)

In [99]:
X

array([[ 0.01049154,  0.0712608 ,  1.39618732, ..., -0.81138137,
         0.25575807,  0.15527746],
       [ 1.00825243, -1.7280295 ,  0.21856251, ...,  0.95096608,
        -1.56032093, -0.13689442],
       [ 0.87141638,  1.61370724,  0.87944987, ..., -0.24171359,
        -1.17834992, -0.02025366],
       ...,
       [ 0.00257745, -0.07646561, -0.73138247, ...,  0.34330138,
         1.05925984,  0.61221117],
       [ 1.18014314, -0.90496077, -1.29959902, ...,  1.66683246,
        -0.00451555,  0.8893553 ],
       [-0.27951659,  0.7158506 ,  0.38820998, ..., -1.00274428,
         1.29567962,  0.53521009]])

In [100]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.010492,0.071261,1.396187,0.015948,-1.024967,-1.919238,-0.811381,0.255758,0.155277
1,1.008252,-1.728030,0.218563,0.493309,-2.463297,-1.495218,0.950966,-1.560321,-0.136894
2,0.871416,1.613707,0.879450,0.433127,0.497656,-0.932479,-0.241714,-1.178350,-0.020254
3,-0.750835,-0.101601,0.848645,-0.498455,-0.318193,0.819132,-1.452964,-2.135008,-1.797509
4,0.513058,-0.415994,1.174886,0.333950,1.018621,-0.090795,1.632411,1.291566,1.131871
...,...,...,...,...,...,...,...,...,...
9995,2.125151,-0.912806,-0.259173,1.200072,0.189292,-0.350654,0.187887,0.742643,2.034051
9996,1.581945,-1.321442,0.361602,0.760923,1.163935,1.187242,0.365527,-2.777034,-0.404271
9997,0.002577,-0.076466,-0.731382,0.043538,-0.448980,0.975959,0.343301,1.059260,0.612211
9998,1.180143,-0.904961,-1.299599,0.649849,0.253537,-1.712137,1.666832,-0.004516,0.889355


In [101]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [102]:
##Model training
from sklearn.linear_model import LogisticRegression
logistic=LogisticRegression()


In [103]:
logistic.fit(X_train,y_train)

In [104]:
y_pred=logistic.predict(X_test)

In [105]:
print(y_pred)

[0 0 1 ... 0 0 1]


In [106]:
logistic.predict_proba(X_test)

array([[9.98581896e-01, 1.41810445e-03],
       [9.99961048e-01, 3.89519218e-05],
       [4.24998526e-03, 9.95750015e-01],
       ...,
       [7.77001125e-01, 2.22998875e-01],
       [9.66687005e-01, 3.33129952e-02],
       [3.13126531e-01, 6.86873469e-01]])

In [107]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [108]:
score=accuracy_score(y_test,y_pred)
print(score)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred))

0.899
[[1276  171]
 [ 132 1421]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1447
           1       0.89      0.92      0.90      1553

    accuracy                           0.90      3000
   macro avg       0.90      0.90      0.90      3000
weighted avg       0.90      0.90      0.90      3000



##Hyper Paramter and cross validation using GridSearch CV

In [109]:
##hyper parameter tuning and cross validation
model=LogisticRegression()
penalty=['l1','l2','elasticnet']
c_values=[100,10,1,0.1,0.01]
solver=['newton-cg','lbfgs','liblinear','sag','saga']

In [110]:
params=dict(penalty=penalty,C=c_values,solver=solver)

In [111]:
from sklearn.model_selection import StratifiedKFold
cv=StratifiedKFold()

In [112]:
##Grid Search CV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
grid=GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=5)

In [113]:
grid

In [114]:
grid.fit(X_train,y_train)

200 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 56, in _check_solver
  

In [115]:
grid.best_params_

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

In [116]:
grid.best_score_


0.8974285714285714

In [117]:
y_pred=grid.predict(X_test)

In [118]:
score=accuracy_score(y_test,y_pred)
print(score)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred))

0.899
[[1277  170]
 [ 133 1420]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1447
           1       0.89      0.91      0.90      1553

    accuracy                           0.90      3000
   macro avg       0.90      0.90      0.90      3000
weighted avg       0.90      0.90      0.90      3000



# **USing Randomized Search CV**

In [119]:
from sklearn.model_selection import RandomizedSearchCV

In [120]:
model=LogisticRegression()

In [121]:
randomcv=RandomizedSearchCV(estimator=model,param_distributions=params,scoring='accuracy',cv=5)

In [122]:
randomcv.fit(X_train,y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 56, in _check_solver
    r

In [123]:
randomcv.best_score_

0.8972857142857142

In [124]:
randomcv.best_params_

{'solver': 'lbfgs', 'penalty': 'l2', 'C': 10}

In [125]:
y_pred=randomcv.predict(X_test)

In [126]:
score=accuracy_score(y_test,y_pred)
print(score)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred))

0.899
[[1276  171]
 [ 132 1421]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1447
           1       0.89      0.92      0.90      1553

    accuracy                           0.90      3000
   macro avg       0.90      0.90      0.90      3000
weighted avg       0.90      0.90      0.90      3000

