In [8]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [2]:
#create data set
from sklearn.datasets import make_classification

In [4]:
X, y = make_classification(n_samples=10000, n_features=10, n_classes=2, random_state=42, n_redundant=0, n_informative=10)

In [9]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.768806,1.552087,-1.108478,-0.222583,2.111739,-1.256546,-0.175853,0.939546,1.532594,0.456232
1,-2.814482,2.434166,-0.088742,-5.088769,-5.573471,3.794988,2.928596,1.034544,-1.557692,2.382256
2,1.753409,-1.329287,-2.081537,3.378618,-2.969198,0.763080,-3.109888,0.270133,-0.045036,2.582544
3,2.399019,0.096587,-2.601556,0.739843,-1.422688,0.487285,1.422313,0.895851,-0.531458,3.282606
4,-1.024728,-0.979284,0.220209,1.071751,-1.563860,-0.762324,0.106594,-3.242243,-1.300516,-0.711074
...,...,...,...,...,...,...,...,...,...,...
9995,2.740760,-1.559914,-2.080195,-1.591299,0.772201,-1.068437,-1.941011,-0.129600,2.062896,0.717479
9996,-0.819207,-1.572738,0.571984,2.584101,1.145266,-0.925422,2.133315,-2.995458,1.695210,2.677090
9997,-1.562184,0.561775,0.830508,-1.472768,-1.585923,2.322349,-0.624653,-1.664160,-2.483101,-1.967133
9998,1.781841,-0.868678,1.206692,-0.432161,2.912888,1.073146,1.840214,-2.032802,-0.786209,1.159424


In [10]:
y

array([0, 1, 1, ..., 1, 1, 0], shape=(10000,))

In [12]:
#train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)


X_train shape:  (7000, 10)
X_test shape:  (3000, 10)
y_train shape:  (7000,)
y_test shape:  (3000,)


In [13]:
#model training
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()


In [14]:
logistic.fit(X_train, y_train)

In [17]:
y_pred = logistic.predict(X_test)
print(y_pred)

[0 1 1 ... 0 0 0]


In [20]:
#performance matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [26]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n\n", classification_report(y_test, y_pred))

Accuracy:  0.824
Confusion matrix:
 
 [[1266  250]
 [ 278 1206]]
Classification report: 

               precision    recall  f1-score   support

           0       0.82      0.84      0.83      1516
           1       0.83      0.81      0.82      1484

    accuracy                           0.82      3000
   macro avg       0.82      0.82      0.82      3000
weighted avg       0.82      0.82      0.82      3000



## Hyper paramter tuning
Hyperparameter tuning is essential for optimizing the performance of Logistic Regression models. By tuning hyperparameters like regularization type and strength, you can prevent overfitting, improve generalization, and optimize for specific metrics relevant to your problem.

class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='deprecated', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)[source]


In [50]:
model = LogisticRegression()
penalty = ['l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1,0.01]
solver = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

In [51]:
params = dict(penalty=penalty, C= c_values, solver= solver)  #to convert it into key value pair
params

{'penalty': ['l2', 'elasticnet'],
 'C': [100, 10, 1.0, 0.1, 0.01],
 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']}

### 1. Gridsearch CV


In [52]:

from sklearn.model_selection import GridSearchCV, StratifiedKFold
cv = StratifiedKFold()

grid = GridSearchCV(estimator=model, param_grid=params, cv=cv, n_jobs=-1, scoring='accuracy', error_score=0)
grid

It willl now give the best paramter for model

In [53]:
grid.fit(X_train, y_train)

125 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py

In [54]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

In [55]:
grid.best_score_

np.float64(0.8147142857142857)

## 2. Randomised Search CV

In [58]:
from sklearn.model_selection import RandomizedSearchCV
model = LogisticRegression()
ranCV = RandomizedSearchCV(estimator=model, param_distributions=params, cv=5, n_jobs=-1, scoring='accuracy')
ranCV

In [59]:
ranCV.fit(X_train, y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\CODING\Data Science\DataScience ML Scratch2025\DS from Scratch\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py

In [60]:
ranCV.best_score_

np.float64(0.8142857142857143)

In [61]:
ranCV.best_params_

{'solver': 'sag', 'penalty': 'l2', 'C': 10}

In [64]:
y_pred = ranCV.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n\n", classification_report(y_test, y_pred))

Accuracy:  0.8243333333333334
Confusion matrix:
 
 [[1267  249]
 [ 278 1206]]
Classification report: 

               precision    recall  f1-score   support

           0       0.82      0.84      0.83      1516
           1       0.83      0.81      0.82      1484

    accuracy                           0.82      3000
   macro avg       0.82      0.82      0.82      3000
weighted avg       0.82      0.82      0.82      3000

