In [28]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [29]:
# Loading the dataset from the CSV file named 'Social_Network_Ads.csv'
dataset=pd.read_csv("Social_Network_Ads.csv")

In [30]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [31]:
# Perform one-hot encoding on categorical variables and drop the first category to avoid the dummy variable trap (multicollinearity)
dataset=pd.get_dummies(dataset,drop_first=True)

In [32]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,True
1,15810944,35,20000,0,True
2,15668575,26,43000,0,False
3,15603246,27,57000,0,False
4,15804002,19,76000,0,True
...,...,...,...,...,...
395,15691863,46,41000,1,False
396,15706071,51,23000,1,True
397,15654296,50,20000,1,False
398,15755018,36,33000,0,True


In [33]:
# Drop the 'User ID' column as it is just an identifier and not useful for prediction
dataset = dataset.drop("User ID", axis=1)

In [34]:
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,True
1,35,20000,0,True
2,26,43000,0,False
3,27,57000,0,False
4,19,76000,0,True
...,...,...,...,...
395,46,41000,1,False
396,51,23000,1,True
397,50,20000,1,False
398,36,33000,0,True


In [35]:
# Display the count of each class in the target variable 'Purchased' (0 = Not Purchased, 1 = Purchased)
dataset["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [36]:
# Define independent variables/features for the model
indep = dataset[["Age", "EstimatedSalary", "Gender_Male"]]

# Define dependent variable/target for prediction
dep = dataset["Purchased"]

In [37]:
# Check the shape (rows, columns) of the independent variables dataframe
indep.shape

(400, 3)

In [38]:
# Display the target variable 'Purchased' values
dep

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [39]:
# Split data into training and test sets
# test_size=1/3 means 33% data for testing, 67% for training
# random_state=0 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size=1/3, random_state=0)

In [40]:
# Instantiate a StandardScaler to standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler to the training data and transform it (apply scaling)
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data (use the mean and variance learned from training data)
X_test = sc.transform(X_test)

In [41]:
param_grid = {
    'solver':     ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'penalty':    ['l1', 'l2', 'elasticnet', 'none'],   # Not all solvers support all penalties!
    'C':          [0.01, 0.1, 1, 10, 100],              # Inverse of regularization strength
    'max_iter':   [100, 200, 500],                      # Maximum iterations for the solver
    'fit_intercept': [True, False],                     # Whether to fit the intercept (bias)
    'l1_ratio':   [0, 0.5, 1]                           # Used only if penalty='elasticnet'
}

# Set up grid search cross-validation for Logistic Regression:
# - Uses the specified param_grid to search for the best combination of hyperparameters
# - refit=True: refit the model on the entire training data using the best found parameters
# - verbose=3: prints detailed progress messages while fitting
# - n_jobs=-1: uses all available CPU cores to speed up the search
# - scoring='f1_weighted': evaluates models based on the weighted F1-score
grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
    scoring='f1_weighted'
)

# Fit the grid search to the training data (this will try all combinations in param_grid)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


4050 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [42]:
# Print the best hyperparameter combination found during grid search
print(grid.best_params_)

# Get the full cross-validation results as a dictionary (contains scores, params, etc.)
re = grid.cv_results_

# Optionally, you can print all cross-validation results for analysis
#print(re)

# Use the best estimator found by grid search to make predictions on the test data
grid_predictions = grid.predict(X_test)

{'C': 0.1, 'fit_intercept': True, 'l1_ratio': 0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [43]:
# Compute the confusion matrix to evaluate the performance of the best logistic regression model on the test set
cm = confusion_matrix(y_test, grid_predictions)

In [44]:
# Generate a detailed classification report (precision, recall, f1-score, support) 
# to evaluate the performance of the best logistic regression model on the test set
clf_report = classification_report(y_test, grid_predictions)

In [45]:
# Calculate the weighted F1-score for the test set predictions made by the best model
f1_macro = f1_score(y_test, grid_predictions, average='weighted')

# Print the best hyperparameters and the corresponding F1-score
print("The weighted F1-score for the best parameters {}:".format(grid.best_params_), f1_macro)

The weighted F1-score for the best parameters {'C': 0.1, 'fit_intercept': True, 'l1_ratio': 0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}: 0.8872628037179481


In [46]:
# Print the confusion matrix for the model's predictions on the test set
print("The confusion Matrix:\n", cm)

The confusion Matrix:
 [[79  6]
 [ 9 40]]


In [47]:
# Print the detailed classification report showing precision, recall, f1-score, and support for each class
print("The report:\n", clf_report)

The report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91        85
           1       0.87      0.82      0.84        49

    accuracy                           0.89       134
   macro avg       0.88      0.87      0.88       134
weighted avg       0.89      0.89      0.89       134



In [48]:
# Calculate the ROC AUC score using the predicted probabilities for the positive class (class 1)
# This measures the model's ability to distinguish between classes (higher is better)
roc_auc = roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])

In [49]:
# Convert the cross-validation results dictionary (from grid search) into a pandas DataFrame for easier analysis and visualization
table = pd.DataFrame.from_dict(re)
# Display the DataFrame containing all cross-validation results from the grid search
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_fit_intercept,param_l1_ratio,param_max_iter,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001909,0.000501,0.000000,0.000000,0.01,True,0.0,100,l1,newton-cg,"{'C': 0.01, 'fit_intercept': True, 'l1_ratio':...",,,,,,,,631
1,0.001404,0.000495,0.000000,0.000000,0.01,True,0.0,100,l1,lbfgs,"{'C': 0.01, 'fit_intercept': True, 'l1_ratio':...",,,,,,,,631
2,0.003998,0.000891,0.004350,0.000888,0.01,True,0.0,100,l1,liblinear,"{'C': 0.01, 'fit_intercept': True, 'l1_ratio':...",0.509779,0.525300,0.501410,0.501410,0.501410,0.507862,0.009302,586
3,0.004403,0.000488,0.004577,0.001031,0.01,True,0.0,100,l1,saga,"{'C': 0.01, 'fit_intercept': True, 'l1_ratio':...",0.509779,0.525300,0.501410,0.501410,0.501410,0.507862,0.009302,586
4,0.007921,0.001604,0.004917,0.000978,0.01,True,0.0,100,l2,newton-cg,"{'C': 0.01, 'fit_intercept': True, 'l1_ratio':...",0.686728,0.701560,0.521359,0.796094,0.650266,0.671202,0.089053,556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,0.004372,0.003446,0.003304,0.000751,100.00,False,1.0,500,elasticnet,saga,"{'C': 100, 'fit_intercept': False, 'l1_ratio':...",0.836613,0.779327,0.719604,0.795722,0.852044,0.796662,0.046696,307
1436,0.001011,0.000306,0.000000,0.000000,100.00,False,1.0,500,none,newton-cg,"{'C': 100, 'fit_intercept': False, 'l1_ratio':...",,,,,,,,631
1437,0.000703,0.000604,0.000000,0.000000,100.00,False,1.0,500,none,lbfgs,"{'C': 100, 'fit_intercept': False, 'l1_ratio':...",,,,,,,,631
1438,0.001403,0.000797,0.000000,0.000000,100.00,False,1.0,500,none,liblinear,"{'C': 100, 'fit_intercept': False, 'l1_ratio':...",,,,,,,,631


In [None]:
# Get user input for prediction features

# Prompt the user to enter their age (as a float)
age_input = float(input("Age: "))

# Prompt the user to enter their Salary (as a float)
salary_input = float(input("Salary: "))

# Prompt the user to enter their sex (1 for male, 0 for female)
sex_male_input = int(input("Sex Male 0 or 1: "))

Age:  40
Salary:  4500000


In [None]:
# Use the trained and tuned grid search model to predict the outcome for new user input
# The input should be in the same order and format as the training features
Future_Prediction = grid.predict([[age_input, salary_input, smoker_yes_input]]) 

# Print the prediction result
print("Future_Prediction={}".format(Future_Prediction))