In [22]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [23]:
# Loading the dataset from the CSV file named 'Social_Network_Ads.csv'
dataset=pd.read_csv("Social_Network_Ads.csv")

In [24]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [25]:
# Perform one-hot encoding on categorical variables and drop the first category to avoid the dummy variable trap (multicollinearity)
dataset=pd.get_dummies(dataset,drop_first=True)

In [26]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,True
1,15810944,35,20000,0,True
2,15668575,26,43000,0,False
3,15603246,27,57000,0,False
4,15804002,19,76000,0,True
...,...,...,...,...,...
395,15691863,46,41000,1,False
396,15706071,51,23000,1,True
397,15654296,50,20000,1,False
398,15755018,36,33000,0,True


In [27]:
# Drop the 'User ID' column as it is just an identifier and not useful for prediction
dataset = dataset.drop("User ID", axis=1)

In [28]:
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,True
1,35,20000,0,True
2,26,43000,0,False
3,27,57000,0,False
4,19,76000,0,True
...,...,...,...,...
395,46,41000,1,False
396,51,23000,1,True
397,50,20000,1,False
398,36,33000,0,True


In [29]:
# Display the count of each class in the target variable 'Purchased' (0 = Not Purchased, 1 = Purchased)
dataset["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [30]:
# Define independent variables/features for the model
indep = dataset[["Age", "EstimatedSalary", "Gender_Male"]]

# Define dependent variable/target for prediction
dep = dataset["Purchased"]

In [31]:
# Check the shape (rows, columns) of the independent variables dataframe
indep.shape

(400, 3)

In [32]:
# Display the target variable 'Purchased' values
dep

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [33]:
# Split data into training and test sets
# test_size=1/3 means 33% data for testing, 67% for training
# random_state=0 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size=1/3, random_state=0)

In [34]:
# Instantiate a StandardScaler to standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler to the training data and transform it (apply scaling)
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data (use the mean and variance learned from training data)
X_test = sc.transform(X_test)

In [35]:
param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'C': [10, 100, 1000, 2000, 3000],
    'degree': [2, 3, 4],            # For 'poly' kernel
    'coef0': [0.0, 0.1, 0.5, 1.0]  # For 'poly' and 'sigmoid' kernels
    # You can add others if needed
}


grid = GridSearchCV(
    SVC(probability=True),  # <-- ADD probability=True here!
    param_grid,        # The dictionary of parameters to try (from your previous message)
    refit=True,        # After finding the best parameters, refit the model on the whole dataset
    verbose=3,         # Print detailed progress messages during the search (higher = more info)
    n_jobs=-1,         # Use all available CPU cores to speed up the search
    scoring='f1_weighted'  # Use the weighted F1 score to evaluate performance for each parameter set
)

# Fit the grid search to the training data (this will try all combinations in param_grid)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [36]:
# Print the best hyperparameter combination found during grid search
print(grid.best_params_)

# Get the full cross-validation results as a dictionary (contains scores, params, etc.)
re = grid.cv_results_

# Optionally, you can print all cross-validation results for analysis
#print(re)

# Use the best estimator found by grid search to make predictions on the test data
grid_predictions = grid.predict(X_test)

{'C': 1000, 'coef0': 0.5, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


In [37]:
# Compute the confusion matrix to evaluate the performance of the best logistic regression model on the test set
cm = confusion_matrix(y_test, grid_predictions)

In [38]:
# Generate a detailed classification report (precision, recall, f1-score, support) 
# to evaluate the performance of the best logistic regression model on the test set
clf_report = classification_report(y_test, grid_predictions)

In [39]:
# Calculate the weighted F1-score for the test set predictions made by the best model
f1_macro = f1_score(y_test, grid_predictions, average='weighted')

# Print the best hyperparameters and the corresponding F1-score
print("The weighted F1-score for the best parameters {}:".format(grid.best_params_), f1_macro)

The weighted F1-score for the best parameters {'C': 1000, 'coef0': 0.5, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}: 0.9256751954513149


In [40]:
# Print the confusion matrix for the model's predictions on the test set
print("The confusion Matrix:\n", cm)

The confusion Matrix:
 [[79  6]
 [ 4 45]]


In [41]:
# Print the detailed classification report showing precision, recall, f1-score, and support for each class
print("The report:\n", clf_report)

The report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        85
           1       0.88      0.92      0.90        49

    accuracy                           0.93       134
   macro avg       0.92      0.92      0.92       134
weighted avg       0.93      0.93      0.93       134



In [42]:
# Calculate the ROC AUC score using the predicted probabilities for the positive class (class 1)
# This measures the model's ability to distinguish between classes (higher is better)
roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.9639855942376951

In [43]:
# Convert the cross-validation results dictionary (from grid search) into a pandas DataFrame for easier analysis and visualization
table = pd.DataFrame.from_dict(re)
# Display the DataFrame containing all cross-validation results from the grid search
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_coef0,param_degree,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.041805,0.024829,0.007442,0.002814,10,0.0,2,auto,linear,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.776290,0.790949,0.698235,0.923510,0.901744,0.818146,0.083619,221
1,0.023616,0.006475,0.009663,0.002638,10,0.0,2,auto,rbf,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.867478,0.886792,0.869709,0.944161,0.943041,0.902236,0.034431,41
2,0.028019,0.004716,0.011647,0.004755,10,0.0,2,auto,poly,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.799620,0.822155,0.753180,0.783837,0.783837,0.788526,0.022571,351
3,0.026463,0.009075,0.008518,0.003874,10,0.0,2,auto,sigmoid,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.762677,0.738916,0.655795,0.796284,0.766556,0.744045,0.047743,388
4,0.018418,0.001910,0.007458,0.001553,10,0.0,2,scale,linear,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.776290,0.790949,0.698235,0.923510,0.901744,0.818146,0.083619,221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.013021,0.003534,0.003403,0.000370,3000,1.0,4,auto,sigmoid,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.582455,0.552427,0.556608,0.571900,0.587326,0.570143,0.013762,457
476,0.681973,0.140695,0.003158,0.000857,3000,1.0,4,scale,linear,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.776290,0.790949,0.698235,0.923510,0.901744,0.818146,0.083619,221
477,0.087539,0.017754,0.003803,0.000929,3000,1.0,4,scale,rbf,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.826263,0.866968,0.851527,0.847020,0.883278,0.855011,0.019206,187
478,4.605825,2.181529,0.003396,0.001021,3000,1.0,4,scale,poly,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.826263,0.846714,0.870362,0.906166,0.880769,0.866055,0.027573,175


In [None]:
# Get user input for prediction features

# Prompt the user to enter their age (as a float)
age_input = float(input("Age: "))

# Prompt the user to enter their Salary (as a float)
salary_input = float(input("Salary: "))

# Prompt the user to enter their sex (1 for male, 0 for female)
sex_male_input = int(input("Sex Male 0 or 1: "))

Age:  40


In [None]:
# Use the trained and tuned grid search model to predict the outcome for new user input
# The input should be in the same order and format as the training features
Future_Prediction = grid.predict([[age_input, salary_input, sex_male_input]]) 

# Print the prediction result
print("Future_Prediction={}".format(Future_Prediction))