In [60]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [61]:
# Loading the dataset from the CSV file named 'Social_Network_Ads.csv'
dataset=pd.read_csv("Social_Network_Ads.csv")

In [62]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [63]:
# Perform one-hot encoding on categorical variables and drop the first category to avoid the dummy variable trap (multicollinearity)
dataset=pd.get_dummies(dataset,drop_first=True)

In [64]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,True
1,15810944,35,20000,0,True
2,15668575,26,43000,0,False
3,15603246,27,57000,0,False
4,15804002,19,76000,0,True
...,...,...,...,...,...
395,15691863,46,41000,1,False
396,15706071,51,23000,1,True
397,15654296,50,20000,1,False
398,15755018,36,33000,0,True


In [65]:
# Drop the 'User ID' column as it is just an identifier and not useful for prediction
dataset = dataset.drop("User ID", axis=1)

In [66]:
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,True
1,35,20000,0,True
2,26,43000,0,False
3,27,57000,0,False
4,19,76000,0,True
...,...,...,...,...
395,46,41000,1,False
396,51,23000,1,True
397,50,20000,1,False
398,36,33000,0,True


In [67]:
# Display the count of each class in the target variable 'Purchased' (0 = Not Purchased, 1 = Purchased)
dataset["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [68]:
# Define independent variables/features for the model
indep = dataset[["Age", "EstimatedSalary", "Gender_Male"]]

# Define dependent variable/target for prediction
dep = dataset["Purchased"]

In [69]:
# Check the shape (rows, columns) of the independent variables dataframe
indep.shape

(400, 3)

In [70]:
# Display the target variable 'Purchased' values
dep

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [71]:
# Split data into training and test sets
# test_size=1/3 means 33% data for testing, 67% for training
# random_state=0 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size=1/3, random_state=0)

In [72]:
# Instantiate a StandardScaler to standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler to the training data and transform it (apply scaling)
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data (use the mean and variance learned from training data)
X_test = sc.transform(X_test)

In [73]:
param_grid = {
    # Function to measure the quality of a split
    'criterion': ['gini', 'entropy'],  
    
    # Number of features to consider at each split
    # 'sqrt': square root of total features
    # 'log2': log base 2 of total features
    'max_features': ['sqrt', 'log2'],
    
    # Strategy used to choose the split at each node
    # 'best': chooses the best split
    # 'random': chooses a random split
    'splitter': ['best', 'random'],
    
    # Maximum depth of the tree (None means unlimited depth)
    'max_depth': [None, 5, 10, 20],
    
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5, 10],
    
    # Minimum number of samples required to be at a leaf node
    'min_samples_leaf': [1, 2, 4],
}



# Set up GridSearchCV to find the best combination of parameters
grid = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced'),  # Handle class imbalance
    param_grid,                # Grid of hyperparameters to search
    refit=True,                # Refit best model on the full training set
    verbose=3,                 # Print progress
    n_jobs=-1,                 # Use all CPU cores
    scoring='f1_weighted'      # Good choice for imbalanced classification
)

# Fit the grid search to the training data (this will try all combinations in param_grid)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [74]:
# Print the best hyperparameter combination found during grid search
print(grid.best_params_)

# Get the full cross-validation results as a dictionary (contains scores, params, etc.)
re = grid.cv_results_

# Optionally, you can print all cross-validation results for analysis
#print(re)

# Use the best estimator found by grid search to make predictions on the test data
grid_predictions = grid.predict(X_test)

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'splitter': 'best'}


In [75]:
# Compute the confusion matrix to evaluate the performance of the best logistic regression model on the test set
cm = confusion_matrix(y_test, grid_predictions)

In [76]:
# Generate a detailed classification report (precision, recall, f1-score, support) 
# to evaluate the performance of the best logistic regression model on the test set
clf_report = classification_report(y_test, grid_predictions)

In [77]:
# Calculate the weighted F1-score for the test set predictions made by the best model
f1_macro = f1_score(y_test, grid_predictions, average='weighted')

# Print the best hyperparameters and the corresponding F1-score
print("The weighted F1-score for the best parameters {}:".format(grid.best_params_), f1_macro)

The weighted F1-score for the best parameters {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'splitter': 'best'}: 0.9038698827135286


In [78]:
# Print the confusion matrix for the model's predictions on the test set
print("The confusion Matrix:\n", cm)

The confusion Matrix:
 [[76  9]
 [ 4 45]]


In [79]:
# Print the detailed classification report showing precision, recall, f1-score, and support for each class
print("The report:\n", clf_report)

The report:
               precision    recall  f1-score   support

           0       0.95      0.89      0.92        85
           1       0.83      0.92      0.87        49

    accuracy                           0.90       134
   macro avg       0.89      0.91      0.90       134
weighted avg       0.91      0.90      0.90       134



In [80]:
# Calculate the ROC AUC score using the predicted probabilities for the positive class (class 1)
# This measures the model's ability to distinguish between classes (higher is better)
roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.9440576230492197

In [81]:
# Convert the cross-validation results dictionary (from grid search) into a pandas DataFrame for easier analysis and visualization
table = pd.DataFrame.from_dict(re)
# Display the DataFrame containing all cross-validation results from the grid search
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005545,0.002277,0.007931,0.000785,gini,,sqrt,1,2,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.826263,0.850809,0.833323,0.851527,0.885265,0.849437,0.020428,136
1,0.006848,0.002840,0.008973,0.001407,gini,,sqrt,1,2,random,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.849794,0.846714,0.738451,0.813179,0.847020,0.819031,0.042482,193
2,0.006878,0.002892,0.007993,0.002346,gini,,sqrt,1,5,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.822092,0.888107,0.833653,0.851527,0.865054,0.852087,0.023261,126
3,0.005206,0.001319,0.009918,0.000824,gini,,sqrt,1,5,random,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.851852,0.832918,0.796494,0.870362,0.961755,0.862676,0.055242,94
4,0.007722,0.001923,0.010264,0.004073,gini,,sqrt,1,10,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.774691,0.871337,0.833653,0.870362,0.962636,0.862536,0.061171,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,0.007835,0.001147,0.008434,0.001165,entropy,20,log2,4,2,random,"{'criterion': 'entropy', 'max_depth': 20, 'max...",0.737140,0.804628,0.758030,0.925764,0.847020,0.814516,0.067391,199
284,0.005209,0.001810,0.009171,0.002695,entropy,20,log2,4,5,best,"{'criterion': 'entropy', 'max_depth': 20, 'max...",0.826263,0.850809,0.796494,0.906166,0.981233,0.872193,0.065332,71
285,0.004106,0.001247,0.005771,0.001434,entropy,20,log2,4,5,random,"{'criterion': 'entropy', 'max_depth': 20, 'max...",0.791752,0.846714,0.831098,0.718497,0.867097,0.811032,0.052447,202
286,0.004922,0.002327,0.006923,0.001724,entropy,20,log2,4,10,best,"{'criterion': 'entropy', 'max_depth': 20, 'max...",0.849794,0.907177,0.851527,0.925272,0.981233,0.903001,0.049222,2


In [82]:
# Get user input for prediction features

# Prompt the user to enter their age (as a float)
age_input = float(input("Age: "))

# Prompt the user to enter their Salary (as a float)
salary_input = float(input("Salary: "))

# Prompt the user to enter their sex (1 for male, 0 for female)
sex_male_input = int(input("Sex Male 0 or 1: "))

Age:  19
Salary:  19000
Sex Male 0 or 1:  0


In [83]:
# Use the trained and tuned grid search model to predict the outcome for new user input
# The input should be in the same order and format as the training features
Future_Prediction = grid.predict([[age_input, salary_input, sex_male_input]]) 

# Print the prediction result
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[1]


In [87]:

# 1. Define test samples
test_samples = pd.DataFrame([
    {"Age": 19, "EstimatedSalary": 19000, "Gender_Male": 1},
    {"Age": 26, "EstimatedSalary": 43000, "Gender_Male": 0},
    {"Age": 35, "EstimatedSalary": 20000, "Gender_Male": 1},
    {"Age": 27, "EstimatedSalary": 57000, "Gender_Male": 0},
    {"Age": 19, "EstimatedSalary": 76000, "Gender_Male": 1},
    {"Age": 50, "EstimatedSalary": 12000, "Gender_Male": 0},
    {"Age": 45, "EstimatedSalary": 140000, "Gender_Male": 1},
    {"Age": 35, "EstimatedSalary": 30000, "Gender_Male": 0}
])

# 2. Scale test data using the same StandardScaler used in training
scaled_test_samples = sc.transform(test_samples)

# 3. Predict with the best model
best_grid = grid.best_estimator_
print("✅ Best Model Found by GridSearchCV:\n", best_grid)

test_predictions = best_grid.predict(scaled_test_samples)

# 4. Display formatted results
for i, pred in enumerate(test_predictions):
    status = "🟢 Purchased" if pred == 1 else "🔵 Not Purchased"
    print(f"🔹 Test Case {i+1}: {status}")

# 5. Future single prediction with scaling
future_input = sc.transform([[19, 19000, 1]])
future_prediction = best_grid.predict(future_input)
print("Future_Prediction = {}".format(future_prediction))

✅ Best Model Found by GridSearchCV:
 DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5)
🔹 Test Case 1: 🔵 Not Purchased
🔹 Test Case 2: 🔵 Not Purchased
🔹 Test Case 3: 🔵 Not Purchased
🔹 Test Case 4: 🔵 Not Purchased
🔹 Test Case 5: 🔵 Not Purchased
🔹 Test Case 6: 🟢 Purchased
🔹 Test Case 7: 🟢 Purchased
🔹 Test Case 8: 🔵 Not Purchased
Future_Prediction = [0]


