## Open World (unmon_standard10.pkl)

### 1. Data Cleaning & Pre-processing
**unmon_standard10.pkl**: This file includes data from "unmonitored" websites.

   - Instances: 10,000

In [None]:
# Mount Google Drive at /content/drive so the pickled inputs loaded below are
# reachable. The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import numpy as np

In [None]:
# Load monitored instances: X1 and X2 are unpickled from Google Drive.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
with open('/content/drive/My Drive/Machine Learning Project/CODES/X1.pkl', 'rb') as x1_file, \
     open('/content/drive/My Drive/Machine Learning Project/CODES/X2.pkl', 'rb') as x2_file:
    X1 = pickle.load(x1_file)
    X2 = pickle.load(x2_file)

In [None]:
# Load unmonitored instances: the unmonitored counterparts of X1 and X2.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
with open('/content/drive/My Drive/Machine Learning Project/CODES/X1_unmonitored.pkl', 'rb') as x1_file, \
     open('/content/drive/My Drive/Machine Learning Project/CODES/X2_unmonitored.pkl', 'rb') as x2_file:
    X1_unmonitored = pickle.load(x1_file)
    X2_unmonitored = pickle.load(x2_file)

In [None]:
# Get a partial dataset to keep class balance: the same number of traces is
# drawn from the monitored and from the unmonitored set (half the size of the
# unmonitored set).
num_samples_to_keep = len(X1_unmonitored) // 2

# Seeded generator so the subsample, and every feature and score derived from
# it, is reproducible after a kernel restart.
rng = np.random.default_rng(42)

# Randomly select a subset of indices (without replacement)
selected_indices = rng.choice(len(X1), num_samples_to_keep, replace=False)
selected_indices_unmonitored = rng.choice(len(X1_unmonitored), num_samples_to_keep, replace=False)

# Use these indices to extract the subset of data. The same indices are applied
# to X1 and X2 so each timestamp sequence stays paired with its size sequence.
# NOTE: the loaded lists are overwritten here, so re-run the load cells before
# re-running this cell.
X1 = [X1[i] for i in selected_indices]
X2 = [X2[i] for i in selected_indices]
X1_unmonitored = [X1_unmonitored[i] for i in selected_indices_unmonitored]
X2_unmonitored = [X2_unmonitored[i] for i in selected_indices_unmonitored]

# Verify lengths of sampled data
print(f'Length of sampled X1: {len(X1)}')
print(f'Length of sampled X2: {len(X2)}\n')
print(f'Length of sampled X1_unmonitored: {len(X1_unmonitored)}')
print(f'Length of sampled X2_unmonitored: {len(X2_unmonitored)}')

Length of sampled X1: 5000
Length of sampled X2: 5000

Length of sampled X1_unmonitored: 5000
Length of sampled X2_unmonitored: 5000


In [None]:
# Label 1 for every monitored instance kept by the sampling step
y = [1 for _ in range(num_samples_to_keep)]

# Label 0 for every unmonitored instance kept by the sampling step
y_unmonitored = [0 for _ in range(num_samples_to_keep)]

In [None]:
# Append the unmonitored instances after the monitored ones, in place, so that
# X1, X2 and y each hold both classes in the same order
X1 += X1_unmonitored
X2 += X2_unmonitored
y += y_unmonitored

In [None]:
# The unmonitored lists are now part of X1, X2 and y; drop the separate names
del X1_unmonitored
del X2_unmonitored
del y_unmonitored

### 2a. Feature Extraction (Continuous Features)

1. Sequence of packet timestamps (X1)
2. Sequence of packet sizes (X2)
3. Sequence of cumulative packet sizes
4. Sequence of bursts



Continuous Feature 3: Sequence of Cumulative Packet Sizes

In [None]:
# Running total of the signed packet sizes within each sequence
cumulative_sizes = list(map(np.cumsum, X2))

# Preview the first 10 cumulative values of the 1st element
print("First 10 values of cumulative sizes:")
first_sequence_totals = cumulative_sizes[0]
print(first_sequence_totals[:10])

First 10 values of cumulative sizes:
[ -512 -1024  -512 -1024  -512 -1024  -512     0  -512 -1024]


Continuous Feature 4: Sequence of Bursts

In [None]:
def calculate_bursts_and_durations(X1, X2):
    """Group consecutive same-direction packets of each trace into bursts.

    A burst is a run of consecutive packets whose sizes share the same sign.
    A burst ends as soon as a packet with a different sign (or a zero size)
    follows a non-zero running total.

    Args:
        X1: Iterable of per-trace timestamp sequences.
        X2: Iterable of per-trace signed packet-size sequences, aligned with
            X1 element by element.

    Returns:
        A tuple ``(burst_duration, seq_of_bursts)`` with one list per trace:
        ``burst_duration[k]`` holds the elapsed time of each burst of trace k
        and ``seq_of_bursts[k]`` the signed size total of each burst. An empty
        trace produces a single burst of size 0 with duration 0.0.
    """
    seq_of_bursts = []
    burst_duration = []

    for timestamps, sizes in zip(X1, X2):
        burst = []
        duration = []

        current_size = 0    # signed size total of the burst being built
        current_time = 0.0  # time elapsed since the current burst started
        # The first burst is measured from t = 0.0 rather than from the first
        # packet's timestamp.
        time_start = 0.0

        for time, size in zip(timestamps, sizes):
            same_direction = (size > 0 and current_size > 0) or (size < 0 and current_size < 0)
            if current_size == 0 or same_direction:
                # Packet continues (or opens) the current burst
                current_size += size
                current_time = time - time_start
            else:
                # Direction changed: close the burst and start a new one here
                burst.append(current_size)
                duration.append(current_time)
                current_size = size
                current_time = 0.0
                time_start = time

        # Flush the burst that is still open. current_time always equals
        # (last timestamp - time_start) after the loop, and unlike the loop
        # variable it is also defined (and not stale) when the trace is empty.
        burst.append(current_size)
        duration.append(current_time)
        seq_of_bursts.append(burst)
        burst_duration.append(duration)

    return burst_duration, seq_of_bursts

# Extract bursts and their durations for every trace in the combined dataset
burst_duration, seq_of_bursts = calculate_bursts_and_durations(X1, X2)

# Preview the first 10 durations, then the first 10 burst sizes, of the 1st trace
for per_trace_values in (burst_duration, seq_of_bursts):
    print(per_trace_values[0][:10])

[0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08000000000000007, 0.0, 0.0, 0.0]
[-1024, 512, -512, 512, -512, 1024, -6144, 1024, -512, 512]


### 2b. Feature Extraction (Categorical Features)



1. Number of incoming packets
2. Number of incoming packets as a fraction of the total number of packets
3. Number of outgoing packets
4. Number of outgoing packets as a fraction of the total number of packets
5. Total number of packets
6. Packet rate
7. Incoming packet rate (client to server)
8. Outgoing packet rate (server to client)
9. Average time gap
10. Total incoming bytes
11. Total outgoing bytes
12. Total incoming bursts
13. Total outgoing bursts
14. Total bursts
15. Average inter-arrival time for incoming packets per sample
16. Average inter-departure time for outgoing packets per sample
17. Total burst duration



In [None]:
# 1. Number of incoming packets (entries of X2 with a positive value)
incoming_packets = [sum(1 for size in size_seq if size > 0) for size_seq in X2]

# 3. Number of outgoing packets (entries of X2 with a negative value)
outgoing_packets = [sum(1 for size in size_seq if size < 0) for size_seq in X2]

# 2. Number of incoming packets as a fraction of the total number of packets.
# The counts from above are reused; an empty sequence yields 0.0 instead of a
# division by zero.
fraction_incoming_packets = [
    count / len(size_seq) if len(size_seq) else 0.0
    for count, size_seq in zip(incoming_packets, X2)
]

# 4. Number of outgoing packets as a fraction of the total number of packets
fraction_outgoing_packets = [
    count / len(size_seq) if len(size_seq) else 0.0
    for count, size_seq in zip(outgoing_packets, X2)
]

# Print first 10 values of the resulting arrays
print("Incoming Packets Array:")
print(incoming_packets[:10])

print("\nFraction of Incoming Packets Array:")
print(fraction_incoming_packets[:10])

print("\nOutgoing Packets Array:")
print(outgoing_packets[:10])

print("\nFraction of Outgoing Packets Array:")
print(fraction_outgoing_packets[:10])

Incoming Packets Array:
[305, 36, 566, 515, 28, 214, 25, 188, 242, 942]

Fraction of Incoming Packets Array:
[0.11054729974628488, 0.17647058823529413, 0.056776005617413985, 0.16835567178816607, 0.1917808219178082, 0.12707838479809977, 0.09881422924901186, 0.10118406889128095, 0.05485040797824116, 0.15191098209966133]

Outgoing Packets Array:
[2454, 168, 9403, 2544, 118, 1470, 228, 1670, 4170, 5259]

Fraction of Outgoing Packets Array:
[0.8894527002537151, 0.8235294117647058, 0.943223994382586, 0.831644328211834, 0.8082191780821918, 0.8729216152019003, 0.9011857707509882, 0.898815931108719, 0.9451495920217589, 0.8480890179003386]


In [None]:
# 5. Total number of packets
total_packets = [len(size_seq) for size_seq in X2]

# Time span covered by each timestamp sequence (0 when there are fewer than two
# packets). Computed once and shared by the three rate features below.
trace_spans = [max(seq) - min(seq) if len(seq) > 1 else 0 for seq in X1]

# 6. Packet Rate: packets per unit of time for each sequence.
# A span of 0 (single packet, or all packets sharing one timestamp) gives a
# rate of 0 instead of a division by zero.
packet_rate = [len(seq) / span if span > 0 else 0 for seq, span in zip(X1, trace_spans)]

# 7. Incoming packet rate (client to server)
incoming_packet_rate = [
    sum(1 for size in sizes if size > 0) / span if span > 0 else 0
    for sizes, span in zip(X2, trace_spans)
]

# 8. Outgoing packet rate (server to client)
outgoing_packet_rate = [
    sum(1 for size in sizes if size < 0) / span if span > 0 else 0
    for sizes, span in zip(X2, trace_spans)
]

print("Total Packets Array:")
print(total_packets[:10])

print("\nPacket Rate:")
print(packet_rate[:10])

print("\nIncoming Packet Rate:")
print(incoming_packet_rate[:10])

print("\nOutgoing Packet Rate:")
print(outgoing_packet_rate[:10])

Total Packets Array:
[2759, 204, 9969, 3059, 146, 1684, 253, 1858, 4412, 6201]

Packet Rate:
[229.53410981697172, 3.248924988055423, 1296.3589076723017, 106.32603406326034, 4.439039221647917, 150.2230151650312, 68.75, 32.56221521205748, 332.9811320754717, 108.39014158363922]

Incoming Packet Rate:
[25.374376039933445, 0.5733397037744864, 73.60208062418725, 17.900590893291625, 0.8513225904530253, 19.09009812667261, 6.7934782608695645, 3.2947774272695405, 18.264150943396228, 16.46565285789198]

Outgoing Packet Rate:
[204.15973377703827, 2.6755852842809364, 1222.7568270481145, 88.42544316996872, 3.587716631194892, 131.13291703835858, 61.95652173913043, 29.26743778478794, 314.7169811320755, 91.92448872574725]


In [None]:
# 9. Average Time Gap: mean difference between consecutive timestamps of each
# sequence in X1
avg_time_gaps = []

for seq in X1:
    # Fewer than two timestamps means there is no gap to average
    if len(seq) <= 1:
        avg_time_gaps.append(0)
        continue

    gap_total = sum(later - earlier for earlier, later in zip(seq, seq[1:]))
    # A sequence of n timestamps has n - 1 gaps
    avg_time_gaps.append(gap_total / (len(seq) - 1))

print("\nAverage Time Gaps:")
print(avg_time_gaps[:10])


Average Time Gaps:
[0.004358230601885424, 0.3093103448275862, 0.0007714686998394864, 0.009408109875735775, 0.22682758620689655, 0.006660724896019014, 0.014603174603174604, 0.03072697899838449, 0.003003854001360236, 0.00922741935483871]


In [None]:
# 10. Total incoming bytes: sum of the positive sizes of each sample
incoming_bytes = [sum(size for size in sample if size > 0) for sample in X2]

# 11. Total outgoing bytes: magnitude of the summed negative sizes of each sample
outgoing_bytes = [abs(sum(size for size in sample if size < 0)) for sample in X2]

# Show the totals for the first 10 samples
print(f'Incoming Bytes: {incoming_bytes[:10]}')
print(f'Outgoing Bytes: {outgoing_bytes[:10]}')

Incoming Bytes: [156160, 18432, 289792, 263680, 14336, 109568, 12800, 96256, 123904, 482304]
Outgoing Bytes: [1256448, 86016, 4814336, 1302528, 60416, 752640, 116736, 855040, 2135040, 2692608]


In [None]:
# 12. Number of incoming bursts per sample (bursts with a positive total)
total_incoming_bursts = [sum(1 for val in sample if val > 0) for sample in seq_of_bursts]

# 13. Number of outgoing bursts per sample (bursts with a negative total)
total_outgoing_bursts = [sum(1 for val in sample if val < 0) for sample in seq_of_bursts]

# 14. Burst count for each sample
burst_count = list(map(len, seq_of_bursts))

# Show the three burst features for the first 10 samples
print(f"Total Incoming Bursts: {total_incoming_bursts[:10]}")
print(f"Total Outgoing Bursts: {total_outgoing_bursts[:10]}")
print(f"Burst Count: {burst_count[:10]}")



Total Incoming Bursts: [144, 21, 339, 167, 12, 108, 15, 92, 164, 330]
Total Outgoing Bursts: [145, 22, 340, 168, 13, 108, 16, 92, 164, 330]
Burst Count: [289, 43, 679, 335, 25, 216, 31, 184, 328, 660]


In [None]:
# 15. Average inter-arrival time for incoming packets per sample
avg_interarrival_times = []

for sample_packets, sample_directions in zip(X1, X2):
    # Timestamps of the packets whose X2 value is positive (incoming)
    incoming_packet_times = [
        packet_time
        for packet_time, direction in zip(sample_packets, sample_directions)
        if direction > 0
    ]

    if len(incoming_packet_times) > 1:
        # Differences between consecutive incoming timestamps, then their mean
        interarrival_times = [
            later - earlier
            for earlier, later in zip(incoming_packet_times, incoming_packet_times[1:])
        ]
        avg_interarrival_times.append(sum(interarrival_times) / len(interarrival_times))
    else:
        # Zero or one incoming packet: no gap exists, record 0
        avg_interarrival_times.append(0)

# Show the result for the first 10 samples
print("Average inter-arrival time for incoming packets per sample:")
for sample_number, sample_average in enumerate(avg_interarrival_times[:10], start=1):
    print(f"Sample {sample_number}: {sample_average}")


Average inter-arrival time for incoming packets per sample:
Sample 1: 0.03888157894736842
Sample 2: 1.7794285714285714
Sample 3: 0.013433628318584072
Sample 4: 0.0407976653696498
Sample 5: 0.4822222222222222
Sample 6: 0.0515962441314554
Sample 7: 0.13583333333333333
Sample 8: 0.30406417112299466
Sample 9: 0.054315352697095434
Sample 10: 0.06038257173219979


In [None]:
# 16. Average inter-departure time for outgoing packets per sample
avg_interdepart_times = []

for sample_packets, sample_directions in zip(X1, X2):
    # Timestamps of the packets whose X2 value is negative (outgoing)
    outgoing_packet_times = [
        packet_time
        for packet_time, direction in zip(sample_packets, sample_directions)
        if direction < 0
    ]

    if len(outgoing_packet_times) > 1:
        # Differences between consecutive outgoing timestamps, then their mean
        interdepart_times = [
            later - earlier
            for earlier, later in zip(outgoing_packet_times, outgoing_packet_times[1:])
        ]
        avg_interdepart_times.append(sum(interdepart_times) / len(interdepart_times))
    else:
        # Zero or one outgoing packet: no gap exists, record 0
        avg_interdepart_times.append(0)

# Show the result for the first 10 samples
print("Average inter-departure time for outgoing packets per sample:")
for sample_number, sample_average in enumerate(avg_interdepart_times[:10], start=1):
    print(f"Sample {sample_number}: {sample_average}")


Average inter-departure time for outgoing packets per sample:
Sample 1: 0.004900122299225438
Sample 2: 0.3759880239520958
Sample 3: 0.0008179110827483514
Sample 4: 0.011313409359024773
Sample 5: 0.2811111111111111
Sample 6: 0.007562968005445881
Sample 7: 0.01621145374449339
Sample 8: 0.006411024565608148
Sample 9: 0.002067642120412569
Sample 10: 0.008818942563712438


In [None]:
# 17. Total burst duration: add up the durations of all bursts of each sample
total_burst_duration = list(map(sum, burst_duration))
print(total_burst_duration[:10])

[3.1399999999999983, 1.0599999999999956, 2.600000000000005, 8.77, 1.1200000000000006, 3.749999999999998, 0.99, 50.339999999999996, 8.120000000000001, 28.189999999999998]


### 3a. Model Training


In [None]:
# Assemble the per-sample features into one list of feature columns.
# Only the first nine features are active; the commented-out entries are
# features computed above that are currently excluded from the model input.
X = [
    incoming_packets,
    fraction_incoming_packets,
    outgoing_packets,
    fraction_outgoing_packets,
    total_packets,
    packet_rate,
    incoming_packet_rate,
    outgoing_packet_rate,
    avg_time_gaps,
    # incoming_bytes,
    # outgoing_bytes,
    # total_incoming_bursts,
    # total_outgoing_bursts,
    # burst_count,
    # avg_interarrival_times,
    # avg_interdepart_times,
    # total_burst_duration
]

# NOTE(review): the numbers below look like per-feature importance scores from
# an earlier experiment, listed in descending order. The code that produced
# them is not part of this notebook section — confirm their origin.
# total_packets: 0.08391385637928211
# fraction_incoming_packets: 0.07525438164233617
# outgoing_packets: 0.07397300171079065
# fraction_outgoing_packets: 0.07302431700858097
# outgoing_bytes: 0.07264166480526325
# incoming_bytes: 0.06342574766295982
# incoming_packets: 0.06198666583613548
# total_burst_duration: 0.05929302192406632
# burst_count: 0.05624511174559692
# total_outgoing_bursts: 0.05452631240160152
# total_incoming_bursts: 0.05417409771603582
# avg_interarrival_times: 0.051337481385899435
# avg_interdepart_times: 0.04994909975727552
# incoming_packet_rate: 0.047878438278210256
# outgoing_packet_rate: 0.04129542002349407
# packet_rate: 0.040831661921218544
# avg_time_gaps: 0.04024971980125318

# Transpose the feature matrix X to have samples as rows and features as columns
X = np.array(X).T

# Labels as a NumPy array, aligned row by row with X
y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets (75% / 25%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create an SVM classifier.
# The RBF kernel is distance based, and the features range from packet counts
# in the thousands to time gaps well below one. Standardising inside a pipeline
# keeps the large-valued features from dominating, and the scaler is fitted on
# the training rows only. X_train / X_test themselves stay unscaled.
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))

# Train the SVM model
svm.fit(X_train, y_train)

### 3b. Model Testing


In [None]:
# Class predictions of the fitted SVM for the held-out rows
y_pred = svm.predict(X_test)

# Overall accuracy, printed with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support
svm_report = classification_report(y_test, y_pred)
print(svm_report)

Accuracy: 0.58
              precision    recall  f1-score   support

           0       0.57      0.64      0.60      1228
           1       0.60      0.53      0.57      1272

    accuracy                           0.58      2500
   macro avg       0.59      0.59      0.58      2500
weighted avg       0.59      0.58      0.58      2500



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Define parameter search space.
# The estimator is a StandardScaler + SVC pipeline, so the SVC parameters are
# addressed through the 'svc__' prefix (make_pipeline names each step after its
# lower-cased class name).
param_grid = {'svc__C': [0.1, 1, 10, 100, 1000],
              'svc__gamma': [0.1, 0.01, 0.001, 0.0001],
              'svc__kernel': ['rbf']}

# Scaling happens inside the pipeline, so it is refitted on the training part
# of every CV fold and the validation part never influences the scaler.
grid = GridSearchCV(make_pipeline(StandardScaler(), SVC()), param_grid, refit = True, verbose = 3)

# Fit the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.503 total time=   1.9s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.503 total time=   1.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.503 total time=   1.9s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.503 total time=   1.9s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.503 total time=   1.8s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.503 total time=   3.4s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.503 total time=   3.5s
[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.505 total time=   1.9s
[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.506 total time=   1.9s
[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.506 total time=   1.9s
[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.527 total time=   2.0s
[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf

In [None]:
# Show the winning parameter combination, then the estimator refitted with it
for search_outcome in (grid.best_params_, grid.best_estimator_):
    print(search_outcome)

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1, gamma=0.0001)


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Evaluate the model with the best parameters (the grid search refits the best
# estimator on the whole training set, so grid.predict uses it directly)
grid_pred = grid.predict(X_test)
print("accuracy on test dataset: {}".format(accuracy_score(y_test, grid_pred)))
print("recall on test dataset: {}".format(recall_score(y_test, grid_pred)))
# Precision has to come from precision_score; calling recall_score here would
# simply print the recall a second time under the wrong label.
print("precision on test dataset: {}".format(precision_score(y_test, grid_pred)))


accuracy on test dataset: 0.6336
recall on test dataset: 0.6352201257861635
precision on test dataset: 0.6352201257861635


In [None]:
# Fit an RBF SVM with hand-picked hyperparameters and evaluate it on the test split.
# NOTE(review): gamma=0.001 differs from the gamma=0.0001 reported as best by
# the grid-search output above — confirm whether this is a deliberate
# comparison or a stale value.
svm_test = SVC(kernel='rbf', C=1, gamma=0.001, random_state=42)

svm_test.fit(X_train, y_train)

# Predict on the test set
y_pred_test = svm_test.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy: {accuracy:.2f}")

# Get a classification report
print(classification_report(y_test, y_pred_test))

Accuracy: 0.63
              precision    recall  f1-score   support

           0       0.61      0.71      0.66      1228
           1       0.67      0.55      0.60      1272

    accuracy                           0.63      2500
   macro avg       0.64      0.63      0.63      2500
weighted avg       0.64      0.63      0.63      2500



### 4. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic Regression.
# The features differ in magnitude by several orders, which hampers the solver
# and makes the L2 penalty act unevenly across features. Standardising inside
# a pipeline fixes both, and the scaler only ever sees the training rows.
logreg = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
logreg.fit(X_train, y_train)

In [None]:
# Model Testing: predictions of the fitted logistic regression on the test split
y_pred = logreg.predict(X_test)

# Overall accuracy with two decimals, followed by the per-class report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

Accuracy: 0.53
              precision    recall  f1-score   support

           0       0.51      0.60      0.55      1228
           1       0.54      0.46      0.50      1272

    accuracy                           0.53      2500
   macro avg       0.53      0.53      0.52      2500
weighted avg       0.53      0.53      0.52      2500



In [None]:
# Hyperparameter tuning for logistic regression via 5-fold grid search
from sklearn.model_selection import GridSearchCV

# Candidate values for the inverse regularisation strength C
param_grid = dict(C=[0.1, 1, 10, 100, 1000, 10000])
grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the most recently fitted grid search
best_params = grid.best_params_
print(best_params)

{'C': 0.1}


In [None]:
# Evaluate the refitted best estimator of the grid search on the test split
grid_pred = grid.predict(X_test)
print("accuracy on test dataset: {}".format(accuracy_score(y_test, grid_pred)))
print("recall on test dataset: {}".format(recall_score(y_test, grid_pred)))
# Precision has to come from precision_score; calling recall_score here would
# simply print the recall a second time under the wrong label.
print("precision on test dataset: {}".format(precision_score(y_test, grid_pred)))

accuracy on test dataset: 0.5256
recall on test dataset: 0.45754716981132076
precision on test dataset: 0.45754716981132076


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# New Logistic Regression Model with the best 'C' value we got from Grid Search.
# The value is read from the fitted search object so it cannot drift away from
# the search result (a hard-coded 1.0 is just the LogisticRegression default
# and reproduces the untuned model).
# NOTE: `grid` must be the logistic-regression search fitted in the cells
# directly above; run the notebook top to bottom.
best_c_value = grid.best_params_['C']
logreg = LogisticRegression(C=best_c_value, random_state=42)

# Train the model on the training data
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Get a classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.53
              precision    recall  f1-score   support

           0       0.51      0.60      0.55      1228
           1       0.54      0.46      0.50      1272

    accuracy                           0.53      2500
   macro avg       0.53      0.53      0.52      2500
weighted avg       0.53      0.53      0.52      2500



In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import re

# Per-model metrics, filled in by the loop below and read by the plotting cell
results = {}

# Labels are used as built earlier: 1 for monitored, 0 for unmonitored instances
y_binary = y

# Split the data into training and testing sets BEFORE scaling, so the scaler
# statistics are computed from the training rows only (no test-set leakage).
# The split depends only on the number of rows and random_state, so it selects
# the same rows as splitting an already-scaled matrix would.
X_train_raw, X_test_raw, y_train_binary, y_test_binary = train_test_split(
    X, y_binary, test_size=0.25, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_binary = scaler.fit_transform(X_train_raw)
X_test_binary = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any code that uses it
X_scaled_binary = scaler.transform(X)

# Define models (seeded where the estimator is randomised, for reproducibility)
models = {
    'SVM': SVC(),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'LogisticRegression' : LogisticRegression()
}

# Hyperparameter search space of each model
param_grids = {
    'SVM': {'C': [0.1, 1, 10, 100],
            'gamma': [0.001, 0.01, 0.1, 1],
            'kernel': ['rbf']},
    'RandomForest': {'n_estimators': [50, 100, 200],
                     'max_depth': [None, 10, 20],
                     'min_samples_split': [2, 5, 10],
                     'min_samples_leaf': [1, 2, 4]},
    'GradientBoosting': {'n_estimators': [50, 100, 200],
                         'learning_rate': [0.01, 0.1, 0.2],
                         'max_depth': [3, 5, 7]},
    'LogisticRegression': {'C': [0.1, 1, 10, 100],
                           'penalty': ['l2']},
}

# Iterate through models
for model_name, model in models.items():
    # Use GridSearchCV for hyperparameter tuning
    param_grid = param_grids.get(model_name, {})

    # Create GridSearchCV object
    grid = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)

    # Nested cross-validation: every outer fold runs its own grid search, which
    # gives a performance estimate that is not biased by the tuning itself.
    # This repeats the whole search five times and is therefore expensive.
    cv_scores = cross_val_score(grid, X_train_binary, y_train_binary, cv=5)
    print(f"Cross-validated {model_name} Accuracy: {np.mean(cv_scores):.2f} (± {np.std(cv_scores):.2f})")

    # Train the model on the entire training set
    grid.fit(X_train_binary, y_train_binary)

    # Print the best parameters and estimator
    print(f"Best parameters for {model_name}: {grid.best_params_}")
    print(f"Best estimator for {model_name}: {grid.best_estimator_}")

    # Predict on the test set
    y_pred = grid.predict(X_test_binary)

    # Evaluate the model
    accuracy = accuracy_score(y_test_binary, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Generate (and show) the classification report
    class_report = classification_report(y_test_binary, y_pred)
    print(class_report)

    # Per-class metrics taken directly from scikit-learn rather than parsed
    # back out of the formatted report text: no dependence on the report
    # layout and no rounding to two decimals. Classes are in sorted label
    # order, the same order the report uses.
    precision, recall, f1, _support = precision_recall_fscore_support(y_test_binary, y_pred)

    # Store results in the dictionary (plain lists, one value per class)
    results[model_name] = {
        'precision': precision.tolist(),
        'recall': recall.tolist(),
        'f1_score': f1.tolist(),
        'overall_accuracy': accuracy,
        'classification_report': class_report
    }

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Cross-validated SVM Accuracy: 0.63 (± 0.01)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for SVM: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best estimator for SVM: SVC(C=10, gamma=1)
SVM Accuracy: 0.63
              precision    recall  f1-score   support

           0       0.62      0.63      0.62      1228
           1       0.64      0.62      0.63      1272

    accuracy                           0.63      2500
   macro avg       0.63      0.63      0.63      2500
weighted avg       0.63      0.63      0.63      2500

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 fol

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
import re

# One grouped bar chart per model: precision, recall and F1 for every class,
# with the overall accuracy drawn as a horizontal reference line.
for model_name, result in results.items():
    precision = result['precision']
    recall = result['recall']
    f1_score = result['f1_score']
    overall_accuracy = result['overall_accuracy']
    classification_report_str = result['classification_report']

    # Class labels are recovered from the stored report text: the per-class
    # rows are the lines that contain a whitespace-delimited integer.
    lines = classification_report_str.split('\n')
    classes = [re.search(r'\s+(-?\d+)\s+', line).group(1) for line in lines if re.search(r'\s+(-?\d+)\s+', line)]

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_width = 0.2
    index = np.arange(len(classes))

    bar1 = ax.bar(index, precision, bar_width, label='Precision')
    bar2 = ax.bar(index + bar_width, recall, bar_width, label='Recall')
    bar3 = ax.bar(index + 2 * bar_width, f1_score, bar_width, label='F1-Score')

    ax.axhline(y=overall_accuracy, color='r', linestyle='--', label='Overall Accuracy')

    middle_position = index[-1] - 1.5 * bar_width
    ax.text(middle_position, overall_accuracy + 0.02, f'Accuracy: {overall_accuracy:.2f}', ha='center', va='bottom', color='r')

    ax.set_xlabel('Classes')
    ax.set_ylabel('Scores')
    ax.set_title(f'Classification Metrics - {model_name}')
    ax.set_xticks(index + bar_width)
    ax.set_xticklabels(classes)
    ax.legend()

    # Add values on top of the bars. Each bar group is paired with its own
    # metric list; zipping every group against precision + recall + f1_score
    # would label the recall and F1 bars with precision values.
    for bars, values in ((bar1, precision), (bar2, recall), (bar3, f1_score)):
        for bar, value in zip(bars, values):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, f'{value:.2f}', ha='center', va='bottom')

    plt.show()