## Open World (unmon_standard10.pkl)

### 1. Data Cleaning & Pre-processing
**unmon_standard10.pkl**: This file includes data from "unmonitored" websites.

   - Instances: 10,000

In [None]:
# Mount Google Drive (Colab only) so the pickled datasets stored under
# /content/drive can be opened by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import numpy as np

In [None]:
# Load the monitored instances: X1 (packet timestamps), X2 (packet sizes)
# and the class labels y.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
monitored_paths = {
    'X1': '/content/drive/My Drive/Machine Learning Project/CODES/X1.pkl',
    'X2': '/content/drive/My Drive/Machine Learning Project/CODES/X2.pkl',
    'y': '/content/drive/My Drive/Machine Learning Project/CODES/y.pkl',
}

monitored = {}
for name, path in monitored_paths.items():
    with open(path, 'rb') as file:
        monitored[name] = pickle.load(file)

X1, X2, y = monitored['X1'], monitored['X2'], monitored['y']

In [None]:
# Get a partial dataset to keep class balance: every label contributes the
# same number of instances (half the number of instances labelled 0).
import random
from collections import defaultdict

# Private, seeded generator: the same subset is drawn on every run and the
# global `random` state is left untouched.
sampler = random.Random(42)

num_samples_per_y = y.count(0)
num_sample = num_samples_per_y // 2

# Group instance indices by label in a single pass over y (instead of
# rescanning y once per label).
indices_by_label = defaultdict(list)
for idx, value in enumerate(y):
    indices_by_label[value].append(idx)

# Initialize lists to store sampled values
sampled_X1 = []
sampled_X2 = []
sampled_y = []

# Randomly sample num_sample instances for each value of y. Labels are visited
# in sorted order so the layout of the sampled lists is deterministic.
unique_y_values = set(y)
for val in sorted(unique_y_values):
    # random.Random.sample raises ValueError when a label has fewer than
    # num_sample instances, which would break the class balance anyway.
    sampled_indices = sampler.sample(indices_by_label[val], num_sample)
    sampled_X1.extend([X1[i] for i in sampled_indices])
    sampled_X2.extend([X2[i] for i in sampled_indices])
    sampled_y.extend([y[i] for i in sampled_indices])

# Verify the lengths of sampled lists
print("Sampled X1 length:", len(sampled_X1))
print("Sampled X2 length:", len(sampled_X2))
print("Sampled y length:", len(sampled_y))


Sampled X1 length: 9500
Sampled X2 length: 9500
Sampled y length: 9500


In [None]:
# Continue with the balanced monitored subset. Copies are taken (rather than
# aliasing the sampled_* lists) so that later in-place changes to X1/X2/y do
# not silently modify sampled_X1/sampled_X2/sampled_y, and re-running this
# cell always restores the monitored-only subset.
X1 = list(sampled_X1)
X2 = list(sampled_X2)
y = list(sampled_y)

In [None]:
# Load the unmonitored instances: X1_unmonitored (packet timestamps) and
# X2_unmonitored (packet sizes).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
unmonitored_paths = (
    '/content/drive/My Drive/Machine Learning Project/CODES/X1_unmonitored.pkl',
    '/content/drive/My Drive/Machine Learning Project/CODES/X2_unmonitored.pkl',
)

unmonitored = []
for path in unmonitored_paths:
    with open(path, 'rb') as file:
        unmonitored.append(pickle.load(file))

X1_unmonitored, X2_unmonitored = unmonitored

In [None]:
# Get a partial dataset to keep class balance: keep num_sample unmonitored
# instances.
# NOTE: this cell overwrites X1_unmonitored / X2_unmonitored with the subset;
# reload the pickles before re-running it if the full data is needed again.

# Seeded generator so the same subset is selected on every run.
unmonitored_rng = np.random.default_rng(42)

# Randomly select a subset of indices (without repetition)
selected_indices = unmonitored_rng.choice(len(X1_unmonitored), num_sample, replace=False)

# Use these indices to extract the subset of data
X1_unmonitored = [X1_unmonitored[i] for i in selected_indices]
X2_unmonitored = [X2_unmonitored[i] for i in selected_indices]

# Verify the lengths of sampled lists
print("Sampled X1_unmonitored length:", len(X1_unmonitored))
print("Sampled X2_unmonitored length:", len(X2_unmonitored))

Sampled X1_unmonitored length: 100
Sampled X2_unmonitored length: 100


In [None]:
# Label every sampled unmonitored instance with -1 (the unmonitored class).
# The list length follows the number of instances actually held in
# X1_unmonitored, so labels and samples cannot get out of step.
y_unmonitored = [-1] * len(X1_unmonitored)

In [None]:
# Concatenate monitored and unmonitored instances into 1 list for each array.
# New lists are built from the sampled monitored data instead of extending
# X1/X2/y in place, so re-running this cell cannot append the unmonitored
# instances a second time and the sampled_* lists stay untouched.
X1 = list(sampled_X1) + list(X1_unmonitored)
X2 = list(sampled_X2) + list(X2_unmonitored)
y = list(sampled_y) + list(y_unmonitored)

### 2a. Feature Extraction (Continuous Features)

1. Sequence of packet timestamps (X1)
2. Sequence of packet sizes (X2)
3. Sequence of cumulative packet sizes
4. Sequence of bursts



Continuous Feature 3: Sequence of Cumulative Packet Sizes

In [None]:
# Running total of the signed packet sizes, one array per trace in X2
cumulative_sizes = list(map(np.cumsum, X2))

# Preview: first 10 cumulative values of the first trace
print("First 10 values of cumulative sizes:")
first_trace_cumulative = cumulative_sizes[0]
print(first_trace_cumulative[:10])

First 10 values of cumulative sizes:
[ -512 -1024  -512 -1024  -512 -1024  -512     0  -512 -1024]


Continuous Feature 4: Sequence of Bursts

In [None]:
def calculate_bursts_and_durations(X1, X2):
    """Split every packet trace into bursts of consecutive same-direction packets.

    A burst is a run of consecutive packets whose sizes share the same sign.
    For each burst the signed sizes are summed, and its duration is the offset
    of the burst's last packet from the burst's reference time.

    Args:
        X1: iterable of timestamp sequences, one sequence per trace.
        X2: iterable of signed packet-size sequences, aligned with X1.

    Returns:
        A tuple ``(burst_duration, seq_of_bursts)`` -- durations come FIRST.
        Both are lists with one inner list per trace. A trace without packets
        yields two empty inner lists.

    Notes:
        * The first burst of a trace is timed from 0.0, not from the first
          timestamp (presumably traces start at time 0 -- TODO confirm).
        * Every later burst is timed from its own first packet, so a burst
          consisting of a single packet has duration 0.0.
    """
    seq_of_bursts = []
    burst_duration = []

    for timestamps, sizes in zip(X1, X2):
        burst = []
        duration = []

        current_size = 0    # signed size of the open burst; 0 means "no open burst yet"
        current_time = 0.0  # offset of the open burst's latest packet from time_start
        time_start = 0.0    # reference time of the open burst
        has_packets = False

        for timestamp, size in zip(timestamps, sizes):
            has_packets = True
            if current_size == 0 or (size > 0 and current_size > 0) or (size < 0 and current_size < 0):
                # Same direction as the open burst (or nothing open): extend it.
                current_size += size
                current_time = timestamp - time_start
            else:
                # Direction changed: close the open burst and start a new one here.
                burst.append(current_size)
                duration.append(current_time)
                current_size = size
                current_time = 0.0
                time_start = timestamp

        # Close the last open burst. current_time already equals
        # "last timestamp - time_start", so the loop variable is not needed
        # after the loop; previously an empty trace either raised
        # UnboundLocalError or reused the timestamp of the preceding trace.
        if has_packets:
            burst.append(current_size)
            duration.append(current_time)

        seq_of_bursts.append(burst)
        burst_duration.append(duration)

    return burst_duration, seq_of_bursts

# Note the return order: durations first, then the burst sizes.
burst_duration, seq_of_bursts = calculate_bursts_and_durations(X1, X2)

# Preview the first 10 bursts of the first trace (durations, then sizes)
for per_trace_values in (burst_duration, seq_of_bursts):
    print(per_trace_values[0][:10])

[0.12, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09999999999999987, 0.0, 0.0, 0.0]
[-1024, 512, -512, 512, -512, 1024, -7168, 512, -512, 512]


### 2b. Feature Extraction (Per-Trace Summary Features)


1. Number of incoming packets
2. Number of incoming packets as a fraction of the total number of packets
3. Number of outgoing packets
4. Number of outgoing packets as a fraction of the total number of packets
5. Total number of packets.
6. Packet rate
7. Incoming packet rate (client to server)
8. Outgoing packet rate (server to client)
9. Average time gap
10. Total incoming bytes
11. Total outgoing bytes
12. Total incoming bursts
13. Total outgoing bursts
14. Total bursts
15. Average Inter-arrival time for incoming packets per sample
16. Average inter-departure time for outgoing packets per sample
17. Total burst duration

In [None]:
# Direction is taken from the sign of the packet size: this notebook counts
# size > 0 as "incoming" and size < 0 as "outgoing".

# 1. Number of incoming packets
incoming_packets = [sum(1 for size in size_seq if size > 0) for size_seq in X2]

# 3. Number of outgoing packets
outgoing_packets = [sum(1 for size in size_seq if size < 0) for size_seq in X2]

# 2. Number of incoming packets as a fraction of the total number of packets
# (the counts from above are reused; an empty trace gets 0.0 instead of
# dividing by zero)
fraction_incoming_packets = [
    count / len(size_seq) if len(size_seq) else 0.0
    for count, size_seq in zip(incoming_packets, X2)
]

# 4. Number of outgoing packets as a fraction of the total number of packets
fraction_outgoing_packets = [
    count / len(size_seq) if len(size_seq) else 0.0
    for count, size_seq in zip(outgoing_packets, X2)
]

# Print first 10 values of the resulting arrays
for title, values in (
    ("Incoming Packets Array:", incoming_packets),
    ("\nFraction of Incoming Packets Array:", fraction_incoming_packets),
    ("\nOutgoing Packets Array:", outgoing_packets),
    ("\nFraction of Outgoing Packets Array:", fraction_outgoing_packets),
):
    print(title)
    print(values[:10])

Incoming Packets Array:
[562, 492, 175, 634, 482, 1635, 1645, 145, 283, 550]

Fraction of Incoming Packets Array:
[0.0712113532691333, 0.05, 0.09610104338275673, 0.06372499748718465, 0.04886455798864558, 0.31545437005595217, 0.31507374066270827, 0.06508078994614004, 0.15943661971830986, 0.07105025190543858]

Outgoing Packets Array:
[7330, 9348, 1646, 9315, 9382, 3548, 3576, 2083, 1492, 7191]

Fraction of Outgoing Packets Array:
[0.9287886467308667, 0.95, 0.9038989566172433, 0.9362750025128154, 0.9511354420113545, 0.6845456299440479, 0.6849262593372917, 0.9349192100538599, 0.8405633802816901, 0.9289497480945614]


In [None]:
# 5. Total number of packets
total_packets = [len(size_seq) for size_seq in X2]

# Time span covered by each trace (latest minus earliest timestamp), computed
# once and shared by the three rates below. Traces with fewer than two
# timestamps get a span of 0.
trace_spans = [max(seq) - min(seq) if len(seq) > 1 else 0 for seq in X1]

# A rate is reported as 0 when the span is 0. This covers traces with fewer
# than two packets and traces whose timestamps are all identical, which would
# otherwise divide by zero.

# 6. Packet Rate: Calculate the rate of packet arrival for each sequence
packet_rate = [
    len(seq) / span if span > 0 else 0
    for seq, span in zip(X1, trace_spans)
]

# 7. Incoming packet rate (client to server)
incoming_packet_rate = [
    sum(1 for size in sizes if size > 0) / span if span > 0 else 0
    for sizes, span in zip(X2, trace_spans)
]

# 8. Outgoing packet rate (server to client)
outgoing_packet_rate = [
    sum(1 for size in sizes if size < 0) / span if span > 0 else 0
    for sizes, span in zip(X2, trace_spans)
]

for title, values in (
    ("Total Packets Array:", total_packets),
    ("\nPacket Rate:", packet_rate),
    ("\nIncoming Packet Rate:", incoming_packet_rate),
    ("\nOutgoing Packet Rate:", outgoing_packet_rate),
):
    print(title)
    print(values[:10])

Total Packets Array:
[7892, 9840, 1821, 9949, 9864, 5183, 5221, 2228, 1775, 7741]

Packet Rate:
[364.1901245962159, 414.3157894736842, 98.91363389462249, 831.8561872909698, 854.0259740259739, 260.84549572219424, 282.98102981029814, 198.92857142857144, 46.97009790949987, 400.25853154084797]

Incoming Packet Rate:
[25.934471619750806, 20.71578947368421, 9.505703422053232, 53.01003344481605, 41.73160173160173, 82.28485153497735, 89.15989159891599, 12.946428571428573, 7.488753638528712, 28.43846949327818]

Outgoing Packet Rate:
[338.2556529764651, 393.6, 89.40793047256925, 778.8461538461538, 812.2943722943722, 178.5606441872169, 193.82113821138213, 185.98214285714286, 39.48134427097116, 371.8200620475698]


In [None]:
# 9. Average Time Gap: mean difference between consecutive timestamps of each
# sequence in X1. A sequence of n timestamps has n - 1 gaps; sequences with
# fewer than two timestamps get 0.
avg_time_gaps = [
    sum(later - earlier for earlier, later in zip(seq, seq[1:])) / (len(seq) - 1)
    if len(seq) > 1 else 0
    for seq in X1
]

print("\nAverage Time Gaps:")
print(avg_time_gaps[:10])


Average Time Gaps:
[0.002746166518818908, 0.0024138631974794187, 0.010115384615384615, 0.0012022517088862083, 0.001171043293115685, 0.003834426862215361, 0.0035344827586206895, 0.0050291872474180505, 0.021302142051860203, 0.002498708010335917]


In [None]:
# 10. Total incoming bytes: sum of the positive packet sizes of each sample
incoming_bytes = [sum(size for size in sample if size > 0) for sample in X2]

# 11. Total outgoing bytes: magnitude of the summed negative packet sizes
outgoing_bytes = [abs(sum(size for size in sample if size < 0)) for sample in X2]

# Preview the totals of the first 10 samples
print(f'Incoming Bytes: {incoming_bytes[:10]}')
print(f'Outgoing Bytes: {outgoing_bytes[:10]}')

Incoming Bytes: [287744, 251904, 89600, 324608, 246784, 837120, 842240, 74240, 144896, 281600]
Outgoing Bytes: [3752960, 4786176, 842752, 4769280, 4803584, 1816576, 1830912, 1066496, 763904, 3681792]


In [None]:
# 12. Number of incoming bursts per sample (bursts with a positive total size)
total_incoming_bursts = [
    sum(1 for val in sample if val > 0) for sample in seq_of_bursts
]

# 13. Number of outgoing bursts per sample (bursts with a negative total size)
total_outgoing_bursts = [
    sum(1 for val in sample if val < 0) for sample in seq_of_bursts
]

# 14. Burst count per sample, regardless of direction
burst_count = list(map(len, seq_of_bursts))

# Preview the counts of the first 10 samples
print(f"Total Incoming Bursts: {total_incoming_bursts[:10]}")
print(f"Total Outgoing Bursts: {total_outgoing_bursts[:10]}")
print(f"Burst Count: {burst_count[:10]}")



Total Incoming Bursts: [340, 350, 98, 364, 345, 211, 197, 95, 115, 333]
Total Outgoing Bursts: [340, 350, 98, 365, 345, 211, 197, 95, 116, 333]
Burst Count: [680, 700, 196, 729, 690, 422, 394, 190, 231, 666]


In [None]:
# 15. Average inter-arrival time for incoming packets per sample.
# Incoming packets are those with a positive size; a sample with fewer than
# two of them gets 0.
avg_interarrival_times = []

for sample_packets, sample_directions in zip(X1, X2):
    # Timestamps of the incoming packets only
    incoming_packet_times = [
        packet_time
        for packet_time, direction in zip(sample_packets, sample_directions)
        if direction > 0
    ]

    if len(incoming_packet_times) > 1:
        # Gaps between consecutive incoming packets, then their mean
        interarrival_times = [
            later - earlier
            for earlier, later in zip(incoming_packet_times, incoming_packet_times[1:])
        ]
        avg_interarrival_times.append(sum(interarrival_times) / len(interarrival_times))
    else:
        avg_interarrival_times.append(0)

# Preview the first 10 samples
print("Average inter-arrival time for incoming packets per sample:")
for sample_no, value in enumerate(avg_interarrival_times[:10], start=1):
    print(f"Sample {sample_no}: {value}")


Average inter-arrival time for incoming packets per sample:
Sample 1: 0.03841354723707665
Sample 2: 0.04798370672097759
Sample 3: 0.10517241379310344
Sample 4: 0.018278041074249605
Sample 5: 0.023762993762993765
Sample 6: 0.012086903304773562
Sample 7: 0.011137469586374698
Sample 8: 0.07694444444444443
Sample 9: 0.1324468085106383
Sample 10: 0.03493624772313297


In [None]:
# 16. Average inter-departure time for outgoing packets per sample.
# Outgoing packets are those with a negative size; a sample with fewer than
# two of them gets 0.
avg_interdepart_times = []

for sample_packets, sample_directions in zip(X1, X2):
    # Timestamps of the outgoing packets only
    outgoing_packet_times = [
        packet_time
        for packet_time, direction in zip(sample_packets, sample_directions)
        if direction < 0
    ]

    if len(outgoing_packet_times) > 1:
        # Gaps between consecutive outgoing packets, then their mean
        interdepart_times = [
            later - earlier
            for earlier, later in zip(outgoing_packet_times, outgoing_packet_times[1:])
        ]
        avg_interdepart_times.append(sum(interdepart_times) / len(interdepart_times))
    else:
        avg_interdepart_times.append(0)

# Preview the first 10 samples
print("Average inter-arrival time for outgoing packets per sample:")
for sample_no, value in enumerate(avg_interdepart_times[:10], start=1):
    print(f"Sample {sample_no}: {value}")


Average inter-arrival time for outgoing packets per sample:
Sample 1: 0.002956747168781553
Sample 2: 0.002540922221033487
Sample 3: 0.011191489361702127
Sample 4: 0.001284088468971441
Sample 5: 0.001073446327683616
Sample 6: 0.005599097829151395
Sample 7: 0.005160839160839161
Sample 8: 0.003208453410182517
Sample 9: 0.02534540576794098
Sample 10: 0.002689847009735744


In [None]:
# 17. Total burst duration: sum of the burst durations of each sample
total_burst_duration = list(map(sum, burst_duration))
print(total_burst_duration[:10])

[14.099999999999996, 14.029999999999998, 4.9000000000000075, 5.380000000000004, 5.92, 6.0600000000000005, 11.999999999999998, 6.530000000000001, 27.4, 9.940000000000003]


### 3a. Model Training


In [None]:
# Per-sample feature columns that make up the model input.
# packet_rate, incoming_packet_rate, outgoing_packet_rate and avg_time_gaps
# are computed above but deliberately left out; they are the four lowest
# entries in the importance ranking below.
feature_columns = [
    incoming_packets,
    fraction_incoming_packets,
    outgoing_packets,
    fraction_outgoing_packets,
    total_packets,
    incoming_bytes,
    outgoing_bytes,
    total_incoming_bursts,
    total_outgoing_bursts,
    burst_count,
    avg_interarrival_times,
    avg_interdepart_times,
    total_burst_duration,
]

# Feature importance (code is in feature_selection.ipynb)
# total_packets: 0.08391385637928211
# fraction_incoming_packets: 0.07525438164233617
# outgoing_packets: 0.07397300171079065
# fraction_outgoing_packets: 0.07302431700858097
# outgoing_bytes: 0.07264166480526325
# incoming_bytes: 0.06342574766295982
# incoming_packets: 0.06198666583613548
# total_burst_duration: 0.05929302192406632
# burst_count: 0.05624511174559692
# total_outgoing_bursts: 0.05452631240160152
# total_incoming_bursts: 0.05417409771603582
# avg_interarrival_times: 0.051337481385899435
# avg_interdepart_times: 0.04994909975727552
# incoming_packet_rate: 0.047878438278210256
# outgoing_packet_rate: 0.04129542002349407
# packet_rate: 0.040831661921218544
# avg_time_gaps: 0.04024971980125318

# feature_columns holds one list per feature; transposing turns it into the
# (samples x features) layout, with labels as a matching array.
X = np.array(feature_columns).T

y = np.array(y)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) data first. The held-out 25% is used only for the
# final evaluation; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Each model with its hyperparameter grid. Grid keys carry the 'clf__' prefix
# because every model is tuned as the 'clf' step of a Pipeline (see below).
model_specs = {
    'RandomForest': (
        RandomForestClassifier(random_state=42),
        {'clf__n_estimators': [50, 100, 200],
         'clf__max_depth': [None, 10, 20],
         'clf__min_samples_split': [2, 5, 10],
         'clf__min_samples_leaf': [1, 2, 4]},
    ),
    'GradientBoosting': (
        GradientBoostingClassifier(random_state=42),
        {'clf__n_estimators': [50, 100, 200],
         'clf__learning_rate': [0.01, 0.1, 0.2],
         'clf__max_depth': [3, 5, 7]},
    ),
    'KNN': (
        KNeighborsClassifier(),
        {'clf__n_neighbors': [3, 5, 7, 9],
         'clf__weights': ['uniform', 'distance'],
         'clf__p': [1, 2]},
    ),
}

# Kept for compatibility: name -> bare estimator
models = {name: estimator for name, (estimator, _) in model_specs.items()}

# Iterate through models
for model_name, (model, param_grid) in model_specs.items():
    # Feature scaling is a pipeline step, so the scaler is fitted only on the
    # training part of every split and never sees validation or test data.
    pipeline = Pipeline([('scaler', StandardScaler()), ('clf', model)])

    # Use GridSearchCV for hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid, refit=True, verbose=3, n_jobs=-1)

    # Nested cross-validation on the training data only: a performance estimate
    # that includes the hyperparameter search and does not touch the test set.
    cv_scores = cross_val_score(grid, X_train, y_train, cv=5)
    print(f"Cross-validated {model_name} Accuracy: {np.mean(cv_scores):.2f} (± {np.std(cv_scores):.2f})")

    # Tune and refit on the entire training set
    grid.fit(X_train, y_train)

    # Print the best parameters and estimator
    print(f"Best parameters for {model_name}: {grid.best_params_}")
    print(f"Best estimator for {model_name}: {grid.best_estimator_}")

    # Predict on the held-out test set (data the model has never seen)
    y_pred = grid.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Hold out 25% of the feature matrix for testing; the fixed seed makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Decision tree with default hyperparameters and a fixed seed
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training split
clf.fit(X_train, y_train)

### 3b. Model Testing


In [None]:
# Decision tree predictions for the held-out test split
y_pred = clf.predict(X_test)

# Overall accuracy, followed by the per-class precision/recall/F1 report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

tree_report = classification_report(y_test, y_pred)
print(tree_report)

Accuracy: 0.48
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        27
           0       0.60      0.47      0.53        38
           1       0.22      0.27      0.24        26
           2       0.67      0.75      0.71        24
           3       0.53      0.43      0.48        23
           4       0.40      0.43      0.42        23
           5       0.63      0.46      0.53        26
           6       0.57      0.54      0.55        24
           7       0.57      0.43      0.49        30
           8       0.57      0.62      0.59        21
           9       0.42      0.55      0.48        20
          10       0.44      0.24      0.31        29
          11       0.41      0.26      0.32        27
          12       0.70      0.86      0.78        22
          13       0.26      0.33      0.29        24
          14       0.41      0.43      0.42        28
          15       0.61      0.63      0.62        27
          16

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the decision tree
# (2 * 3 * 3 * 3 = 54 combinations)
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated search scored by accuracy; refit=True retrains the
# best combination on the whole training split afterwards.
grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=3,
    scoring='accuracy',
    refit=True,
    verbose=3,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2;, score=0.296 total time=   0.2s
[CV 2/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2;, score=0.329 total time=   0.2s
[CV 3/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2;, score=0.311 total time=   0.2s
[CV 1/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=5;, score=0.295 total time=   0.2s
[CV 2/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=5;, score=0.325 total time=   0.2s
[CV 3/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=5;, score=0.306 total time=   0.2s
[CV 1/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=10;, score=0.296 total time=   0.2s
[CV 2/3] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=10;, score=0.324 total time=   0.2s
[CV 3/3]

In [None]:
# Show the winning hyperparameters, then the estimator refitted with them
for search_result in (grid_search.best_params_, grid_search.best_estimator_):
    print(search_result)

{'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=42)


In [None]:
from sklearn.metrics import accuracy_score

# Predict the test split with the refitted best estimator of the grid search
grid_pred = grid_search.predict(X_test)

test_accuracy = accuracy_score(y_test, grid_pred)
print("accuracy on test dataset: {}".format(test_accuracy))

# Per-class precision/recall/F1 report
print(classification_report(y_test, grid_pred))

accuracy on test dataset: 0.49041666666666667
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        27
           0       0.42      0.34      0.38        38
           1       0.28      0.31      0.29        26
           2       0.62      0.83      0.71        24
           3       0.38      0.43      0.41        23
           4       0.62      0.35      0.44        23
           5       0.41      0.35      0.38        26
           6       0.71      0.83      0.77        24
           7       0.50      0.27      0.35        30
           8       0.70      0.67      0.68        21
           9       0.35      0.55      0.43        20
          10       0.50      0.28      0.36        29
          11       0.43      0.33      0.38        27
          12       0.67      0.91      0.77        22
          13       0.40      0.25      0.31        24
          14       0.43      0.36      0.39        28
          15       0.56      0.67  

### 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the Random Forest Classifier. The seed is fixed (same value as the
# decision tree above) so the reported scores can be reproduced.
rf_clf = RandomForestClassifier(random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)

In [None]:
# Random forest predictions for the held-out test split
y_pred = rf_clf.predict(X_test)

# Overall accuracy, followed by the per-class precision/recall/F1 report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

forest_report = classification_report(y_test, y_pred)
print(forest_report)

Accuracy: 0.62
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        27
           0       0.70      0.55      0.62        38
           1       0.43      0.35      0.38        26
           2       0.74      0.83      0.78        24
           3       0.48      0.61      0.54        23
           4       0.39      0.52      0.44        23
           5       0.75      0.58      0.65        26
           6       0.64      0.88      0.74        24
           7       0.76      0.63      0.69        30
           8       0.79      0.71      0.75        21
           9       0.44      0.55      0.49        20
          10       0.89      0.28      0.42        29
          11       0.62      0.48      0.54        27
          12       0.66      0.95      0.78        22
          13       0.50      0.29      0.37        24
          14       0.58      0.54      0.56        28
          15       0.67      0.89      0.76        27
          16

### 5. KNN & PCA

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [None]:
# Initialize a dictionary to store results
results = {}

# Iterate over different values of n_neighbors
for n in range(5, 51):
    # Initialize k-NN classifier with the current n_neighbors value
    knn_classifier = KNeighborsClassifier(n_neighbors=n)

    # Train the classifier and measure the time
    start_time = time.time()
    knn_classifier.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Predictions & Accuracy
    y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Store results in the dictionary
    results[n] = {'accuracy': accuracy, 'training_time': training_time}

# Print the results
for n, metrics in results.items():
    print(f"n_neighbors = {n}: Accuracy = {metrics['accuracy']:.4f}, Training time = {metrics['training_time']:.2f} seconds")


n_neighbors = 5: Accuracy = 0.3946, Training time = 0.02 seconds
n_neighbors = 6: Accuracy = 0.3954, Training time = 0.02 seconds
n_neighbors = 7: Accuracy = 0.4033, Training time = 0.02 seconds
n_neighbors = 8: Accuracy = 0.4071, Training time = 0.04 seconds
n_neighbors = 9: Accuracy = 0.4008, Training time = 0.02 seconds
n_neighbors = 10: Accuracy = 0.3962, Training time = 0.02 seconds
n_neighbors = 11: Accuracy = 0.3929, Training time = 0.02 seconds
n_neighbors = 12: Accuracy = 0.3867, Training time = 0.03 seconds
n_neighbors = 13: Accuracy = 0.3837, Training time = 0.02 seconds
n_neighbors = 14: Accuracy = 0.3775, Training time = 0.02 seconds
n_neighbors = 15: Accuracy = 0.3787, Training time = 0.02 seconds
n_neighbors = 16: Accuracy = 0.3779, Training time = 0.02 seconds
n_neighbors = 17: Accuracy = 0.3700, Training time = 0.01 seconds
n_neighbors = 18: Accuracy = 0.3692, Training time = 0.01 seconds
n_neighbors = 19: Accuracy = 0.3646, Training time = 0.01 seconds
n_neighbors = 2

In [None]:
# Pick the n_neighbors entry with the highest recorded accuracy
# (on ties the first such entry in `results` wins)
best_n, best_metrics = max(results.items(), key=lambda entry: entry[1]['accuracy'])
print(f"\nBest n_neighbors: {best_n} with accuracy = {best_metrics['accuracy']:.4f}")


Best n_neighbors: 8 with accuracy = 0.4071


In [None]:
#2. Use PCA + k-NN to reduce the dimension and GridSearchCV to select the optimal number of principal components and k in k-NN.
from sklearn.preprocessing import StandardScaler

# PCA and k-NN are both scale sensitive, so the features are standardized as
# the first pipeline step (fitted on the training part of every CV fold).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', KNeighborsClassifier())
])

# PCA cannot extract more components than there are features; candidates
# above that limit are dropped because every fit using them would fail.
n_features = X_train.shape[1]
parameters = {
    'pca__n_components': [k for k in (2, 4, 6, 8, 10, 15) if k <= n_features],
    'clf__n_neighbors': [5, 8, 10]
}

# GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(pipe, parameters, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


30 fits failed out of a total of 210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/usr/loca

In [None]:
# 3. Report the winning parameter combination and its mean cross-validated accuracy
best_parameters = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_
print("Best Parameters:", best_parameters)
print("Best Cross-Validated Accuracy:", best_cv_accuracy)

Best Parameters: {'clf__n_neighbors': 5, 'pca__n_components': 4}
Best Cross-Validated Accuracy: 0.4105555555555556


In [None]:
# 4. Re-evaluate the testing set with the best estimator found by the grid
# search and measure how long the prediction takes.
best_estimator = grid_search.best_estimator_

# Wall-clock time spent predicting the testing set
start_time = time.time()
y_test_pred = best_estimator.predict(X_test)
end_time = time.time()
testing_time = end_time - start_time

# Accuracy on the testing set
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Accuracy on the Testing Set:", accuracy_test)
print("Testing Time:", testing_time, "seconds")

Accuracy on the Testing Set: 0.39458333333333334
Testing Time: 0.14912080764770508 seconds
