**mount drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_parquet('/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2.parquet')

# View the first few rows
print(df.head())


Mounted at /content/drive
   L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO  IN_BYTES  IN_PKTS  OUT_BYTES  \
0        49235         4444         6       0.0    155392      202      34552   
1        49228         1880         6       0.0      1600       40      35741   
2            0            0         1       0.0       212        2          0   
3        65317         1900        17       0.0       165        1          0   
4        60766        15600        17       0.0        63        1          0   

   OUT_PKTS  TCP_FLAGS  CLIENT_TCP_FLAGS  ...  TCP_WIN_MAX_IN  \
0       149         24                24  ...           45555   
1        65         24                16  ...           16425   
2         0          0                 0  ...               0   
3         0          0                 0  ...               0   
4         0          0                 0  ...               0   

   TCP_WIN_MAX_OUT  ICMP_TYPE  ICMP_IPV4_TYPE  DNS_QUERY_ID  DNS_QUERY_TYPE  \
0             480

In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load dataset
df = pd.read_parquet("/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2.parquet")

# Step 2: Separate features (X) and label (y)
X = df.drop(["Label", "Attack"], axis=1)   # Drop target columns
y = df["Attack"]                           # Using attack class

# Step 3: Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

# Step 5: Create KNN model
model = KNeighborsClassifier(n_neighbors=3)

# Step 6: Train model
model.fit(X_train, y_train)

# Step 7: Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Step 8: Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Step 9: Print accuracies
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Step 10: Check model fitting condition
diff = train_accuracy - test_accuracy

print("\nModel Evaluation:")

if train_accuracy >= 0.95 and diff > 0.10:
    print(" Model is OVERFITTING (High train, much lower test)")
elif train_accuracy < 0.70 and test_accuracy < 0.70:
    print(" Model is UNDERFITTING (Both accuracies low)")
elif abs(diff) <= 0.05:
    print("Model is GOOD FITTING (Train and test close)")
else:
    print(" Model needs tuning (Not clearly over/under fitting)")


**running subset**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load full dataset
df = pd.read_parquet('/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2.parquet')

# Step 2: Create subset (50,000 rows)
df_subset = df.sample(50000, random_state=42)

# Step 3: Save subset
df_subset.to_parquet('/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2_50K.parquet')
print("Subset created & saved as NF-ToN-IoT-V2_50K.parquet")

# Step 4: Features & Target
X = df_subset.drop(["Label", "Attack"], axis=1)
y = df_subset["Attack"]

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Step 5: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

# Step 6: Create KNN model
model = KNeighborsClassifier(n_neighbors=3)

# Step 7: Train model
model.fit(X_train, y_train)

# Step 8: Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Step 9: Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Step 10: Check model fit (overfitting / underfitting / good fitting)
diff = train_accuracy - test_accuracy
print("\nModel Evaluation:")

if train_accuracy >= 0.95 and diff > 0.10:
    print(" Model is OVERFITTING — high train accuracy, lower test accuracy.")
elif train_accuracy < 0.70 and test_accuracy < 0.70:
    print(" Model is UNDERFITTING — model not learning well.")
elif abs(diff) <= 0.05:
    print("Model is GOOD FITTING — train & test scores similar.")
else:
    print("Model needs more tuning.")



Subset created & saved as NF-ToN-IoT-V2_50K.parquet
Training Accuracy: 0.945325
Testing Accuracy : 0.9131

Model Evaluation:
Model is GOOD FITTING — train & test scores similar.


**applied cleaning**

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 1: Load dataset
df = pd.read_parquet('/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2.parquet')

print("Cleaning dataset...")

# Store target columns first
labels = df[['Label', 'Attack']]

# Remove duplicates
df = df.drop_duplicates()

# Replace inf values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Fill missing numeric values
df = df.fillna(df.mean(numeric_only=True))

# Keep only numeric columns
df = df.select_dtypes(include=['number'])

# Remove Label & Attack if present accidentally
df = df.drop(['Label', 'Attack'], axis=1, errors='ignore')

# Add targets back (reset index to align properly)
df = pd.concat([df.reset_index(drop=True), labels.reset_index(drop=True)], axis=1)

print("Cleaning Completed")

# Create subset (50k rows)
df_subset = df.sample(50000, random_state=42)
df_subset.to_parquet('/content/drive/MyDrive/AI_LAB/Assignment 1/NF-ToN-IoT-V2_50K.parquet')
print("Subset saved")

# Split features & target
X = df_subset.drop(["Label", "Attack"], axis=1)
y = df_subset["Attack"]

# Encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

#  Extra step: remove NaN if any remain (prevents ValueError)
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)

# Train KNN
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

# Predictions ( fixed)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Fit evaluation
diff = train_accuracy - test_accuracy
print("\nModel Evaluation:")

if train_accuracy >= 0.95 and diff > 0.10:
    print("OVERFITTING — high train accuracy, lower test accuracy")
elif train_accuracy < 0.70 and test_accuracy < 0.70:
    print("UNDERFITTING — low accuracy on both")
elif abs(diff) <= 0.05:
    print("GOOD FIT — train & test scores close")
else:
    print("Model needs tuning")


Cleaning dataset...
Cleaning Completed
Subset saved
Training Accuracy: 0.95875
Testing Accuracy : 0.9331

Model Evaluation:
GOOD FIT — train & test scores close


**changing hyperparameters**

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Updated KNN Model with tuned hyperparameters
model = KNeighborsClassifier(
    n_neighbors=7,        # Increased neighbors → smoother decision boundary
    weights='distance',  # Distance-based voting (better for uneven density)
    metric='minkowski',  # Default metric but best for mixed shapes
    p=2,                 # Euclidean distance (p=2)
    leaf_size=30,        # Optimized for memory/performance balance
    algorithm='auto'     # Auto chooses best algorithm (kd-tree, ball-tree)
)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
