In [2]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the data
data = pd.read_csv('C:/Lohith/projects/flare/dataset/extracted_data_clean.csv')

# Shuffle the data
data = shuffle(data, random_state=42)

# Remove rows with missing values
data = data.dropna()

# Print initial info about the dataset
print("Initial dataset info:")
print(data.info())
print("\nMissing values:")
print(data.isnull().sum())
print("\nUnique values in each column:")
print(data.nunique())

# Separate features and target
X = data.drop(['action', 'devname', 'devid', 'trandisp', 'appcat', 'ip_address', 'datetime'], axis=1)
y = data['action']

# Handle categorical features
cat_features = X.select_dtypes(include=['object']).columns
for col in cat_features:
    X[col] = X[col].astype('category')
    X[col] = X[col].cat.add_categories(['Unknown']).fillna('Unknown')

# Convert categorical variables to numeric using LabelEncoder
le = LabelEncoder()
for col in cat_features:
    X[col] = le.fit_transform(X[col])

# Handle numeric features
num_features = X.select_dtypes(include=['float64', 'int64']).columns
for col in num_features:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Standardize the numeric features for KNN
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 284798 entries, 1499 to 121958
Data columns (total 28 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   timestamp    284798 non-null  object 
 1   devname      284798 non-null  object 
 2   devid        284798 non-null  object 
 3   type         284798 non-null  object 
 4   subtype      284798 non-null  object 
 5   level        284798 non-null  object 
 6   srcip        284798 non-null  object 
 7   srcport      284798 non-null  float64
 8   srcintf      284798 non-null  object 
 9   srcintfrole  284798 non-null  object 
 10  dstip        284798 non-null  object 
 11  dstport      284798 non-null  float64
 12  dstintf      284798 non-null  object 
 13  dstintfrole  284798 non-null  object 
 14  proto        284798 non-null  float64
 15  policyid     284798 non-null  float64
 16  service      284798 non-null  object 
 17  dstcountry   284798 non-null  object 
 18  srcc

In [4]:

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(conf_matrix)


Accuracy: 0.8653792134831461
Classification Report:
              precision    recall  f1-score   support

      accept       0.63      0.47      0.54      1622
  client-rst       0.86      0.93      0.89     27852
       close       0.89      0.84      0.87     19266
        deny       0.98      0.96      0.97      1993
  server-rst       0.82      0.69      0.75      4972
     timeout       0.84      0.88      0.86      1255

    accuracy                           0.87     56960
   macro avg       0.84      0.80      0.81     56960
weighted avg       0.86      0.87      0.86     56960

Confusion Matrix:
[[  767   444   262     1   147     1]
 [  155 25871  1329     8   348   141]
 [  128  2604 16218    23   240    53]
 [    1    29    33  1915    15     0]
 [  159  1039   332     5  3414    23]
 [    1   105    31     0    11  1107]]
