In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the data
data = pd.read_csv('C:/Lohith/projects/flare/dataset/extracted_data_clean.csv')

# Shuffle the data
data = shuffle(data, random_state=42)

# Remove rows with missing values
data = data.dropna()

# Print initial info about the dataset
print("Initial dataset info:")
print(data.info())
print("\nMissing values:")
print(data.isnull().sum())
print("\nUnique values in each column:")
print(data.nunique())

# Separate features and target
X = data.drop(['action', 'devname', 'devid', 'trandisp', 'appcat', 'ip_address', 'datetime'], axis=1)
y = data['action']

# Handle categorical features
cat_features = X.select_dtypes(include=['object']).columns
for col in cat_features:
    X[col] = X[col].astype('category')
    X[col] = X[col].cat.add_categories(['Unknown']).fillna('Unknown')

# Convert categorical variables to numeric using LabelEncoder
le = LabelEncoder()
for col in cat_features:
    X[col] = le.fit_transform(X[col])

# Handle numeric features
num_features = X.select_dtypes(include=['float64', 'int64']).columns
for col in num_features:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 284798 entries, 1499 to 121958
Data columns (total 28 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   timestamp    284798 non-null  object 
 1   devname      284798 non-null  object 
 2   devid        284798 non-null  object 
 3   type         284798 non-null  object 
 4   subtype      284798 non-null  object 
 5   level        284798 non-null  object 
 6   srcip        284798 non-null  object 
 7   srcport      284798 non-null  float64
 8   srcintf      284798 non-null  object 
 9   srcintfrole  284798 non-null  object 
 10  dstip        284798 non-null  object 
 11  dstport      284798 non-null  float64
 12  dstintf      284798 non-null  object 
 13  dstintfrole  284798 non-null  object 
 14  proto        284798 non-null  float64
 15  policyid     284798 non-null  float64
 16  service      284798 non-null  object 
 17  dstcountry   284798 non-null  object 
 18  srcc

In [3]:
# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.586060393258427
Classification Report:
              precision    recall  f1-score   support

      accept       0.52      0.14      0.22      1622
  client-rst       0.57      0.83      0.68     27852
       close       0.61      0.44      0.51     19266
        deny       0.91      0.75      0.82      1993
  server-rst       0.42      0.00      0.01      4972
     timeout       0.01      0.00      0.01      1255

    accuracy                           0.59     56960
   macro avg       0.51      0.36      0.37     56960
weighted avg       0.57      0.59      0.54     56960

Confusion Matrix:
[[  229  1115   226    11    16    25]
 [  112 23174  4431    62     2    71]
 [   50 10518  8471    82     0   145]
 [    0   412     0  1490     0    91]
 [   52  4124   770     0    13    13]
 [    1  1237    12     0     0     5]]
