In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


In [2]:
df = pd.read_csv("../dataset/dataset.csv")

X = df["code"]
y = df["label"]


In [3]:
label_encoder = LabelEncoder()
y_encode = label_encoder.fit_transform(y)

num_classes= len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)

Classes: ['CommandInjection' 'Hardcoded' 'SQLi' 'Safe']


In [4]:
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1,2),
    token_pattern=r'\b\w+\b'
)

X_tfidf = vectorizer.fit_transform(X).toarray()

In [5]:
X_train , X_test , y_train , y_test = train_test_split(
    X_tfidf , y_encode,
    test_size=0.2,
    random_state=42,
    stratify=y_encode
)

In [6]:
from keras.src.legacy.backend import dtype
class CodeDataset(Dataset):
    def __init__(self , X ,y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y , dtype = torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self , idx):
        return self.X[idx] , self.y[idx]


In [7]:
train_dataset = CodeDataset(X_train, y_train)
test_dataset = CodeDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [8]:
class CodeClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CodeClassifier , self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self ,x):
        return self.network(x)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CodeClassifier(X_train.shape[1] , num_classes ).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
epochs = 30
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        inputs , labels = inputs.to(device) , labels.to(device)
        outputs = model(inputs)
        loss =criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/30, Loss: 1.4045
Epoch 2/30, Loss: 1.1789
Epoch 3/30, Loss: 0.9086
Epoch 4/30, Loss: 0.6847
Epoch 5/30, Loss: 0.5092
Epoch 6/30, Loss: 0.3302
Epoch 7/30, Loss: 0.2072
Epoch 8/30, Loss: 0.1328
Epoch 9/30, Loss: 0.0851
Epoch 10/30, Loss: 0.0545
Epoch 11/30, Loss: 0.0367
Epoch 12/30, Loss: 0.0366
Epoch 13/30, Loss: 0.0247
Epoch 14/30, Loss: 0.0238
Epoch 15/30, Loss: 0.0200
Epoch 16/30, Loss: 0.0103
Epoch 17/30, Loss: 0.0107
Epoch 18/30, Loss: 0.0043
Epoch 19/30, Loss: 0.0110
Epoch 20/30, Loss: 0.0031
Epoch 21/30, Loss: 0.0055
Epoch 22/30, Loss: 0.0056
Epoch 23/30, Loss: 0.0062
Epoch 24/30, Loss: 0.0029
Epoch 25/30, Loss: 0.0049
Epoch 26/30, Loss: 0.0010
Epoch 27/30, Loss: 0.0024
Epoch 28/30, Loss: 0.0031
Epoch 29/30, Loss: 0.0074
Epoch 30/30, Loss: 0.0050


In [11]:
model.eval()
all_preds = []
all_labels = []


with torch.no_grad():
    for inputs , labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())


from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:\n")
print(classification_report(all_labels, all_preds))

Test Accuracy: 0.5625

Classification Report:

              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       1.00      1.00      1.00         4
           2       0.00      0.00      0.00         4
           3       0.20      0.25      0.22         4

    accuracy                           0.56        16
   macro avg       0.44      0.56      0.49        16
weighted avg       0.44      0.56      0.49        16



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


In [44]:
from jinja2.sandbox import MAX_RANGE
vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1,2),
    token_pattern=r'\b\w+\b'
)
X_tfidf = vectorizer.fit_transform(df["code"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [45]:
def evaluate_model(model,name):
    model.fit(X_train,y_train)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

In [46]:
evaluate_model(LogisticRegression(max_iter=1000), "Logistic Regression")
evaluate_model(LinearSVC(), "Linear SVM")
evaluate_model(RandomForestClassifier(n_estimators=200), "Random Forest")
evaluate_model(MultinomialNB(), "Naive Bayes")



Logistic Regression
Accuracy: 0.625
                  precision    recall  f1-score   support

CommandInjection       0.57      1.00      0.73         4
       Hardcoded       1.00      1.00      1.00         4
            SQLi       1.00      0.25      0.40         4
            Safe       0.25      0.25      0.25         4

        accuracy                           0.62        16
       macro avg       0.71      0.62      0.59        16
    weighted avg       0.71      0.62      0.59        16


Linear SVM
Accuracy: 0.5625
                  precision    recall  f1-score   support

CommandInjection       0.57      1.00      0.73         4
       Hardcoded       1.00      1.00      1.00         4
            SQLi       0.00      0.00      0.00         4
            Safe       0.20      0.25      0.22         4

        accuracy                           0.56        16
       macro avg       0.44      0.56      0.49        16
    weighted avg       0.44      0.56      0.49        16



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Random Forest
Accuracy: 0.8125
                  precision    recall  f1-score   support

CommandInjection       0.80      1.00      0.89         4
       Hardcoded       0.80      1.00      0.89         4
            SQLi       1.00      0.75      0.86         4
            Safe       0.67      0.50      0.57         4

        accuracy                           0.81        16
       macro avg       0.82      0.81      0.80        16
    weighted avg       0.82      0.81      0.80        16


Naive Bayes
Accuracy: 0.75
                  precision    recall  f1-score   support

CommandInjection       0.57      1.00      0.73         4
       Hardcoded       1.00      1.00      1.00         4
            SQLi       1.00      0.75      0.86         4
            Safe       0.50      0.25      0.33         4

        accuracy                           0.75        16
       macro avg       0.77      0.75      0.73        16
    weighted avg       0.77      0.75      0.73        16



In [47]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

In [48]:
evaluate_model(rf, "Tuned Random Forest")


Tuned Random Forest
Accuracy: 0.8125
                  precision    recall  f1-score   support

CommandInjection       0.80      1.00      0.89         4
       Hardcoded       0.67      1.00      0.80         4
            SQLi       1.00      0.75      0.86         4
            Safe       1.00      0.50      0.67         4

        accuracy                           0.81        16
       macro avg       0.87      0.81      0.80        16
    weighted avg       0.87      0.81      0.80        16



In [49]:
rf = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced",
    random_state=42
)

evaluate_model(rf, "Balanced Random Forest")



Balanced Random Forest
Accuracy: 0.8125
                  precision    recall  f1-score   support

CommandInjection       0.80      1.00      0.89         4
       Hardcoded       0.67      1.00      0.80         4
            SQLi       1.00      0.75      0.86         4
            Safe       1.00      0.50      0.67         4

        accuracy                           0.81        16
       macro avg       0.87      0.81      0.80        16
    weighted avg       0.87      0.81      0.80        16



In [50]:
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(n_estimators=500, random_state=42)

scores = cross_val_score(rf, X_tfidf, y, cv=5)

print("Cross-val scores:", scores)
print("Mean accuracy:", scores.mean())


Cross-val scores: [0.625  0.8125 0.75   0.8125 0.8125]
Mean accuracy: 0.7625
