In [1]:
import pandas as pd

In [3]:
# STEP 1: Read the log export (CSV, path relative to the notebook) into a DataFrame
DATASET_CSV = "generated_5000_logs.csv"
wazuh = pd.read_csv(DATASET_CSV)

In [4]:
# Show the column labels of the loaded frame
wazuh.columns

Index(['_source.data.url', '_source.rule.firedtimes', '_source.rule.level',
       'label'],
      dtype='object')

In [5]:
# Print per-column dtypes, non-null counts and the frame's memory usage
wazuh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   _source.data.url         5000 non-null   object
 1   _source.rule.firedtimes  5000 non-null   int64 
 2   _source.rule.level       5000 non-null   int64 
 3   label                    5000 non-null   object
dtypes: int64(2), object(2)
memory usage: 156.4+ KB


In [6]:
# Keep only the rows whose label is exactly the string 'real threat'
is_real_threat = wazuh['label'].eq('real threat')
real_threat_df = wazuh.loc[is_real_threat]

In [7]:
# Display the filtered subset
real_threat_df

Unnamed: 0,_source.data.url,_source.rule.firedtimes,_source.rule.level,label
1,"/search?q=' UNION SELECT username, password FR...",12,15,real threat
3,/search?q='; SELECT * FROM admin--,2,14,real threat
4,/search?q=%27+AND+1%3D2--,1,16,real threat
5,/search?q=' OR 1=1;#,13,15,real threat
6,/search?q=%27+AND+1%3D2--,1,12,real threat
...,...,...,...,...
4987,/search?q=' OR 1=1--,15,10,real threat
4992,/search?q=' OR pg_sleep(5)--,10,15,real threat
4994,/search?q=' OR 1=1/*,15,13,real threat
4995,"/search?q=' UNION ALL SELECT 1,2,3,4--",2,15,real threat


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Rename the dotted source column names to short names for simplicity.
# 'label' already has its final name, so it needs no entry in the mapping.
# Columns missing from the frame are ignored by rename, so re-running this
# cell after the rename has happened is a no-op.
wazuh = wazuh.rename(columns={
    '_source.data.url': 'url',
    '_source.rule.firedtimes': 'firedtimes',
    '_source.rule.level': 'level',
})

In [10]:
# Display the frame after the column rename
wazuh

Unnamed: 0,url,firedtimes,level,label
0,/search?q='%',5,6,false positive
1,"/search?q=' UNION SELECT username, password FR...",12,15,real threat
2,/search?q=admin,5,5,false positive
3,/search?q='; SELECT * FROM admin--,2,14,real threat
4,/search?q=%27+AND+1%3D2--,1,16,real threat
...,...,...,...,...
4995,"/search?q=' UNION ALL SELECT 1,2,3,4--",2,15,real threat
4996,/search?q=1',3,7,false positive
4997,/search?q=<script>alert(1)</script>,3,8,false positive
4998,/search?q='abc',4,7,false positive


In [11]:
# Encode the target label as integers: 1 = real threat, 0 = false positive.
LABEL_ENCODING = {'real threat': 1, 'false positive': 0}

# Guard so this cell can be re-run safely: Series.map turns every value that
# is not a key of the mapping into NaN, so mapping an already-encoded column a
# second time would silently replace all labels with NaN.
if not wazuh['label'].isin(list(LABEL_ENCODING.values())).all():
    encoded_label = wazuh['label'].map(LABEL_ENCODING)

    # Fail loudly on values outside the mapping instead of letting NaN labels
    # flow into training.
    unknown_labels = wazuh.loc[encoded_label.isna(), 'label'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unexpected label values: {unknown_labels.tolist()}")

    wazuh['label'] = encoded_label.astype('int64')

In [12]:
# Display the frame after the label encoding
wazuh

Unnamed: 0,url,firedtimes,level,label
0,/search?q='%',5,6,0
1,"/search?q=' UNION SELECT username, password FR...",12,15,1
2,/search?q=admin,5,5,0
3,/search?q='; SELECT * FROM admin--,2,14,1
4,/search?q=%27+AND+1%3D2--,1,16,1
...,...,...,...,...
4995,"/search?q=' UNION ALL SELECT 1,2,3,4--",2,15,1
4996,/search?q=1',3,7,0
4997,/search?q=<script>alert(1)</script>,3,8,0
4998,/search?q='abc',4,7,0


In [13]:
# Separate the target from the candidate features.
y = wazuh['label']  # encoded target column

# NOTE(review): X_text and X_meta are not referenced by the later cells
# visible in this notebook; the model is trained on X built in the
# train-test split cell.
NUMERIC_COLUMNS = ['firedtimes', 'level']
X_meta = wazuh[NUMERIC_COLUMNS]  # numeric features
X_text = wazuh['url']  # text feature

In [14]:
# Column transformer that handles the text and numeric columns separately.
# ColumnTransformer, TfidfVectorizer and StandardScaler come from the
# notebook's import cell.
#
# 'url' is given as a scalar column name (not a list) so that the vectorizer
# receives a 1-D sequence of strings; the numeric columns are given as a list
# so that the scaler receives a 2-D selection.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'url'),
        ('num', StandardScaler(), ['firedtimes', 'level'])
    ]
)

In [15]:
# Chain preprocessing and a decision tree; the fixed seed keeps the tree reproducible
classifier = DecisionTreeClassifier(random_state=42)
pipeline = make_pipeline(preprocessor, classifier)

In [16]:
# Train-test split: hold out 20% of the rows, with a fixed seed.
# NOTE(review): the frame previews above show the same URL on more than one
# row (e.g. index 4 and 6), so identical payloads may end up in both the
# train and the test part -- confirm the scores are not inflated by duplicates.
FEATURE_COLUMNS = ['url', 'firedtimes', 'level']
X = wazuh[FEATURE_COLUMNS]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Train model: fit the preprocessing steps and the classifier on the training split only
pipeline.fit(X_train, y_train)

In [18]:
# Evaluate on the held-out split: per-class precision, recall and F1
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       400
           1       1.00      0.99      0.99       600

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [33]:
# Example input: one suspicious payload, using the same three columns
# the pipeline was trained on
sample = pd.DataFrame({
    'url': ["/search?q=' UNION SELECT null--"],
    'firedtimes': [7],
    'level': [6],
})

In [34]:
# Predict using the trained pipeline; the result holds one entry per row of `sample`
prediction = pipeline.predict(sample)

In [35]:
# Interpret the result: 1 is the encoded value for 'real threat';
# anything else is reported as a false positive
if prediction[0] == 1:
    label = "real threat"
else:
    label = "false positive"
print("Prediction:", label)

Prediction: real threat
