In [1]:
import pandas as pd

# Read the labelled SQL-query dataset from the relative path "sql.csv".
df = pd.read_csv("sql.csv")

# Show the first rows and then the (rows, columns) shape; sep="\n" puts each
# item on its own line, exactly as two separate print calls would.
print(df.head(), f"Veri boyutu: {df.shape}", sep="\n")

                                               Query  Label
0                  " or pg_sleep  (  __TIME__  )  --      1
1  create user name identified by pass123 tempora...      1
2   AND 1  =  utl_inaddr.get_host_address   (    ...      1
3   select * from users where id  =  '1' or @ @1 ...      1
4   select * from users where id  =  1 or 1#"  ( ...      1
Veri boyutu: (22913, 2)


In [69]:
# Data cleaning happens here: report missing values, then drop unusable rows.
print("Eksik değerler:", df.isna().sum())

# Drop rows with any missing field, then rows whose query text is empty once
# surrounding whitespace is stripped.
df = df.dropna().loc[lambda frame: frame["Query"].str.strip().ne("")]

# Class balance of the remaining rows.
print("Label dağılımı:", df["Label"].value_counts())


Eksik değerler: Query    0
Label    0
dtype: int64
Label dağılımı: Label
0    11584
1    11329
Name: count, dtype: int64


In [70]:
from sklearn.model_selection import train_test_split

# Features are the raw query strings; targets are the labels.
X, y = df["Query"], df["Label"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

print(f"Eğitim seti boyutu: {len(X_train)}")
print(f"Test seti boyutu: {len(X_test)}")


Eğitim seti boyutu: 18330
Test seti boyutu: 4583


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lower-cased unigrams and bigrams, limited to 5000 features.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    max_features=5000,
)
# NOTE(review): the original note here read "max_features 5000 1000,
# ngram_range (1,1) (2,2)" - presumably alternative settings that were tried; confirm.

# Fit the vocabulary/IDF weights on the training queries only, then reuse the
# fitted vectorizer to transform the test queries.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(
    f"TF-IDF Eğitim seti boyutu: {X_train_tfidf.shape}",
    f"TF-IDF Test seti boyutu: {X_test_tfidf.shape}",
    sep="\n",
)


TF-IDF Eğitim seti boyutu: (18330, 5000)
TF-IDF Test seti boyutu: (4583, 5000)


In [72]:
from sklearn.ensemble import RandomForestClassifier


# Baseline random forest with depth-limited trees and balanced class weights.
# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    max_depth=5,
    n_estimators=100,
).fit(X_train_tfidf, y_train)

print("Model eğitimi tamamlandı.")

Model eğitimi tamamlandı.


In [73]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Prediction on the held-out TF-IDF features.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, shown with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Doğruluk: {accuracy:.2f}")

# Per-class precision/recall/F1 report, with the two classes displayed as
# "Benign" and "Malicious".
print(
    "\nPerformans Raporu:",
    classification_report(y_test, y_pred, target_names=["Benign", "Malicious"]),
    sep="\n",
)


Doğruluk: 0.96

Performans Raporu:
              precision    recall  f1-score   support

      Benign       0.93      0.99      0.96      2335
   Malicious       0.99      0.92      0.95      2248

    accuracy                           0.96      4583
   macro avg       0.96      0.95      0.95      4583
weighted avg       0.96      0.96      0.95      4583



In [74]:
import joblib, os

model_dir = "models"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps re-running this cell harmless.
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted TF-IDF vectorizer and the trained classifier side by side
# so both can be reloaded together for inference.
joblib.dump(vectorizer, os.path.join(model_dir, "sql_vectorizer.pkl"))
joblib.dump(model, os.path.join(model_dir, "sql_model.pkl"))

print("Model ve TF-IDF vektörizer kaydedildi.")


Model ve TF-IDF vektörizer kaydedildi.


In [54]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space: 3 * 3 * 3 * 3 * 2 = 162 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', 'balanced_subsample'],
)

# Exhaustive search with 5-fold cross-validation, scored by F1 and run in
# parallel (n_jobs=-1).
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the grid search on the TF-IDF training features.
grid_search.fit(X_train_tfidf, y_train)

print(
    f"En iyi parametreler: {grid_search.best_params_}",
    f"En iyi F1 skoru: {grid_search.best_score_}",
    sep="\n",
)


Fitting 5 folds for each of 162 candidates, totalling 810 fits
En iyi parametreler: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
En iyi F1 skoru: 0.9949699458212258


In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Refit the random forest with fixed hyper-parameters (200 trees, depth 20,
# min_samples_split=10, min_samples_leaf=1, balanced class weights).
model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=1,
).fit(X_train_tfidf, y_train)

print("Model eğitimi tamamlandı.")

# Evaluate on the held-out TF-IDF features: accuracy first, then the
# per-class report.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Doğruluk: {accuracy:.2f}")
print(
    "\nPerformans Raporu:",
    classification_report(y_test, y_pred, target_names=["Benign", "Malicious"]),
    sep="\n",
)


Model eğitimi tamamlandı.
Doğruluk: 0.99

Performans Raporu:
              precision    recall  f1-score   support

      Benign       0.99      1.00      0.99      2335
   Malicious       1.00      0.99      0.99      2248

    accuracy                           0.99      4583
   macro avg       0.99      0.99      0.99      4583
weighted avg       0.99      0.99      0.99      4583



In [None]:
import joblib, os

model_dir = "models"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps re-running this cell harmless.
os.makedirs(model_dir, exist_ok=True)

# Overwrite the saved artifacts with the current vectorizer and model.
joblib.dump(vectorizer, os.path.join(model_dir, "sql_vectorizer.pkl"))
joblib.dump(model, os.path.join(model_dir, "sql_model.pkl"))

print("Model ve TF-IDF vektörizer kaydedildi.")


In [75]:
# Hand-written sample queries for a quick sanity check of the trained model.
# Queries that contain a literal double quote are written as single-quoted
# Python strings: the CSV-style doubled quote ("") inside a double-quoted
# literal is parsed as adjacent-string concatenation and silently drops the
# quote characters from the query.
new_queries = [
    # Vulnerable (open to SQL injection) commands:
    'select * from users where id  =  1 or "]{" or 1  =  1 -- 1',
    "select * from users where id  =  '1' or @ @1  =  1 union select 1,version  (    )   -- 1'",
    '1" where 9241  =  9241 union all select null#',
    "1  )   where 3738  =  3738 or 8421  =    (  select count  (  *  )   from generate_series  (  1,5000000   )    )   --",
    # Safe (protected against SQL injection) commands:
    "cursor.execute('SELECT name, age FROM employees WHERE department_id = ? AND status = ?', (department_id, 'active'))",
    "SELECT orders.id, customers.name FROM orders INNER JOIN customers ON orders.customer_id = customers.id WHERE customers.country = 'USA';",
    "GRANT SELECT ON table_name TO 'username';",
    "SELECT * FROM die FETCH FIRST 50 PERCENT ROWS ONLY",
]

# NOTE(review): this cell needs `vectorizer` and `model` from the earlier
# TF-IDF and training cells; its recorded output is a NameError for
# `vectorizer`, so run those cells first.
new_queries_tfidf = vectorizer.transform(new_queries)
predictions = model.predict(new_queries_tfidf)

# Label 1 is reported as malicious ("Zararlı"), anything else as safe ("Güvenli").
for query, prediction in zip(new_queries, predictions):
    result = "Zararlı" if prediction == 1 else "Güvenli"
    print(f"Sorgu: {query}\nTahmin edilen sınıf: {result}\n")


NameError: name 'vectorizer' is not defined

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Define the pipeline: TF-IDF features followed by a classifier. The
# classifier step is only a placeholder; the grid below swaps it out.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier()),
])

# Define the parameter grid: one sub-grid per candidate classifier, each with
# that classifier's own hyper-parameters (addressed as classifier__<name>).
param_grid = [
    # Random forest
    dict(
        classifier=[RandomForestClassifier(random_state=42)],
        classifier__n_estimators=[100, 200],
        classifier__max_depth=[5, 10, 20],
        classifier__min_samples_split=[2, 5, 10],
        classifier__min_samples_leaf=[1, 2, 4],
        classifier__class_weight=['balanced', 'balanced_subsample'],
    ),
    # Logistic regression
    dict(
        classifier=[LogisticRegression(max_iter=1000, random_state=42)],
        classifier__C=[0.1, 1, 10],
        classifier__penalty=['l2'],
        classifier__class_weight=['balanced', None],
    ),
    # Gradient boosting
    dict(
        classifier=[GradientBoostingClassifier(random_state=42)],
        classifier__n_estimators=[100, 200],
        classifier__learning_rate=[0.01, 0.1],
        classifier__max_depth=[3, 5, 7],
    ),
    # Linear-kernel SVM
    dict(
        classifier=[SVC(kernel='linear', probability=True, random_state=42)],
        classifier__C=[0.1, 1, 10],
        classifier__class_weight=['balanced', None],
    ),
]

# Define the grid search: 5-fold cross-validation scored by F1, run in
# parallel (n_jobs=-1).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit on the raw training queries; the pipeline vectorizes them itself.
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validated score.
print(
    f"En iyi parametreler: {grid_search.best_params_}",
    f"En iyi F1 skoru: {grid_search.best_score_}",
    sep="\n",
)

# Take the best pipeline found by the search.
best_model = grid_search.best_estimator_

# Predict on the raw test queries and print the performance report.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 132 candidates, totalling 660 fits
En iyi parametreler: {'classifier': LogisticRegression(max_iter=1000, random_state=42), 'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l2'}
En iyi F1 skoru: 0.9916896859625293
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2335
           1       1.00      0.99      0.99      2248

    accuracy                           0.99      4583
   macro avg       0.99      0.99      0.99      4583
weighted avg       0.99      0.99      0.99      4583



In [None]:
import joblib, os

model_dir = "models"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps re-running this cell harmless.
os.makedirs(model_dir, exist_ok=True)

# best_model is the fitted Pipeline selected by the grid search; it carries its
# own "tfidf" step, fitted with different settings than the standalone
# `vectorizer` built earlier in the notebook. Saving that standalone vectorizer
# next to the pipeline would pair a classifier with a vocabulary it was never
# trained on, so save the pipeline's own two fitted steps instead. This keeps
# the same two-file layout (vectorizer + classifier) as the other save cells.
joblib.dump(best_model.named_steps["tfidf"], os.path.join(model_dir, "sql_vectorizer.pkl"))
joblib.dump(best_model.named_steps["classifier"], os.path.join(model_dir, "sql_model.pkl"))

print("Model ve TF-IDF vektörizer kaydedildi.")
