In [1]:
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# ---------------------------
df = pd.read_csv('php_clean.csv')
print(df.head())
print(f"Veri boyutu: {df.shape}")
print("Eksik değerler:", df.isnull().sum())
df = df.dropna()  
df = df[df["Query"].str.strip() != ""]

print("Label dağılımı:", df["Label"].value_counts())

X = df['Query']
y = df['Label']

# ---------------------------
vectorizer = TfidfVectorizer(max_features=5000) # TF-IDF 
X_vectorized = vectorizer.fit_transform(X) 

# ---------------------------
# Eğitim ve test setlerine ayır
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Random Forest
model = RandomForestClassifier(random_state=42)

#eğit
model.fit(X_train, y_train)


# ---------------------------
# Test seti üzerinde tahmin yap
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


# ---------------------------
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# Modeli kaydet
model_path = os.path.join(model_dir, 'php_model.pkl')
joblib.dump(model, model_path)
print(f"Model kaydedildi: {model_path}")

# Vektörizer'ı kaydet
vectorizer_path = os.path.join(model_dir, 'php_vectorizer.pkl')
joblib.dump(vectorizer, vectorizer_path)
print(f"TF-IDF Vektörizer kaydedildi: {vectorizer_path}")

# ---------------------------
#  Kod Snippet'ini Tahmin Etme Fonksiyonu
# ---------------------------

def predict_code_snippet(code_snippet, model_path=model_path, vectorizer_path=vectorizer_path):
    # Model ve vektörizer'ı yükle
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    
    # Girişu vektörize et
    code_vectorized = vectorizer.transform([code_snippet])
    
    # Tahmin yap
    prediction = model.predict(code_vectorized)[0]
    
    return "Malicious" if prediction == 1 else "Safe"

if __name__ == "__main__":
    # Yeni bir kod snippet'i ile tahmin yap
    new_code = "<?php echo $_GET['username']; ?>"
    result = predict_code_snippet(new_code)
    print(f"Girdi: {new_code}\nTahmin: {result}")


                                             Query  Label
0                 <?php echo $_GET['username']; ?>      1
1                     <?php echo 'Hello World'; ?>      0
2         <?php echo shell_exec($_POST['cmd']); ?>      1
3  <?php echo htmlspecialchars($_GET['input']); ?>      0
4                 <?php include($_GET['page']); ?>      1
Veri boyutu: (4472, 2)
Eksik değerler: Query    0
Label    0
dtype: int64
Label dağılımı: Label
1    2371
0    2101
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       420
           1       0.99      1.00      0.99       475

    accuracy                           0.99       895
   macro avg       0.99      0.99      0.99       895
weighted avg       0.99      0.99      0.99       895

Model kaydedildi: models\php_model.pkl
TF-IDF Vektörizer kaydedildi: models\php_vectorizer.pkl
Girdi: <?php echo $_GET['username']; ?>
Tahmin: Malicious
