In [None]:
import pandas as pd

# Load the CSV file, skipping rows that cannot be parsed
df = pd.read_csv('php.csv', on_bad_lines='skip')

# Drop rows whose query is missing or blank.
# read_csv parses empty fields as NaN, and `NaN != ""` evaluates to True, so a
# plain string comparison alone would let those rows through; the explicit
# notna() check removes them together with whitespace-only queries.
df = df[df["Query"].notna() & (df["Query"].str.strip() != "")]

# Save the cleaned dataset to a new file (without the index column)
df.to_csv('php_clean.csv', index=False)

print("Boş stringler temizlendi ve dosya 'php_clean.csv' olarak kaydedildi.")


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Load the dataset
df = pd.read_csv('php_clean.csv')
print(df.head())
print(f"Veri boyutu: {df.shape}")


# Report missing values per column, then drop every row that contains a NaN
print("Eksik değerler:", df.isnull().sum())
df = df.dropna()

# Drop rows whose query is empty or whitespace-only. The explicit .copy()
# detaches the filtered frame from its parent so the column assignment below
# cannot trigger SettingWithCopyWarning.
df = df[df["Query"].str.strip() != ""].copy()

# Print the label distribution so it can be inspected by eye
# (this only reports the counts; it does not enforce that labels are 0/1)
print("Label dağılımı:", df["Label"].value_counts())


# Preprocess the data: remove leading/trailing whitespace from every query
df['Query'] = df['Query'].str.strip()

# Split the data into features and labels
X = df['Query']
y = df['Label']

# Split the RAW text into training and testing sets first, so that nothing
# learned from the test rows can influence the fitted vectorizer.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text data into numerical format using TF-IDF.
# The vocabulary and IDF weights are fitted on the training text only; the test
# text is merely transformed, which keeps the evaluation free of data leakage.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Retained so that any code still reading `X_vectorized` keeps working; it is
# not used for training or evaluation in this cell.
X_vectorized = vectorizer.transform(X)

# Train a classification model (seeded so the reported metrics are reproducible)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

import joblib, os

model_dir = "models"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted vectorizer and classifier side by side; both are needed
# to reproduce predictions later.
joblib.dump(vectorizer, os.path.join(model_dir, 'php_vectorizer.pkl'))
joblib.dump(model, os.path.join(model_dir, 'php_model.pkl'))

print("Model ve TF-IDF vektörizer kaydedildi.")


# Function to predict new code snippet
def predict_code_snippet(code_snippet):
    """Classify a single code snippet as "Malicious" or "Safe".

    The snippet is wrapped in a one-element list, transformed with the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        code_snippet: The snippet to classify.

    Returns:
        "Malicious" when the first predicted label equals 1, otherwise "Safe".
    """
    features = vectorizer.transform([code_snippet])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Malicious"
    return "Safe"

# Test with a new input
# new_code = input("Enter a code snippet: ")
# print("The code is:", predict_code_snippet(new_code))


                                             Query  Label
0                 <?php echo $_GET['username']; ?>      1
1                     <?php echo 'Hello World'; ?>      0
2         <?php echo shell_exec($_POST['cmd']); ?>      1
3  <?php echo htmlspecialchars($_GET['input']); ?>      0
4                 <?php include($_GET['page']); ?>      1
Veri boyutu: (4472, 2)
Eksik değerler: Query    0
Label    0
dtype: int64
Label dağılımı: Label
1    2371
0    2101
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       411
           1       0.99      0.99      0.99       484

    accuracy                           0.99       895
   macro avg       0.99      0.99      0.99       895
weighted avg       0.99      0.99      0.99       895

Model ve TF-IDF vektörizer kaydedildi.
