In [1]:
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# ---------------------------
df = pd.read_csv('py_clean.csv')
print(df.head())
print(f"Veri boyutu: {df.shape}")
print("Eksik değerler:", df.isnull().sum())
df = df.dropna()  # NaN içeren satırları kaldır
df = df[df["Query"].str.strip() != ""] # Boş stringleri temizle

#dağılım
print("Label dağılımı:", df["Label"].value_counts())

# etiketleri ayır
X = df['Query']
y = df['Label']

# ---------------------------
vectorizer = TfidfVectorizer(max_features=5000) # TF-IDF 
X_vectorized = vectorizer.fit_transform(X) 

# ---------------------------
# Eğitim ve test setlerine ayır
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Random Forest
model = RandomForestClassifier(random_state=42)

#eğit
model.fit(X_train, y_train)


# ---------------------------
# Test seti üzerinde tahmin yap
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


# ---------------------------
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# Modeli kaydet
model_path = os.path.join(model_dir, 'py_model.pkl')
joblib.dump(model, model_path)
print(f"Model kaydedildi: {model_path}")

# Vektörizer'ı kaydet
vectorizer_path = os.path.join(model_dir, 'py_vectorizer.pkl')
joblib.dump(vectorizer, vectorizer_path)
print(f"TF-IDF Vektörizer kaydedildi: {vectorizer_path}")

# ---------------------------
# 7. Yeni Kod Snippet'ini Tahmin Etme Fonksiyonu
# ---------------------------

def predict_code_snippet(code_snippet, model_path=model_path, vectorizer_path=vectorizer_path):
    # Model ve vektörizer'ı yükle
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    
    # Girişu vektörize et
    code_vectorized = vectorizer.transform([code_snippet])
    
    # Tahmin yap
    prediction = model.predict(code_vectorized)[0]
    
    return "Malicious" if prediction == 1 else "Safe"

if __name__ == "__main__":
    # Yeni bir kod snippet'i ile tahmin yap
    new_code = "import xml.etree.ElementTree as ETnxml_data = input(""XML girin: "")ntree = ET.fromstring(xml_data)nprint(tree.tag)"
    result = predict_code_snippet(new_code)
    print(f"Girdi: {new_code}\nTahmin: {result}")


                                               Query  Label
0  "import osnos.system(f"rm -rf {input('Silinece...      1
1  "user_input = input("Mesaj: ")nprint(user_inpu...      1
2  "import sqlite3nconn = sqlite3.connect('verita...      1
3  "user_input = input("Kod girin: ")neval(user_i...      1
4  "import picklendata = input("Pickle verisi gir...      1
Veri boyutu: (1739, 2)
Eksik değerler: Query    0
Label    0
dtype: int64
Label dağılımı: Label
1    1028
0     711
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       142
           1       0.97      0.99      0.98       206

    accuracy                           0.97       348
   macro avg       0.97      0.97      0.97       348
weighted avg       0.97      0.97      0.97       348

Model kaydedildi: models\py_model.pkl
TF-IDF Vektörizer kaydedildi: models\py_vectorizer.pkl
Girdi: import xml.etree.ElementTree as ETnxml_data = input(XML girin: )ntree =