In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv('py_clean.csv')

# Preprocess the data
df['Query'] = df['Query'].str.strip()  # Remove leading/trailing whitespaces

# Split the data into features and labels
X = df['Query']
y = df['Label']

# Convert text data into numerical format using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train a classification model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

import joblib, os

model_dir = "models"

joblib.dump(vectorizer, os.path.join(model_dir, f'{"py"}_vectorizer.pkl'))
joblib.dump(model, os.path.join(model_dir, f'{"py"}_model.pkl'))

print("Model ve TF-IDF vektörizer kaydedildi.")


# Function to predict new code snippet
def predict_code_snippet(code_snippet):
    code_vectorized = vectorizer.transform([code_snippet])
    prediction = model.predict(code_vectorized)
    return "Malicious" if prediction[0] == 1 else "Safe"

# Test with a new input
# new_code = input("Enter a code snippet: ")
# print("The code is:", predict_code_snippet(new_code))


              precision    recall  f1-score   support

           0       0.96      0.97      0.97       137
           1       0.98      0.98      0.98       211

    accuracy                           0.97       348
   macro avg       0.97      0.97      0.97       348
weighted avg       0.97      0.97      0.97       348

Model ve TF-IDF vektörizer kaydedildi.
