In [None]:
# Temel kütüphaneler
import numpy as np
import pandas as pd

# Görselleştirme kütüphaneleri
import matplotlib.pyplot as plt
import seaborn as sns

# Veri işleme ve ölçekleme
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

# Model değerlendirme ve bölme
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# Makine öğrenmesi modelleri
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import xgboost as xgb

# TensorFlow ve Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Ekstra kütüphaneler
import re

In [None]:
# Load the raw dataset from CSV; the bare `.shape` expression on the last line
# is displayed as the cell output (rows, columns).
# NOTE(review): the path is absolute and looks environment-specific — confirm before re-running elsewhere.
original_df = pd.read_csv("/content/sample_data/Obfuscated-MalMem2022.csv")
original_df.shape

In [None]:
print("Veri Seti Bilgileri:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
original_df.info()

In [None]:
# Print summary statistics (DataFrame.describe) of the raw frame as plain text.
print("\nVeri Seti Tanımlayıcı İstatistikleri:")
print(original_df.describe())

In [None]:
# Print the first five rows of the raw frame as plain text.
print("\nİlk Beş Kayıt:")
print(original_df.head())

In [None]:
def find_category(column):
    """Return the part of ``column`` that precedes its first "-".

    A value that contains no "-" is returned unchanged.
    """
    return column.split("-")[0] if "-" in column else column

In [None]:
def find_category_name(column):
    """Return the segment of ``column`` between its first and second "-".

    A value that contains no "-" is returned unchanged.
    """
    # Guard clause: nothing to split
    if "-" not in column:
        return column
    segments = column.split("-")
    return segments[1]

In [None]:
# Work on a copy so the cleaning steps below leave original_df untouched;
# the trailing `.shape` is displayed as the cell output.
df = original_df.copy()
df.shape

In [None]:
# Report every column that holds exactly one distinct value
for column in df.columns:
    if df[column].nunique() != 1:
        continue
    print(f"All values in {column} are identical.")

In [None]:
# Columns to remove because they carry a single value (hard-coded list)
columns_to_drop = [
    'pslist.nprocs64bit',
    'handles.nport',
    'svcscan.interactive_process_services',
]

# Remove them from df in place
df.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
# Per-column count of null entries
missing_values_by_column = df.isnull().sum()

# Show the per-column breakdown only when at least one value is missing
if missing_values_by_column.sum() != 0:
    print("\nNumber of missing values by Column:")
    print(missing_values_by_column)
else:
    print("\nThere is no missing values in dataset")

In [None]:
# Print the number of duplicate rows (rows identical to an earlier row)
print("\nNumber of Duplicate Rows:", df.duplicated().sum())

# Remove the duplicates from df in place; the index is not reset afterwards
df.drop_duplicates(inplace=True)

In [None]:
# Cast the label column to a categorical dtype
df["Class"] = df["Class"].astype("category")

# One-hot encode "Class". drop_first=True drops the indicator of the first
# category level, so a two-level label ends up as one indicator column
# (later cells read it as 'Class_Malware').
df = pd.get_dummies(df, columns=["Class"], drop_first=True)

In [None]:
# Derive both label columns from the raw 'Category' string.
# The former 'Unknown' placeholder assignments were dead stores: each column
# was overwritten in full by the apply() result right after being created.
df['category_name'] = df['Category'].apply(find_category)
df['subcategory_name'] = df['Category'].apply(find_category_name)

In [None]:
# Number of rows per category_name value
df["category_name"].value_counts()

In [None]:
# Bar chart of the row count for each category_name value
sns.countplot(x=df["category_name"])

In [None]:
# Pie chart of category_name shares, labelled with percentages to two decimals
df["category_name"].value_counts().plot(kind="pie", autopct="%.2f%%")


In [None]:
# Number of rows per subcategory_name value
df["subcategory_name"].value_counts()

In [None]:
# Bar chart of the row count for each subcategory_name value
plt.figure()
sns.countplot(x=df["subcategory_name"])
# Rotate the tick labels to vertical
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart of subcategory_name shares, labelled with percentages to two decimals
df["subcategory_name"].value_counts().plot(kind="pie", autopct="%.2f%%")

In [None]:
# Deep copy of the cleaned frame; the feature-engineering cells below modify CE_df, not df
CE_df = df.copy(deep=True)

In [None]:
# Combined thread/handler feature: sum of the 'pslist.avg_threads' and
# 'pslist.avg_handlers' columns
CE_df['avg_threads_handlers'] = CE_df['pslist.avg_threads'] + CE_df['pslist.avg_handlers']

In [None]:
# Total handle count: row-wise sum of the handles.* columns listed below
CE_df['total_handles'] = CE_df[
    [# 'handles.nport' is not summed: it was removed earlier with the constant-valued columns
     'handles.nfile', 'handles.nevent', 'handles.ndesktop', 'handles.nkey',
     'handles.nthread', 'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
     'handles.nsection', 'handles.nmutant']
].sum(axis=1)

In [None]:
# Average "not loaded" module count: row-wise mean of the three
# ldrmodules.not_in_* columns
CE_df['average_not_loaded_modules'] = CE_df[
    ['ldrmodules.not_in_load', 'ldrmodules.not_in_init', 'ldrmodules.not_in_mem']
].mean(axis=1)

In [None]:
# Total hidden-process count: row-wise sum of the psxview.not_in_* columns
CE_df['total_hidden_processes'] = CE_df[
    ['psxview.not_in_pslist', 'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
     'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles', 'psxview.not_in_session',
     'psxview.not_in_deskthrd']
].sum(axis=1)

In [None]:
# Drop the source columns that were folded into the aggregate features above.
# NOTE(review): 'pslist.nproc' is dropped here although none of the aggregate
# features in the preceding cells uses it — confirm this is intended.
columns_to_drop = [
    'pslist.nproc', 'pslist.avg_threads', 'pslist.avg_handlers',
    'handles.nfile','handles.nevent', 'handles.ndesktop', 'handles.nkey',
    'handles.nthread', 'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
    'handles.nsection', 'handles.nmutant',
    'ldrmodules.not_in_load', 'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
    'psxview.not_in_pslist', 'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
    'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles', 'psxview.not_in_session',
    'psxview.not_in_deskthrd'
]
# 'handles.nport' and 'pslist.nprocs64bit' are not listed: they were already removed with the constant-valued columns
CE_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Column/dtype summary of the engineered frame
CE_df.info()

In [None]:
def split_data(X, y, test_size=0.2, val_size=0.25, random_state=1, stratify=False):
    """Encode the target as integers and split into train/validation/test sets.

    Classes are numbered 0..n_classes-1 in sorted order, so the code given to
    each class does not depend on the row order of ``y`` (numbering by first
    appearance would change whenever the data is reordered).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns, one row per sample.
    y : pandas.Series
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Share of all rows held out as the test set.
    val_size : float, default 0.25
        Share of the remaining rows held out as the validation set
        (0.25 of the remaining 80% is 20% of all rows with the defaults).
    random_state : int, default 1
        Seed used for both shuffles.
    stratify : bool, default False
        When True, both splits are stratified on the encoded target.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``y_*``
        parts hold the integer class codes.
    """
    # sort=True makes the class -> code assignment deterministic
    codes, _ = pd.factorize(y, sort=True)
    y_mapped = pd.Series(codes, index=y.index, name=y.name)

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y_mapped,
        test_size=test_size,
        random_state=random_state,
        stratify=y_mapped if stratify else None,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_size,
        random_state=random_state,
        stratify=y_train_val if stratify else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def print_classification_report(model_name, val_accuracy, val_report, test_accuracy, test_report):
    """Print accuracy plus weighted-average precision/recall/F1 for both splits.

    ``val_report`` and ``test_report`` are dictionaries with a 'weighted avg'
    entry that holds 'precision', 'recall' and 'f1-score' values. Every number
    is printed with four decimals, validation lines first.
    """
    print(f"{model_name} - Validation Accuracy: {val_accuracy:.4f}")
    val_weighted = val_report['weighted avg']
    print(f"{model_name} - Validation Precision: {val_weighted['precision']:.4f}")
    print(f"{model_name} - Validation Recall: {val_weighted['recall']:.4f}")
    print(f"{model_name} - Validation F1 Score: {val_weighted['f1-score']:.4f}")

    print(f"{model_name} - Test Accuracy: {test_accuracy:.4f}")
    test_weighted = test_report['weighted avg']
    print(f"{model_name} - Test Precision: {test_weighted['precision']:.4f}")
    print(f"{model_name} - Test Recall: {test_weighted['recall']:.4f}")
    print(f"{model_name} - Test F1 Score: {test_weighted['f1-score']:.4f}")


In [None]:
def plot_confusion_matrices(val_conf_matrix, test_conf_matrix, model_name, labels):
    """Draw the validation and test confusion matrices side by side.

    Each matrix is rendered as an annotated heatmap whose tick labels on both
    axes come from ``labels``; validation is the left panel, test the right.
    """
    panels = (
        (val_conf_matrix, f'{model_name} - Validation Confusion Matrix'),
        (test_conf_matrix, f'{model_name} - Test Confusion Matrix'),
    )

    plt.figure(figsize=(12, 5))
    for position, (matrix, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.heatmap(matrix, annot=True, cmap='Blues', fmt='g', xticklabels=labels, yticklabels=labels)
        plt.title(title)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')

    plt.tight_layout()
    plt.show()

In [None]:
def calculate_metrics(y_true, y_pred):
    """Return ``(precision, recall, f1, accuracy)`` for the given predictions.

    Precision, recall and F1 use weighted averaging across classes.
    """
    averaging = {'average': 'weighted'}
    return (
        precision_score(y_true, y_pred, **averaging),
        recall_score(y_true, y_pred, **averaging),
        f1_score(y_true, y_pred, **averaging),
        accuracy_score(y_true, y_pred),
    )

In [None]:
def print_model_performance(model_name, val_metrics, test_metrics):
    """Print validation and test metrics for one model.

    ``val_metrics`` and ``test_metrics`` are 4-tuples ordered as
    ``(precision, recall, f1, accuracy)``. Accuracy is printed first and every
    value uses four decimals.
    """
    def _show(header, precision, recall, f1, accuracy):
        # One five-line section: header, then the four metrics
        print(header)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    # Unpack both tuples before printing anything
    val_p, val_r, val_f, val_a = val_metrics
    test_p, test_r, test_f, test_a = test_metrics

    _show(f"\n{model_name} Validation Performance:", val_p, val_r, val_f, val_a)
    _show(f"\n{model_name} Test Performance:", test_p, test_r, test_f, test_a)

In [None]:
# Binary target: the 'Class_Malware' indicator column (copied, so it is independent of CE_df)
y = CE_df['Class_Malware'].copy()
# Features: everything except the raw label string and the derived label columns
X = CE_df.drop(columns=["Category", "Class_Malware", "category_name", "subcategory_name"])

In [None]:
def logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a logistic regression on standardized features and report results.

    The features are standardized with statistics learned from the training
    split only (the same treatment dnn_model applies), because the solver is
    sensitive to feature scale and converges poorly on raw inputs.
    Prints validation/test metrics and plots both confusion matrices;
    returns nothing.
    """
    # Fit the scaler on the training split only so no validation/test
    # information leaks into the model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_scaled, y_train)

    y_val_pred = model.predict(X_val_scaled)
    y_test_pred = model.predict(X_test_scaled)

    val_metrics = calculate_metrics(y_val, y_val_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    val_conf_matrix = confusion_matrix(y_val, y_val_pred)
    test_conf_matrix = confusion_matrix(y_test, y_test_pred)

    print_model_performance('Logistic Regression', val_metrics, test_metrics)
    plot_confusion_matrices(val_conf_matrix, test_conf_matrix, 'Logistic Regression', model.classes_)

In [None]:
def svm_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a linear-kernel SVM on standardized features and report results.

    SVMs are not scale invariant, so the features are standardized with
    statistics learned from the training split only (the same treatment
    dnn_model applies). Prints validation/test metrics and plots both
    confusion matrices; returns nothing.
    """
    # Fit the scaler on the training split only so no validation/test
    # information leaks into the model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    model = SVC(kernel='linear', random_state=42)
    model.fit(X_train_scaled, y_train)

    y_val_pred = model.predict(X_val_scaled)
    y_test_pred = model.predict(X_test_scaled)

    val_metrics = calculate_metrics(y_val, y_val_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    val_conf_matrix = confusion_matrix(y_val, y_val_pred)
    test_conf_matrix = confusion_matrix(y_test, y_test_pred)

    print_model_performance('SVM', val_metrics, test_metrics)
    plot_confusion_matrices(val_conf_matrix, test_conf_matrix, 'SVM', model.classes_)

In [None]:
def random_forest_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a 100-tree random forest and report validation/test results.

    Prints the metrics for both splits and plots both confusion matrices;
    returns nothing.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    val_predictions = forest.predict(X_val)
    test_predictions = forest.predict(X_test)

    # (metrics, confusion matrix) per split, validation first
    scores = [calculate_metrics(y_val, val_predictions), calculate_metrics(y_test, test_predictions)]
    matrices = [confusion_matrix(y_val, val_predictions), confusion_matrix(y_test, test_predictions)]

    print_model_performance('Random Forest', scores[0], scores[1])
    plot_confusion_matrices(matrices[0], matrices[1], 'Random Forest', forest.classes_)

In [None]:
def gradient_boosting_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit a gradient-boosting classifier and report validation/test results.

    Uses 100 depth-1 estimators with a learning rate of 1.0. Prints the metrics
    for both splits and plots both confusion matrices; returns nothing.
    """
    booster = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        random_state=42,
    )
    booster.fit(X_train, y_train)

    val_predictions = booster.predict(X_val)
    test_predictions = booster.predict(X_test)

    # (metrics, confusion matrix) per split, validation first
    scores = [calculate_metrics(y_val, val_predictions), calculate_metrics(y_test, test_predictions)]
    matrices = [confusion_matrix(y_val, val_predictions), confusion_matrix(y_test, test_predictions)]

    print_model_performance('Gradient Boosting', scores[0], scores[1])
    plot_confusion_matrices(matrices[0], matrices[1], 'Gradient Boosting', booster.classes_)

In [None]:
def dnn_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """Train a feed-forward binary classifier and report validation/test results.

    Features are standardized with statistics from the training split. The
    network has three ReLU hidden layers (128, 64, 32 units) with dropout after
    the first two and a single sigmoid output; predictions are thresholded at
    0.5. Prints the metrics, plots both confusion matrices and the
    loss/accuracy curves; returns nothing.
    """
    # Scaling statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

    network = Sequential([
        Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop once validation loss has not improved for 5 epochs and keep the best weights
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = network.fit(
        X_train_scaled,
        y_train,
        epochs=20,
        batch_size=32,
        validation_data=(X_val_scaled, y_val),
        callbacks=[stopper],
    )

    # Threshold the sigmoid outputs at 0.5 to obtain class predictions
    val_predictions = (network.predict(X_val_scaled) > 0.5).astype("int32")
    test_predictions = (network.predict(X_test_scaled) > 0.5).astype("int32")

    scores = [calculate_metrics(y_val, val_predictions), calculate_metrics(y_test, test_predictions)]
    matrices = [confusion_matrix(y_val, val_predictions), confusion_matrix(y_test, test_predictions)]

    print_model_performance('Deep Neural Network', scores[0], scores[1])
    plot_confusion_matrices(matrices[0], matrices[1], 'Deep Neural Network', ["Benign", "Malicious"])

    # Training curves: loss on the left, accuracy on the right
    curves = (
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    )
    plt.figure(figsize=(12, 4))
    for position, (train_key, val_key, train_label, val_label, title) in enumerate(curves, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(title)
        plt.legend()

    plt.show()

In [None]:
 # Train/validation/test split for the binary target defined in the earlier X/y cell.
 # NOTE(review): this statement carries one leading space; plain Python rejects an indented top-level statement — confirm and remove if unintentional.
 X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

In [None]:
# Logistic Regression model: fit on the training split, report validation/test results
logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# SVM model: fit on the training split, report validation/test results
svm_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# Random Forest model: fit on the training split, report validation/test results
random_forest_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# Gradient Boosting model: fit on the training split, report validation/test results
gradient_boosting_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# DNN model: train with the validation split for early stopping, report validation/test results
dnn_model(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# Multi-class target: the 'category_name' column (copied, so it is independent of CE_df)
y = CE_df['category_name'].copy()
# Features: everything except the raw label string and the derived label columns
X = CE_df.drop(columns=["Category", "Class_Malware", "category_name", "subcategory_name"])

In [None]:
def cross_validation_xgb(X_train, X_val, X_test, y_train, y_val, y_test):
    """Grid-search an XGBoost classifier and evaluate the best estimator.

    The search uses 3-fold cross-validation on the training split. The best
    estimator is then scored on the validation and test splits.

    Returns
    -------
    tuple
        ``(clf, y_val_pred, y_test_pred, val_accuracy, val_report,
        test_accuracy, test_report, importance_df)`` where the reports are
        ``classification_report`` dictionaries and ``importance_df`` holds the
        20 highest feature importances ('feature', 'importance'), descending.
    """
    search_space = {
        'objective': ['multi:softmax'],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 8],
        'tree_method': ['hist'],
    }

    search = GridSearchCV(
        estimator=xgb.XGBClassifier(random_state=42),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    val_predictions = best_model.predict(X_val)
    test_predictions = best_model.predict(X_test)

    val_accuracy = accuracy_score(y_val, val_predictions)
    val_report = classification_report(y_val, val_predictions, output_dict=True)

    test_accuracy = accuracy_score(y_test, test_predictions)
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # Keep only the 20 most important features, highest first
    top_features = (
        pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_,
        })
        .sort_values(by='importance', ascending=False)
        .head(20)
    )

    return (
        best_model,
        val_predictions,
        test_predictions,
        val_accuracy,
        val_report,
        test_accuracy,
        test_report,
        top_features,
    )

In [None]:
def cross_validation_dt(X_train, X_val, X_test, y_train, y_val, y_test, important_features):
    """Grid-search a decision tree on selected features and evaluate the best one.

    Only the columns named in ``important_features`` are used. The search uses
    3-fold cross-validation on the training split.

    Returns
    -------
    tuple
        ``(clf, y_val_pred, val_accuracy, val_report, val_conf_matrix,
        y_test_pred, test_accuracy, test_report, test_conf_matrix)``; the
        reports are ``classification_report`` dictionaries.
    """
    train_inputs = X_train[important_features]
    val_inputs = X_val[important_features]
    test_inputs = X_test[important_features]

    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),
        param_grid={
            'criterion': ['gini', 'log_loss'],
            'splitter': ['best', 'random'],
            'min_samples_leaf': [100, 200, 300],
            'max_depth': [3, 5, 8],
        },
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(train_inputs, y_train)
    best_tree = search.best_estimator_

    val_predictions = best_tree.predict(val_inputs)
    test_predictions = best_tree.predict(test_inputs)

    return (
        best_tree,
        val_predictions,
        accuracy_score(y_val, val_predictions),
        classification_report(y_val, val_predictions, output_dict=True),
        confusion_matrix(y_val, val_predictions),
        test_predictions,
        accuracy_score(y_test, test_predictions),
        classification_report(y_test, test_predictions, output_dict=True),
        confusion_matrix(y_test, test_predictions),
    )

In [None]:
def cross_validation_knn(X_train, X_val, X_test, y_train, y_val, y_test, important_features):
    """Grid-search a k-nearest-neighbours classifier on selected features.

    Only the columns named in ``important_features`` are used. The search uses
    3-fold cross-validation on the training split; the best estimator is then
    scored on the validation and test splits.

    Returns
    -------
    tuple
        ``(clf, y_val_pred, val_accuracy, val_report, val_conf_matrix,
        y_test_pred, test_accuracy, test_report, test_conf_matrix)``; the
        reports are ``classification_report`` dictionaries.
    """
    train_inputs = X_train[important_features]
    val_inputs = X_val[important_features]
    test_inputs = X_test[important_features]

    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid={
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan'],
        },
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(train_inputs, y_train)
    best_knn = search.best_estimator_

    val_predictions = best_knn.predict(val_inputs)
    test_predictions = best_knn.predict(test_inputs)

    return (
        best_knn,
        val_predictions,
        accuracy_score(y_val, val_predictions),
        classification_report(y_val, val_predictions, output_dict=True),
        confusion_matrix(y_val, val_predictions),
        test_predictions,
        accuracy_score(y_test, test_predictions),
        classification_report(y_test, test_predictions, output_dict=True),
        confusion_matrix(y_test, test_predictions),
    )


In [None]:
def cross_validation_rf(X_train, X_val, X_test, y_train, y_val, y_test, important_features):
    """Grid-search a random forest on selected features and evaluate the best one.

    Only the columns named in ``important_features`` are used. The search uses
    3-fold cross-validation on the training split; the best estimator is then
    scored on the validation and test splits.

    Returns
    -------
    tuple
        ``(clf, y_val_pred, val_accuracy, val_report, val_conf_matrix,
        y_test_pred, test_accuracy, test_report, test_conf_matrix)``; the
        reports are ``classification_report`` dictionaries.
    """
    train_inputs = X_train[important_features]
    val_inputs = X_val[important_features]
    test_inputs = X_test[important_features]

    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 8, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False],
        },
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(train_inputs, y_train)
    best_forest = search.best_estimator_

    val_predictions = best_forest.predict(val_inputs)
    test_predictions = best_forest.predict(test_inputs)

    return (
        best_forest,
        val_predictions,
        accuracy_score(y_val, val_predictions),
        classification_report(y_val, val_predictions, output_dict=True),
        confusion_matrix(y_val, val_predictions),
        test_predictions,
        accuracy_score(y_test, test_predictions),
        classification_report(y_test, test_predictions, output_dict=True),
        confusion_matrix(y_test, test_predictions),
    )

In [None]:
# Split data for 4-class classification (target: category_name) into train/validation/test sets
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

In [None]:
# XGBoost classifier: grid search on the training split, then validation/test evaluation
(
    clf_xgb,
    y_val_pred_xgb,
    y_test_pred_xgb,
    val_accuracy_xgb,
    val_report_xgb,
    test_accuracy_xgb,
    test_report_xgb,
    importance_df,
) = cross_validation_xgb(X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the XGBoost run
print_classification_report("XGBoost", val_accuracy_xgb, val_report_xgb, test_accuracy_xgb, test_report_xgb)

In [None]:
# Names of the features in the XGBoost importance table (its top-20 rows);
# the Decision Tree, Random Forest and KNN searches below are restricted to these columns
important_features = importance_df['feature'].tolist()

In [None]:
# Decision Tree classifier restricted to the selected features
(
    clf,
    y_val_pred,
    val_accuracy_dt,
    val_report_dt,
    val_conf_matrix,
    y_test_pred,
    test_accuracy_dt,
    test_report_dt,
    test_conf_matrix,
) = cross_validation_dt(X_train, X_val, X_test, y_train, y_val, y_test, important_features)

In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the Decision Tree run
print_classification_report("Decision Tree", val_accuracy_dt, val_report_dt, test_accuracy_dt, test_report_dt)

In [None]:
# Random Forest classifier restricted to the selected features
(
    clf_rf,
    y_val_pred_rf,
    val_accuracy_rf,
    val_report_rf,
    val_conf_matrix_rf,
    y_test_pred_rf,
    test_accuracy_rf,
    test_report_rf,
    test_conf_matrix_rf,
) = cross_validation_rf(X_train, X_val, X_test, y_train, y_val, y_test, important_features)


In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the Random Forest run
print_classification_report("Random Forest", val_accuracy_rf, val_report_rf, test_accuracy_rf, test_report_rf)

In [None]:
# KNN classifier restricted to the selected features
(
    clf_knn,
    y_val_pred_knn,
    val_accuracy_knn,
    val_report_knn,
    val_conf_matrix_knn,
    y_test_pred_knn,
    test_accuracy_knn,
    test_report_knn,
    test_conf_matrix_knn,
) = cross_validation_knn(X_train, X_val, X_test, y_train, y_val, y_test, important_features)


In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the KNN run
print_classification_report("KNN", val_accuracy_knn, val_report_knn, test_accuracy_knn, test_report_knn)

In [None]:
# 16 Class Classification
# Target: the 'subcategory_name' column (copied, so it is independent of CE_df)
y = CE_df['subcategory_name'].copy()
# Features: everything except the raw label string and the derived label columns
X = CE_df.drop(columns=["Category", "Class_Malware", "category_name", "subcategory_name"])

In [None]:
# Train/validation/test split for the subcategory_name target
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

In [None]:
# XGBoost classifier: grid search on the training split, then validation/test evaluation
clf_xgb, y_val_pred_xgb, y_test_pred_xgb, val_accuracy_xgb, val_report_xgb, test_accuracy_xgb, test_report_xgb, importance_df = cross_validation_xgb(X_train, X_val, X_test, y_train, y_val, y_test)

# Refresh the selected feature list from THIS run. Without it, the Decision
# Tree / Random Forest / KNN cells for the subcategory target would silently
# reuse the list computed for the category_name target.
important_features = importance_df['feature'].tolist()

In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the XGBoost run
print_classification_report("XGBoost", val_accuracy_xgb, val_report_xgb, test_accuracy_xgb, test_report_xgb)

In [None]:
# Decision Tree classifier restricted to the columns in important_features
(
    clf,
    y_val_pred,
    val_accuracy_dt,
    val_report_dt,
    val_conf_matrix,
    y_test_pred,
    test_accuracy_dt,
    test_report_dt,
    test_conf_matrix,
) = cross_validation_dt(X_train, X_val, X_test, y_train, y_val, y_test, important_features)

In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the Decision Tree run
print_classification_report("Decision Tree", val_accuracy_dt, val_report_dt, test_accuracy_dt, test_report_dt)

In [None]:
# Random Forest classifier restricted to the columns in important_features
(
    clf_rf,
    y_val_pred_rf,
    val_accuracy_rf,
    val_report_rf,
    val_conf_matrix_rf,
    y_test_pred_rf,
    test_accuracy_rf,
    test_report_rf,
    test_conf_matrix_rf,
) = cross_validation_rf(X_train, X_val, X_test, y_train, y_val, y_test, important_features)


In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the Random Forest run
print_classification_report("Random Forest", val_accuracy_rf, val_report_rf, test_accuracy_rf, test_report_rf)

In [None]:
# KNN classifier restricted to the columns in important_features
(
    clf_knn,
    y_val_pred_knn,
    val_accuracy_knn,
    val_report_knn,
    val_conf_matrix_knn,
    y_test_pred_knn,
    test_accuracy_knn,
    test_report_knn,
    test_conf_matrix_knn,
) = cross_validation_knn(X_train, X_val, X_test, y_train, y_val, y_test, important_features)


In [None]:
# Print accuracy and weighted-average precision/recall/F1 of the KNN run
print_classification_report("KNN", val_accuracy_knn, val_report_knn, test_accuracy_knn, test_report_knn)