# Random forest z dopracowanym skryptem do generowania danych pod walidację

In [None]:
!pip install scikit-learn
!pip install imblearn
!pip install scikit-learn
!pip install xgboost

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Load the three traffic captures from their CSV files
ddos_df = pd.read_csv('/content/ddos-tcp-syn-flood.csv')
normal_df = pd.read_csv('/content/normal-traffic.csv')
port_scan_df = pd.read_csv('/content/port-scanning.csv')

# Concatenate them into a single frame with a fresh 0..n-1 index
data = pd.concat([ddos_df, normal_df, port_scan_df], ignore_index=True)

# Convert frame-time to a numeric feature: parse the timestamps, take their
# integer representation and divide by 10**6 (milliseconds, assuming pandas
# stores them at nanosecond resolution -- TODO confirm for the pandas version
# in use). The dtype is pinned to int64 because plain `int` maps to the
# platform C long, which is 32-bit on some platforms and cannot hold
# datetime64 values.
data['frame-time'] = pd.to_datetime(data['frame-time']).astype('int64') / 10**6

# Przetwarzanie danych
def preprocess_data(df):
    """Encode the labels and standardise the feature columns of *df*.

    Args:
        df: DataFrame with an ``Attack_type`` label column. Every other
            column is used as a feature and is passed to ``StandardScaler``.
            The frame itself is left unmodified.

    Returns:
        A tuple ``(X, y, scaler, label_encoder)`` where ``X`` is the output of
        ``StandardScaler.fit_transform`` on the feature columns, ``y`` is a
        Series of integer-encoded labels aligned with ``df.index``, and
        ``scaler`` / ``label_encoder`` are the fitted transformers so the same
        mapping can be re-applied to new data.
    """
    # Encode Attack_type into a separate Series instead of overwriting the
    # caller's column, so the original string labels stay available in df.
    label_encoder = LabelEncoder()
    y = pd.Series(
        label_encoder.fit_transform(df['Attack_type']),
        index=df.index,
        name='Attack_type',
    )

    # Features are all columns except the label
    X = df.drop(columns=['Attack_type'])

    # Standardise the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, scaler, label_encoder

X, y, scaler, label_encoder = preprocess_data(data)

# Hold out the test set BEFORE oversampling. SMOTE synthesises new points by
# interpolating between existing ones; resampling first would let points
# derived from test rows end up in the training set and inflate the metrics.
# The split is stratified so every class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance only the training portion with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Build the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the balanced training data
model.fit(X_resampled, y_resampled)

# Evaluate on the untouched (non-resampled) test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_report = classification_report(y_test, y_pred_test, target_names=label_encoder.classes_)
test_conf_matrix = confusion_matrix(y_test, y_pred_test)

print(f'Test accuracy: {test_accuracy:.4f}')
print('Test Classification Report:')
print(test_report)
print('Test Confusion Matrix:')
print(test_conf_matrix)

# Persist the model together with the fitted scaler and label encoder so the
# exact same preprocessing can be re-applied at validation time
with open('/content/network_attack_detector_rf.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('/content/scaler_rf.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('/content/label_encoder_rf.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Model został wytrenowany i zapisany.")

# Generowanie danych do walidacji

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def random_date(start, end):
    """Return *start* shifted by a random whole number of seconds up to *end*.

    The offset is drawn uniformly from ``0`` to the truncated number of
    seconds between the two datetimes, both bounds included.
    """
    span_seconds = int((end - start).total_seconds())
    offset = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset)

# Time window from which the synthetic frame timestamps are drawn
start_date = datetime(year=2024, month=5, day=13, hour=16)
end_date = datetime(year=2024, month=5, day=13, hour=19, minute=2, second=37)

def generate_sample(traffic_type):
    """Build one synthetic traffic record for the given class.

    Args:
        traffic_type: ``"Normal"``, ``"DDoS_TCP"`` or ``"Port_Scanning"``.

    Returns:
        A list ``[frame_time, ip_src_host, ip_dst_host, tcp_connection_syn,
        tcp_connection_synack, tcp_dstport, tcp_len, tcp_seq, traffic_type]``
        with values drawn from class-specific ranges.

    Raises:
        ValueError: If ``traffic_type`` is not one of the three known classes.
    """
    # The timestamp is drawn before the class is checked, so the random
    # generator is consumed in a fixed order for every call.
    stamp = random_date(start_date, end_date)
    frame_time = stamp.strftime('%Y-%m-%dT%H:%M:%S.%f%z')

    # Host identifiers span a much wider range for the DDoS class
    if traffic_type == "DDoS_TCP":
        host_max = 300000
    elif traffic_type in ("Normal", "Port_Scanning"):
        host_max = 10
    else:
        raise ValueError("Invalid traffic type")

    # The first four fields are drawn the same way for every class
    binary_flag = [0.0, 1.0]
    src_host = random.randint(0, host_max)
    dst_host = random.randint(0, host_max)
    syn = random.choice(binary_flag)
    synack = random.choice(binary_flag)

    # Port, payload length and sequence number follow per-class rules
    if traffic_type == "Normal":
        dstport = random.uniform(0, 7000)
        length = random.uniform(0, 1500)
        seq = random.uniform(0, 500000)
    elif traffic_type == "DDoS_TCP":
        dstport = random.uniform(1000, 70000)
        length = random.choice([0.0, 120.0])
        seq = random.choice(binary_flag)
    else:  # "Port_Scanning"
        dstport = random.uniform(0, 10000)
        length = random.uniform(0, 0)  # always 0.0, but still one RNG draw
        seq = random.choice(binary_flag)

    return [frame_time, src_host, dst_host, syn, synack, dstport, length, seq, traffic_type]

def generate_random_data(num_samples):
    """Return a DataFrame of ``num_samples`` synthetic traffic records.

    For each row a traffic class is picked at random from the three known
    classes and a sample of that class is generated.
    """
    traffic_classes = ["Normal", "DDoS_TCP", "Port_Scanning"]
    rows = [
        generate_sample(random.choice(traffic_classes))
        for _ in range(num_samples)
    ]

    header = [
        'frame-time',
        'ip-src_host',
        'ip-dst_host',
        'tcp-connection-syn',
        'tcp-connection-synack',
        'tcp-dstport',
        'tcp-len',
        'tcp-seq',
        'Attack_type',
    ]
    return pd.DataFrame(rows, columns=header)

# Generate 10000 synthetic records and store them as the validation CSV
validation_data = generate_random_data(num_samples=10000)
validation_data.to_csv(path_or_buf='/content/validation-data.csv', index=False)

# Walidacja modelu

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# numpy is needed for np.array2string below; import it here so this cell does
# not depend on an earlier cell having been executed.
import numpy as np

# Load the new validation data set
validation_df = pd.read_csv('/content/validation-data.csv')

# Load the saved model, scaler and label encoder.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# artifacts that were written by this notebook / a trusted source.
with open('/content/network_attack_detector_rf.pkl', 'rb') as f:
    model = pickle.load(f)
with open('/content/scaler_rf.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('/content/label_encoder_rf.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Preprocess the validation data the same way as the training data.
# int64 is requested explicitly because plain `int` maps to the platform
# C long, which is 32-bit on some platforms and cannot hold datetime64 values.
validation_df['frame-time'] = pd.to_datetime(validation_df['frame-time']).astype('int64') / 10**6
X_validation = validation_df.drop(['Attack_type'], axis=1)
y_true = label_encoder.transform(validation_df['Attack_type'])

# Scale the validation features with the scaler fitted during training
X_validation = scaler.transform(X_validation)

# Predict labels for the validation data
y_pred_validation = model.predict(X_validation)

# Map the predicted integer labels back to the original label names
y_pred_labels = label_encoder.inverse_transform(y_pred_validation)

# Attach the predicted labels to the validation frame
validation_df['Predicted_Attack_type'] = y_pred_labels

# Show a few example rows
print(validation_df.head())

# Score the predictions
accuracy = accuracy_score(y_true, y_pred_validation)
report = classification_report(y_true, y_pred_validation, target_names=label_encoder.classes_)
conf_matrix = confusion_matrix(y_true, y_pred_validation)

# Print the evaluation results
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(conf_matrix)

# Write the evaluation results to a text file
with open('/content/validation_evaluation.txt', 'w') as f:
    f.write(f'Accuracy: {accuracy:.4f}\n')
    f.write('Classification Report:\n')
    f.write(report)
    f.write('Confusion Matrix:\n')
    f.write(np.array2string(conf_matrix))

print("Ocena wyników walidacji została zapisana do pliku validation_evaluation.txt")