## **-- MACHINE LEARNING --**

# LightGBM Classifier (Grid Search), with an ElasticNet Pipeline Fitted Alongside

In [27]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import ElasticNet
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
import joblib

# Load the dataset; the first CSV column is used as the DataFrame index.
train = pd.read_csv(r'E:\CYBERSENTINEL\ML\reste.csv', encoding='ISO-8859-1', index_col=0)

# Step 1: numeric features -> median imputation, then standardisation.
num_preprocessor = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Step 2: categorical features -> fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories unseen during fit.
cat_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Columns routed to each sub-pipeline. Columns of `train` that appear in neither
# list are dropped by the ColumnTransformer (its default `remainder` behaviour).
num_features = [
    'sport', 'dsport', 'dur', 'dbytes', 'sttl', 'dttl', 'sloss', 'sload', 'dload',
    'spkts', 'swin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
    'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt', 'dintpkt',
    'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
    'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src',
]
cat_features = ['srcip', 'dstip', 'proto', 'state', 'service']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocessor, num_features),
        ('cat', cat_preprocessor, cat_features)])

# Both model pipelines start with the same preprocessing step.
elastic_net = make_pipeline(preprocessor, ElasticNet(max_iter=100, random_state=1234))
# n_estimators=1 is only a placeholder: the grid search below overrides it.
lgbm = make_pipeline(preprocessor, LGBMClassifier(n_estimators=1, random_state=1234))

# Split features / target and hold out 25% of the rows for testing.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider stratify=y (this would change the reported numbers).
X = train.drop('label', axis=1)
y = train['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Fit the ElasticNet pipeline.
# NOTE(review): ElasticNet is a regressor fitted on the class label and nothing in
# this cell uses its predictions - confirm whether this fit is still needed.
elastic_net.fit(X_train, y_train)

# `lgbm` is deliberately NOT fitted on its own here: GridSearchCV clones the
# pipeline for every candidate/fold and refits the best candidate on the whole
# training split, so a separate 1-tree fit would be discarded work.
# Grid search over the number of trees with stratified 5-fold cross-validation.
# NOTE(review): with no `scoring` given, candidates are ranked by the estimator's
# default score (accuracy for classifiers) - confirm that suits the class imbalance.
skf = StratifiedKFold(n_splits=5)
parameters = {'lgbmclassifier__n_estimators': [10, 50, 100, 500]}
clf = GridSearchCV(lgbm, parameters, cv=skf)
clf.fit(X_train, y_train)

# Predict the held-out split with the refitted best pipeline.
predictions = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predictions))

# Persist the whole fitted search object (not just the best pipeline).
joblib.dump(clf, 'modelcode.pkl')



[LightGBM] [Info] Number of positive: 240672, number of negative: 1470118
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.329486 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5826
[LightGBM] [Info] Number of data points in the train set: 1710790, number of used features: 270
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.140679 -> initscore=-1.809663
[LightGBM] [Info] Start training from score -1.809663
[LightGBM] [Info] Number of positive: 192538, number of negative: 1176094
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.273171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5818
[LightGBM] [Info] Number of data points in the train set: 1368632, number of used features: 270
[Light

['modelcode.pkl']

In [28]:
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams rather than plain-text reprs.
set_config(display="diagram")

# Leave the fitted grid-search object as the last expression so the cell displays it.
clf

In [30]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the LightGBM grid-search predictions on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Rows are true classes, columns are predicted classes, so for the 0/1 labels:
#   [[true negatives,  false positives],
#    [false negatives, true positives]]
print('Matrice de confusion :', cm, sep='\n')

Matrice de confusion :
[[488547   1106]
 [  1539  79072]]


In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary scores for the LightGBM predictions on the held-out split.
# average='weighted' averages the per-class scores weighted by class support;
# with that weighting, recall is mathematically identical to accuracy, so these
# numbers mostly reflect the majority class (see classification_report for the
# per-class breakdown).
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
precision = precision_score(y_true=y_test, y_pred=predictions, average='weighted')
recall = recall_score(y_true=y_test, y_pred=predictions, average='weighted')
f1 = f1_score(y_true=y_test, y_pred=predictions, average='weighted')

# Report the four scores in a fixed order.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.995361797342985
Precision: 0.9953537077557112
Recall: 0.995361797342985
F1 Score: 0.9953565770091335


# Random Forest

In [32]:
# Reload the dataset from the same CSV used at the top of the notebook
# (first column becomes the index).
# NOTE(review): the Random Forest cells below reuse the existing X_train / X_test
# split, so this reload does not affect them - confirm whether it is still needed.
train = pd.read_csv(
    r'E:\CYBERSENTINEL\ML\reste.csv',
    encoding='ISO-8859-1',
    index_col=0,
)

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest pipeline: the shared preprocessing step followed by a
# 100-tree forest with a fixed seed.
rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=0),
)

# Fit on the training split, then label the held-out split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
predictions2 = rf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 for the Random Forest.
rf_report = classification_report(y_test, predictions2)
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    489653
           1       0.99      0.99      0.99     80611

    accuracy                           1.00    570264
   macro avg       0.99      0.99      0.99    570264
weighted avg       1.00      1.00      1.00    570264



In [37]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the Random Forest predictions on the held-out split
# (rows = true class, columns = predicted class).
cm2 = confusion_matrix(y_test, predictions2)

print('Matrice de confusion :')
print(cm2)

# Summary scores for the Random Forest. Every name carries a "2" suffix so the
# LightGBM scores computed earlier (accuracy, precision, recall, f1) are not
# overwritten and the two models can still be compared afterwards.
accuracy2 = accuracy_score(y_test, predictions2)
print(f'Accuracy: {accuracy2}')

# Support-weighted precision across both classes.
precision2 = precision_score(y_test, predictions2, average='weighted')
print(f'Precision: {precision2}')

# Support-weighted recall (numerically equal to accuracy with this averaging).
recall2 = recall_score(y_test, predictions2, average='weighted')
print(f'Recall: {recall2}')

# Support-weighted F1; stored as f1_2 rather than f1 to keep the LightGBM value intact.
f1_2 = f1_score(y_test, predictions2, average='weighted')
print(f'F1 Score: {f1_2}')

Matrice de confusion :
[[488714    939]
 [  1044  79567]]
Accuracy: 0.9965226631875763
Precision: 0.9965209080943219
Recall: 0.9965226631875763
F1 Score: 0.9965217164947541


In [38]:
# Ask scikit-learn to render estimators as diagrams rather than plain-text reprs.
set_config(display="diagram")

# Leave the fitted Random Forest pipeline as the last expression so the cell displays it.
rf