In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, confusion_matrix, accuracy_score
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [52]:
df_flow= pd.read_csv('GUIDE_Train.csv',nrows=100000)

# pre processing

In [53]:
df_flow.columns = df_flow.columns.str.lower()

# more then 50% null data columns
df_flow = df_flow.drop(columns=['resourcetype',
 'actiongrouped',
 'actiongranular',
 'threatfamily',
 'emailclusterid',
 'antispamdirection',
 'roles',
 'suspicionlevel',
 'lastverdict',
 'mitretechniques'])

# dorp null rows overall data set
df_flow = df_flow.dropna()

# datetime Feature extraction
df_flow['timestamp'] = pd.to_datetime(df_flow['timestamp'])

df_flow['year'] = df_flow['timestamp'].dt.year
df_flow['month'] = df_flow['timestamp'].dt.month
df_flow['day'] = df_flow['timestamp'].dt.day
df_flow['hour'] = df_flow['timestamp'].dt.hour
df_flow['minute'] = df_flow['timestamp'].dt.minute

df_flow.drop('timestamp', axis=1, inplace=True)


In [54]:
# assign the target column to last index
clumn = ['id', 'orgid', 'incidentid', 'alertid', 'detectorid', 'alerttitle',
       'category', 'entitytype', 'evidencerole', 'deviceid',
       'ipaddress', 'url', 'accountsid', 'accountupn', 'devicename',
       'networkmessageid', 'registrykey', 'registryvaluename',
       'registryvaluedata', 'applicationid', 'oauthapplicationid', 'filename',
       'resourceidname', 'osfamily', 'state', 'year', 'month', 'day', 'hour',
       'minute', 'incidentgrade']

df_flow = df_flow[clumn]

In [55]:
df_flow.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99475 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  99475 non-null  int64 
 1   orgid               99475 non-null  int64 
 2   incidentid          99475 non-null  int64 
 3   alertid             99475 non-null  int64 
 4   detectorid          99475 non-null  int64 
 5   alerttitle          99475 non-null  int64 
 6   category            99475 non-null  object
 7   entitytype          99475 non-null  object
 8   evidencerole        99475 non-null  object
 9   deviceid            99475 non-null  int64 
 10  ipaddress           99475 non-null  int64 
 11  url                 99475 non-null  int64 
 12  accountsid          99475 non-null  int64 
 13  accountupn          99475 non-null  int64 
 14  devicename          99475 non-null  int64 
 15  networkmessageid    99475 non-null  int64 
 16  registrykey         99475 n

In [56]:
df_flow['incidentgrade'].value_counts()

incidentgrade
BenignPositive    43024
TruePositive      34887
FalsePositive     21564
Name: count, dtype: int64

# class balancing

In [57]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Assuming your dataframe is called 'df' and the column is 'incidentgrade'

# Get the value counts
value_counts = df_flow['incidentgrade'].value_counts()

# Find the minimum count
min_count = value_counts.min()

# Create a list to store the balanced dataframes
balanced_dfs = []

# Undersample each class
for class_value in value_counts.index:
    class_df = df_flow[df_flow['incidentgrade'] == class_value]

    if len(class_df) > min_count:
        # Undersample
        undersampled_df = resample(class_df,
                                   replace=False,    # sample without replacement
                                   n_samples=min_count,
                                   random_state=42)  # reproducible results
        balanced_dfs.append(undersampled_df)
    else:
        # If this class is already at or below the minimum, keep all samples
        balanced_dfs.append(class_df)

# Combine the balanced dataframes
df_balanced = pd.concat(balanced_dfs)

# Verify the new class distribution
print(df_balanced['incidentgrade'].value_counts())

incidentgrade
BenignPositive    21564
TruePositive      21564
FalsePositive     21564
Name: count, dtype: int64


In [71]:
df_final = pd.read_csv('train_FE_out_cls_blnc.csv')

In [72]:
df_final.head(3)

Unnamed: 0,id,orgid,incidentid,alertid,detectorid,alerttitle,category,entitytype,evidencerole,deviceid,...,filename,resourceidname,osfamily,state,month,year,day,hour,minute,incidentgrade
0,214748366556,0,114,1344358,14,12,CredentialAccess,CloudLogonRequest,Related,98799,...,289573,3586,5,1445,6,2024,6,16,20,2
1,352187320015,0,122,949228,0,0,InitialAccess,CloudLogonSession,Related,98799,...,289573,3586,5,1445,6,2024,14,11,52,2
2,884763264312,216,82016,119304,6,5,InitialAccess,MailCluster,Related,98799,...,289573,3586,5,1445,6,2024,11,13,9,2


## LabelEncoder

In [87]:
categorical_column = df_final.select_dtypes(include='object').columns

le = LabelEncoder()
for col in categorical_column:
    df_final[col] = le.fit_transform(df_final[col])

X = df_final.iloc[:,:-1]
y = df_final.iloc[:,-1]


# train_test_split

In [88]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


# LogisticRegression

In [75]:
# Define the parameters in a dictionary
params = {
    'random_state': None,
    'multi_class': 'auto',
    'solver': 'lbfgs',
    'max_iter': 100
}
# Initialize the Logistic Regression model with the defined parameters
model = LogisticRegression(**params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report_dict = classification_report(y_test,y_pred)
print(report_dict)



              precision    recall  f1-score   support

           0       0.52      0.80      0.63      1462
           1       0.58      0.48      0.52      1462
           2       0.71      0.46      0.56      1462

    accuracy                           0.58      4386
   macro avg       0.60      0.58      0.57      4386
weighted avg       0.60      0.58      0.57      4386



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# random forest

In [106]:
#Define the parameters for Random Forest in a dictionary
rf_params = {
    'n_estimators': 100,
    'random_state': None,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

report = classification_report(y_test, y_pred_rf)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.94      0.91      1462
           1       0.92      0.92      0.92      1462
           2       1.00      0.94      0.97      1462

    accuracy                           0.93      4386
   macro avg       0.94      0.93      0.93      4386
weighted avg       0.94      0.93      0.93      4386



In [105]:
y_pred_ = rf_model.predict(X_test_)

report = classification_report(y_test_, y_pred_)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.99      0.67       434
           1       0.74      0.13      0.22       206
           2       0.97      0.31      0.47       360

    accuracy                           0.57      1000
   macro avg       0.74      0.48      0.45      1000
weighted avg       0.72      0.57      0.50      1000



In [90]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# Print the average accuracy and standard deviation
print("Cross-validation Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))


Cross-validation Accuracy: 0.93 (+/- 0.00)


# KNeighborsClassifier

In [77]:
# Define the parameters for KNN in a dictionary
knn_params = {
    'n_neighbors': 2,  # Number of neighbors to use
    'weights': 'uniform',  # Weight function used in prediction
    'algorithm': 'auto',  # Algorithm used to compute the nearest neighbors
    'p': 2  # Power parameter for the Minkowski metric (2 for Euclidean distance)
}

# Initialize the KNN model with the defined parameters
knn_model = KNeighborsClassifier(**knn_params)
knn_model.fit(X_train, y_train)

y_pred_knn = knn_model.predict(X_test)

knn_report = classification_report(y_test,y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.52      0.80      0.63      1462
           1       0.58      0.48      0.52      1462
           2       0.71      0.46      0.56      1462

    accuracy                           0.58      4386
   macro avg       0.60      0.58      0.57      4386
weighted avg       0.60      0.58      0.57      4386



# XGBClassifier

In [79]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_params = {
    'n_estimators': 100,  # Number of boosting rounds
    'max_depth': 3,  # Maximum depth of trees
    'learning_rate': 0.1,  # Step size shrinkage
    'subsample': 0.8,  # Proportion of samples to use for each tree
    'colsample_bytree': 0.8,  # Proportion of features to use for each tree
    'objective': 'multi:softmax',  # Multi-class classification objective
    'num_class': 3,  # Number of classes
    'eval_metric': 'mlogloss'  # Evaluation metric
}

xgb_model = XGBClassifier()

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

xgb_report = classification_report(y_test,y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.52      0.80      0.63      1462
           1       0.58      0.48      0.52      1462
           2       0.71      0.46      0.56      1462

    accuracy                           0.58      4386
   macro avg       0.60      0.58      0.57      4386
weighted avg       0.60      0.58      0.57      4386



# GradientBoostingClassifier

In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Define the parameters for Gradient Boosting in a dictionary
gb_params = {
    'n_estimators': 100,  # Number of boosting stages to be run
    'learning_rate': 0.1,  # Learning rate shrinks the contribution of each tree
    'max_depth': 3,  # Maximum depth of the individual trees
    'subsample': 1.0,  # Proportion of samples to use for fitting each individual tree
    'criterion': 'friedman_mse'  # Function to measure the quality of a split (default is 'friedman_mse')
}

gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

gb_report = classification_report(y_test,y_pred)
print(gb_report)

NameError: name 'X_train' is not defined

In [82]:
lr_params = {
    'random_state': None,
    'multi_class': 'auto',
    'solver': 'lbfgs',
    'max_iter': 100
}
rf_params = {
    'n_estimators': 100,
    'random_state': None,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True
}
knn_params = {
    'n_neighbors': 2,  # Number of neighbors to use
    'weights': 'uniform',  # Weight function used in prediction
    'algorithm': 'auto',  # Algorithm used to compute the nearest neighbors
    'p': 2  # Power parameter for the Minkowski metric (2 for Euclidean distance)
}
xgb_params = {
    'n_estimators': 100,  # Number of boosting rounds
    'max_depth': 3,  # Maximum depth of trees
    'learning_rate': 0.1,  # Step size shrinkage
    'subsample': 0.8,  # Proportion of samples to use for each tree
    'colsample_bytree': 0.8,  # Proportion of features to use for each tree
    'objective': 'multi:softmax',  # Multi-class classification objective
    'num_class': 3,  # Number of classes
    'eval_metric': 'mlogloss'  # Evaluation metric
}
gb_params = {
    'n_estimators': 100,  # Number of boosting stages to be run
    'learning_rate': 0.1,  # Learning rate shrinks the contribution of each tree
    'max_depth': 3,  # Maximum depth of the individual trees
    'subsample': 1.0,  # Proportion of samples to use for fitting each individual tree
    'criterion': 'friedman_mse'  # Function to measure the quality of a split (default is 'friedman_mse')
}
models = [(
    "Logistic regression",
    LogisticRegression(),
    lr_params,
    (X_train,y_train),
    (X_test,y_test)
),(
    "Random Forest",
    RandomForestClassifier(),
    rf_params,
    (X_train,y_train),
    (X_test,y_test)
),(
    "KNN",
    KNeighborsClassifier(),
    knn_params,
    (X_train,y_train),
    (X_test,y_test)
),(
    "XGBClassifier",
    XGBClassifier(),
    xgb_params,
    (X_train,y_train),
    (X_test,y_test)
),(
    "GradientBoostingClassifier",
    GradientBoostingClassifier(),
    gb_params,
    (X_train,y_train),
    (X_test,y_test)
)]

In [83]:
reports = []
for model_name,model,params,train_set,test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,y_pred,output_dict=True)
    reports.append(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# multiple model log

In [86]:
mlflow.set_experiment("train_final")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i ,element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    {}

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name",model_name)

        mlflow.log_metric("accuracy",report["accuracy"])
        mlflow.log_metric("recall_class_0",report['0']['recall'])
        mlflow.log_metric("recall_class_1",report['1']['recall'])
        mlflow.log_metric("recall_class_2",report['2']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, model_name)
        else:
            mlflow.sklearn.log_model(model, model_name)    
   

2024/09/05 11:54:54 INFO mlflow.tracking.fluent: Experiment with name 'train_final' does not exist. Creating a new experiment.


2024/09/05 11:54:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic regression at: http://127.0.0.1:5000/#/experiments/339399093014260649/runs/1196c8060f574a2182fb08860114f1fd.
2024/09/05 11:54:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/339399093014260649.
2024/09/05 11:55:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/339399093014260649/runs/1910e560bbb344c5b94845463f288622.
2024/09/05 11:55:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/339399093014260649.
2024/09/05 11:55:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN at: http://127.0.0.1:5000/#/experiments/339399093014260649/runs/cc58603d58f84ae2870a2624c9dde214.
2024/09/05 11:55:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/339399093014260649.

# single model log

In [None]:
mlflow.set_experiment("first Expriment")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics({
        "accuracy" : report_dict['accuracy'],
        "recall_class_0" : report_dict['0']['recall'],
        "recall_class_1" : report_dict['1']['recall'],
        "recall_class_2" : report_dict['2']['recall'],
        "f1_score_macro" : report_dict['macro avg']['f1-score']
        
    })
    mlflow.sklearn.log_model(rf_model,"Random forest")

2024/09/02 17:04:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-kit-813 at: http://127.0.0.1:5000/#/experiments/344090298825934818/runs/9ace973b3e7e4bef870370bbced208e2.
2024/09/02 17:04:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/344090298825934818.


# Random forest grid search

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_rf_model = grid_search.best_estimator_

# Predict on the test data
y_pred_best_rf = best_rf_model.predict(X_test)

# Generate classification report
best_rf_report = classification_report(y_test, y_pred_best_rf)

print("Best Parameters from Grid Search:")
print(grid_search.best_params_)
print("\nClassification Report of Best Model:")
print(best_rf_report)


# KNN k-value finding

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data (if not already split)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the range of k values to test
k_values = range(1, 31)

# Store the mean error for each k value
mean_errors = []

for k in k_values:
    # Initialize the KNN model with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # Fit the model on the training data
    knn.fit(X_train, y_train)
    
    # Predict on the validation data
    y_pred = knn.predict(X_val)
    
    # Calculate the mean squared error
    error = mean_squared_error(y_val, y_pred)
    
    # Store the error
    mean_errors.append(error)

# Find the k value with the minimum error
best_k = k_values[np.argmin(mean_errors)]
print(f"Best k value: {best_k}")

# Plotting the error vs. k values
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(k_values, mean_errors, marker='o')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Mean Squared Error')
plt.title('Error vs. k Value')
plt.show()
