In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, recall_score, confusion_matrix, accuracy_score
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [18]:
df_test = pd.read_csv("GUIDE_Test.csv",nrows=100000)

In [19]:
df_test.columns = df_test.columns.str.lower()

# more then 50% null data columns
df_test = df_test.drop(columns=['resourcetype',
 'actiongrouped',
 'actiongranular',
 'threatfamily',
 'emailclusterid',
 'antispamdirection',
 'roles',
 'suspicionlevel',
 'lastverdict',
 'mitretechniques'])

# dorp null rows overall data set
df_test = df_test.dropna()

# datetime Feature extraction
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

df_test['year'] = df_test['timestamp'].dt.year
df_test['month'] = df_test['timestamp'].dt.month
df_test['day'] = df_test['timestamp'].dt.day
df_test['hour'] = df_test['timestamp'].dt.hour
df_test['minute'] = df_test['timestamp'].dt.minute

df_test.drop('timestamp', axis=1, inplace=True)


In [20]:
df_test.columns

Index(['id', 'orgid', 'incidentid', 'alertid', 'detectorid', 'alerttitle',
       'category', 'incidentgrade', 'entitytype', 'evidencerole', 'deviceid',
       'sha256', 'ipaddress', 'url', 'accountsid', 'accountupn',
       'accountobjectid', 'accountname', 'devicename', 'networkmessageid',
       'registrykey', 'registryvaluename', 'registryvaluedata',
       'applicationid', 'applicationname', 'oauthapplicationid', 'filename',
       'folderpath', 'resourceidname', 'osfamily', 'osversion', 'countrycode',
       'state', 'city', 'usage', 'year', 'month', 'day', 'hour', 'minute'],
      dtype='object')

In [21]:
column = ['id', 'orgid', 'incidentid', 'alertid', 'detectorid', 'alerttitle',
       'category', 'entitytype', 'evidencerole', 'deviceid',
       'sha256', 'ipaddress', 'url', 'accountsid', 'accountupn',
       'accountobjectid', 'accountname', 'devicename', 'networkmessageid',
       'registrykey', 'registryvaluename', 'registryvaluedata',
       'applicationid', 'applicationname', 'oauthapplicationid', 'filename',
       'folderpath', 'resourceidname', 'osfamily', 'osversion', 'countrycode',
       'state', 'city', 'usage', 'year', 'month', 'day', 'hour', 'minute', 'incidentgrade']
df_test = df_test[column]

In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 40 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  100000 non-null  int64 
 1   orgid               100000 non-null  int64 
 2   incidentid          100000 non-null  int64 
 3   alertid             100000 non-null  int64 
 4   detectorid          100000 non-null  int64 
 5   alerttitle          100000 non-null  int64 
 6   category            100000 non-null  object
 7   entitytype          100000 non-null  object
 8   evidencerole        100000 non-null  object
 9   deviceid            100000 non-null  int64 
 10  sha256              100000 non-null  int64 
 11  ipaddress           100000 non-null  int64 
 12  url                 100000 non-null  int64 
 13  accountsid          100000 non-null  int64 
 14  accountupn          100000 non-null  int64 
 15  accountobjectid     100000 non-null  int64 
 16  acc

In [23]:
df_test['incidentgrade'].value_counts()

incidentgrade
BenignPositive    42352
TruePositive      36036
FalsePositive     21612
Name: count, dtype: int64

### class balancing

In [24]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Assuming your dataframe is called 'df' and the column is 'incidentgrade'

# Get the value counts
value_counts = df_test['incidentgrade'].value_counts()

# Find the minimum count
min_count = value_counts.min()

# Create a list to store the balanced dataframes
balanced_dfs = []

# Undersample each class
for class_value in value_counts.index:
    class_df = df_test[df_test['incidentgrade'] == class_value]

    if len(class_df) > min_count:
        # Undersample
        undersampled_df = resample(class_df,
                                   replace=False,    # sample without replacement
                                   n_samples=min_count,
                                   random_state=42)  # reproducible results
        balanced_dfs.append(undersampled_df)
    else:
        # If this class is already at or below the minimum, keep all samples
        balanced_dfs.append(class_df)

# Combine the balanced dataframes
df_balanced = pd.concat(balanced_dfs)

# Verify the new class distribution
print(df_balanced['incidentgrade'].value_counts())

incidentgrade
BenignPositive    21612
TruePositive      21612
FalsePositive     21612
Name: count, dtype: int64


In [25]:
df_balanced.head(5)

Unnamed: 0,id,orgid,incidentid,alertid,detectorid,alerttitle,category,entitytype,evidencerole,deviceid,...,countrycode,state,city,usage,year,month,day,hour,minute,incidentgrade
12498,343597387241,2,9491,6224,18,14,Exfiltration,MailMessage,Impacted,98799,...,242,1445,10630,Public,2024,6,5,21,51,BenignPositive
3848,1632087577464,614,1479,13942,569,637,Exfiltration,Machine,Impacted,98799,...,242,1445,10630,Private,2024,6,3,12,56,BenignPositive
85198,17179873653,182,12770,11179,111,90,Exfiltration,MailMessage,Impacted,98799,...,242,1445,10630,Public,2024,6,7,16,58,BenignPositive
70416,1477468751788,39,235667,446812,6,5,InitialAccess,Mailbox,Impacted,98799,...,242,1445,10630,Public,2024,6,13,6,20,BenignPositive
27974,1005022347871,43,242972,306817,172,111,CredentialAccess,User,Impacted,98799,...,242,1445,10630,Public,2024,6,10,11,59,BenignPositive


### LabelEncoder

In [26]:
categorical_column = df_test.select_dtypes(include='object').columns

le = LabelEncoder()
for col in categorical_column:
    df_test[col] = le.fit_transform(df_test[col])

X = df_test.iloc[:,:-1]
y = df_test.iloc[:,-1]

### StandardScaler

In [27]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [29]:
X

array([[ 0.81097468,  1.29198782, -0.47787476, ..., -0.91615884,
         1.44093868,  1.524479  ],
       [ 1.12478867, -0.46682096,  0.19319365, ..., -1.07728308,
        -0.0370381 ,  1.63953385],
       [ 0.88071112, -0.08493893, -0.30476344, ..., -0.27166191,
        -1.3672172 , -0.54650821],
       ...,
       [ 0.88071112, -0.29470511, -0.21358415, ..., -0.11053767,
         0.25855726, -0.54650821],
       [ 0.96788167, -0.44530647,  2.8829362 , ..., -0.59391037,
        -0.62822881, -0.66156306],
       [ 1.08992045, -0.38076303, -0.57314516, ..., -0.43278614,
         0.11075958, -0.37392595]])

### Train_Test_Split

In [36]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


### LogisticRegression

In [37]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report_dict = classification_report(y_test,y_pred)
print(report_dict)

              precision    recall  f1-score   support

           0       0.60      0.76      0.67      8471
           1       0.60      0.24      0.34      4322
           2       0.66      0.69      0.67      7207

    accuracy                           0.62     20000
   macro avg       0.62      0.56      0.56     20000
weighted avg       0.62      0.62      0.60     20000



### RandomForestClassifier

In [12]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

report = classification_report(y_test, y_pred_rf)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      8471
           1       0.94      0.83      0.88      4322
           2       0.95      0.89      0.92      7207

    accuracy                           0.91     20000
   macro avg       0.92      0.89      0.90     20000
weighted avg       0.91      0.91      0.91     20000



### XGBClassifier

In [15]:
xgb_model = XGBClassifier()

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

xgb_report = classification_report(y_test,y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.48      0.79      0.60      8471
           1       0.00      0.00      0.00      4322
           2       0.54      0.45      0.49      7207

    accuracy                           0.50     20000
   macro avg       0.34      0.41      0.36     20000
weighted avg       0.40      0.50      0.43     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### GradientBoostingClassifier

In [18]:
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

gb_report = classification_report(y_test,y_pred)
print(gb_report)

              precision    recall  f1-score   support

           0       0.48      0.79      0.60      8471
           1       0.00      0.00      0.00      4322
           2       0.54      0.45      0.49      7207

    accuracy                           0.50     20000
   macro avg       0.34      0.41      0.36     20000
weighted avg       0.40      0.50      0.43     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### parllel algoritham process

In [48]:
# lr_params = {
#     'random_state': None,
#     'multi_class': 'auto',
#     'solver': 'lbfgs',
#     'max_iter': 100
# }
rf_params = {
    'n_estimators': 200,          # Increase the number of trees in the forest
    'random_state': 42,           # Seed for reproducibility
    'max_depth': 10,              # Limit the depth of the trees to prevent overfitting
    'min_samples_split': 4,       # Increase the minimum number of samples required to split an internal node
    'min_samples_leaf': 2,        # Increase the minimum number of samples required to be at a leaf node
    'bootstrap': False,           # Disable bootstrapping (using the whole dataset for each tree)
    'max_features': 'sqrt'        # Consider the square root of features at each split (can help reduce overfitting)
}

# knn_params = {
#     'n_neighbors': 2,  # Number of neighbors to use
#     'weights': 'uniform',  # Weight function used in prediction
#     'algorithm': 'auto',  # Algorithm used to compute the nearest neighbors
#     'p': 2  # Power parameter for the Minkowski metric (2 for Euclidean distance)
# }
xgb_params = {
    'n_estimators': 100,  # Number of boosting rounds
    'max_depth': 3,  # Maximum depth of trees
    'learning_rate': 0.1,  # Step size shrinkage
    'subsample': 0.8,  # Proportion of samples to use for each tree
    'colsample_bytree': 0.8,  # Proportion of features to use for each tree
    'objective': 'multi:softmax',  # Multi-class classification objective
    'num_class': 3,  # Number of classes
    'eval_metric': 'mlogloss'  # Evaluation metric
}
gb_params = {
    'n_estimators': 100,  # Number of boosting stages to be run
    'learning_rate': 0.1,  # Learning rate shrinks the contribution of each tree
    'max_depth': 3,  # Maximum depth of the individual trees
    'subsample': 1.0,  # Proportion of samples to use for fitting each individual tree
    'criterion': 'friedman_mse'  # Function to measure the quality of a split (default is 'friedman_mse')
}
models = [#(
#     "Logistic regression",
#     LogisticRegression(**lr_params),
#     (X_train,y_train),
#     (X_test,y_test)
# ),(
    ("Random Forest",
    RandomForestClassifier(),
    (X_train,y_train),
    (X_test,y_test)
),#(
#     "KNN",
#     KNeighborsClassifier(**knn_params),
#     (X_train,y_train),
#     (X_test,y_test)
# ),(
   ( "XGBClassifier",
    XGBClassifier(),
    (X_train,y_train),
    (X_test,y_test)
),(
    "GradientBoostingClassifier",
    GradientBoostingClassifier(),
    (X_train,y_train),
    (X_test,y_test)
)]

In [49]:
reports = []
for model_name,model,train_set,test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,y_pred,output_dict=True)
    reports.append(report)

In [47]:
mlflow.set_experiment("microsoft_classification_test_class_balanced_SC_HPT")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i ,element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name",model_name)
        mlflow.log_metric("accuracy",report["accuracy"])
        mlflow.log_metric("recall_class_0",report['0']['recall'])
        mlflow.log_metric("recall_class_1",report['1']['recall'])
        mlflow.log_metric("recall_class_2",report['2']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, model_name)
        else:
            mlflow.sklearn.log_model(model, model_name)    
   

2024/09/04 16:59:19 INFO mlflow.tracking.fluent: Experiment with name 'microsoft_classification_test_class_balanced_SC_HPT' does not exist. Creating a new experiment.
2024/09/04 16:59:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/299744645742495692/runs/45eeb091791d44279ccde14271e5d6fb.
2024/09/04 16:59:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/299744645742495692.
2024/09/04 16:59:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier at: http://127.0.0.1:5000/#/experiments/299744645742495692/runs/53942e6a5024445ea7db0e888983229b.
2024/09/04 16:59:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/299744645742495692.
2024/09/04 16:59:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoostingClassifier at: http://127.0.0.1:5000/#/experiments/299744645742495692