In [None]:
import os
from copy import deepcopy

import joblib
from sklearn.metrics import accuracy_score
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from scipy.stats import mannwhitneyu, chi2_contingency
from tqdm.auto import tqdm
from tqdm.notebook import tqdm
import joblib
# Install beforehand (e.g. `%pip install dataframe_image`); installation cannot be part of an import statement.
import dataframe_image as dfi
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, fbeta_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate

# Raise pandas' display limits so frames with up to 100 rows / 100 columns
# are rendered without being truncated.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [None]:
# Location of the raw dataset. The original machine-specific path stays the
# default, but it can be overridden without editing the notebook by setting the
# ANDROID_MALWARE_CSV environment variable before the kernel is started.
DATA_PATH = os.environ.get(
    "ANDROID_MALWARE_CSV",
    r"C:\Users\pavan\OneDrive\Desktop\sem2\ML\Project\Android_Malware.csv",
)

# low_memory=False: parse the file in one pass rather than in chunks, which
# avoids mixed-type inference within a column.
df = pd.read_csv(DATA_PATH, low_memory=False)

In [None]:
# List the column names exactly as they were read from the CSV.
df.columns

# Inspection and Dropping Columns, Missing Values, Duplicates

In [None]:
# Remove leading/trailing whitespace from every column name so that later
# look-ups by name use the clean labels.
df.columns = df.columns.str.strip()


In [None]:
# Column names after whitespace stripping.
df.columns

In [None]:
# Look at the three columns found at positions 56, 58 and 63.
# NOTE(review): positional selection depends on the CSV's column order —
# confirm these positions still point at the intended columns.
df.iloc[:,[56,58,63]]

In [None]:
# Frequency of each distinct value in the column at position 58.
df.iloc[:,58].value_counts()

In [None]:
# Frequency of each distinct value in the column at position 56.
df.iloc[:,56].value_counts()

In [None]:
# Frequency of each distinct value in the column at position 63.
df.iloc[:,63].value_counts()

In [None]:
# Show the rows whose value in the column at position 58 is the string 'BENIGN'.
df.loc[df.iloc[:,58] == 'BENIGN']

In [None]:
# Flag / bulk-rate columns whose value distribution is inspected here.
columns = [
    'CWE Flag Count',
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'ECE Flag Count',
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'RST Flag Count',
]

# Print one frequency table per listed column.
for column in columns:
    unique_counts = df.loc[:, column].value_counts()
    print(f"Unique values and their counts for {column}:\n{unique_counts}\n")

In [None]:

# Remove the unnamed CSV index column, the flag / bulk-rate columns and the
# 'Timestamp' / 'Flow ID' columns, then discard the single row labelled 276556.
# NOTE(review): the reason row 276556 is removed is not visible here —
# presumably a malformed record; confirm.
# This cell cannot be run twice: the second run raises KeyError because the
# labels are already gone.
df = (
    df
    .drop(columns=[
        'Unnamed: 0',
        'CWE Flag Count',
        'Fwd Avg Bytes/Bulk',
        'Fwd Avg Packets/Bulk',
        'Fwd Avg Bulk Rate',
        'Bwd Avg Bytes/Bulk',
        'Bwd Avg Packets/Bulk',
        'Bwd Avg Bulk Rate',
        'ECE Flag Count',
        'Bwd PSH Flags',
        'Fwd URG Flags',
        'Bwd URG Flags',
        'RST Flag Count',
        'Timestamp',
        'Flow ID',
    ])
    .drop(index=276556)
)

In [None]:

# Report the frame's dimensions before missing values are handled.
print("Original DataFrame:", df.shape, sep="\n")

# Every row that has at least one missing value in any column.
null_rows = df.loc[df.isna().any(axis=1)]

In [None]:

# Index labels of the rows that contain missing values.
print("Rows to be dropped:", null_rows.index, sep="\n")


In [None]:
# Dimensions before rows with missing values and duplicates are removed.
df.shape

In [None]:
# Discard rows with any missing value, then exact duplicate rows.
df = df.dropna().drop_duplicates()

In [None]:
# Dimensions after rows with missing values and duplicates were removed.
df.shape

In [None]:

# Report the dimensions of the cleaned frame.
print("Resulting DataFrame:", df.shape, sep="\n")

In [None]:
# Column dtypes and non-null counts after cleaning.
df.info()

In [None]:

# Columns that are parsed into numeric dtypes with pd.to_numeric.
numeric_like_cols = [
    'Protocol', 'Fwd PSH Flags', 'FIN Flag Count', 'SYN Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio',
]
df[numeric_like_cols] = df[numeric_like_cols].apply(pd.to_numeric)


In [None]:
# Check the dtypes after the numeric conversion.
df.info()

In [None]:

# Cast the protocol / flag columns and 'Down/Up Ratio' to integers.
# NOTE(review): astype(int) truncates any fractional part — confirm that
# 'Down/Up Ratio' only holds whole numbers.
int_cols = [
    'Protocol', 'Fwd PSH Flags', 'FIN Flag Count', 'SYN Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio',
]
df[int_cols] = df[int_cols].astype(int)

In [None]:
# Check the dtypes after the integer cast.
df.info()

In [None]:
# Store the protocol / flag columns with object dtype so that later
# dtype-based selection (select_dtypes(include='object')) picks them up as
# categorical features rather than numeric ones.
categorical_like_cols = [
    'Protocol', 'Fwd PSH Flags', 'FIN Flag Count', 'SYN Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
]
df[categorical_like_cols] = df[categorical_like_cols].astype('object')

In [None]:
# Check the dtypes after the cast to object.
df.info()

In [None]:
# First row, before the IP columns are encoded.
df.head(1)

In [None]:

def ip_to_decimal(ip):
    """Convert a dotted-quad IPv4 string to its 32-bit unsigned integer value.

    Parameters
    ----------
    ip : str
        Address in the form ``'a.b.c.d'`` where every part is an integer
        between 0 and 255.

    Returns
    -------
    int
        ``a * 256**3 + b * 256**2 + c * 256 + d``.

    Raises
    ------
    ValueError
        If the string does not consist of exactly four dot-separated parts,
        a part is not an integer, or a part lies outside 0..255. Without
        these checks an out-of-range or surplus octet would silently yield
        a wrong number.
    """
    parts = ip.split('.')
    if len(parts) != 4:
        raise ValueError(f"Expected 4 dot-separated octets, got {ip!r}")

    value = 0
    for part in parts:
        octet = int(part)  # raises ValueError for non-numeric text
        if not 0 <= octet <= 255:
            raise ValueError(f"Octet {octet} out of range 0-255 in {ip!r}")
        # Shift the accumulated address one byte left and append this octet.
        value = (value << 8) | octet
    return value

In [None]:

# Add an integer encoding of the source and destination IP addresses.
for side in ('Source', 'Destination'):
    df[f'{side} IP Decimal'] = df[f'{side} IP'].apply(ip_to_decimal)

In [None]:
# First row, now including the two '... IP Decimal' columns.
df.head(1)

In [None]:

# Binary target: 1 when the label starts with 'Android_', otherwise 0.
# (Later plots label 1 as 'Malware' and 0 as 'Benign'.)
df['Target'] = df['Label'].apply(lambda x: 1 if x.startswith('Android_') else 0)

In [None]:
# Class balance of the target, as fractions of all rows.
df.Target.value_counts(normalize=True)

In [None]:

# The raw IP strings and the text label are dropped; their derived columns
# ('Source IP Decimal', 'Destination IP Decimal', 'Target') remain.
df = df.drop(columns=['Source IP', 'Destination IP','Label'] )

In [None]:
# First four rows of the prepared frame.
df.head(4)

# Feature Selection

In [None]:
# Histogram grid of the numeric columns on a 20x20 inch figure.
df.hist(figsize = (20,20))
plt.tight_layout()
plt.show()

In [None]:
# Dtype overview before feature selection.
df.info()

In [None]:

# Names of all numeric columns (this includes 'Target').
numeric_cols = df.select_dtypes(include='number').columns

In [None]:

# Empty result tables for the numeric-feature screening: one for features to
# keep and one for features to exclude.
keep_num_df = pd.DataFrame(columns=['Feature', 'Statistic', 'p-value'])
exclude_num_df = pd.DataFrame(columns=['Feature', 'Statistic', 'p-value'])


In [None]:

# Screen every numeric feature with a Mann-Whitney U test (malware vs. benign
# rows). Features with p < 0.05 are kept, the rest are excluded.
#
# The result tables are rebuilt from scratch on every run, so re-running this
# cell does not append duplicate rows, and the rows are collected in plain
# lists instead of growing a DataFrame with pd.concat inside the loop.
# NOTE(review): the test uses all rows, i.e. it runs before the train/test
# split — confirm this is acceptable for the evaluation.

# Exclude the target by name rather than by assuming it is the last column.
numeric_features = [col for col in numeric_cols if col != 'Target']

# The class masks do not depend on the feature, so compute them once.
is_malware = df['Target'] == 1
is_benign = df['Target'] == 0

keep_records, exclude_records = [], []
for col in numeric_features:
    malware = df.loc[is_malware, col]
    benign = df.loc[is_benign, col]
    stat, p = mannwhitneyu(malware, benign)
    record = {'Feature': col, 'Statistic': stat, 'p-value': np.round(p, 3)}
    if p < 0.05:
        keep_records.append(record)
    else:
        exclude_records.append(record)

result_columns = ['Feature', 'Statistic', 'p-value']
keep_num_df = pd.DataFrame(keep_records, columns=result_columns)
exclude_num_df = pd.DataFrame(exclude_records, columns=result_columns)

In [None]:

# Show both result tables of the numeric-feature screening.
print(
    "Features to keep:",
    keep_num_df,
    "\nFeatures to exclude:",
    exclude_num_df,
    sep="\n",
)

In [None]:
# Number of kept / excluded numeric features (rows) and table width.
print(keep_num_df.shape, exclude_num_df.shape, sep="\n")

In [None]:

# Names of all object-dtype columns; these are treated as categorical features.
object_cols = df.select_dtypes(include='object').columns

In [None]:

# Empty result tables for the categorical-feature screening: one for features
# to keep and one for features to exclude.
keep_cat_df = pd.DataFrame(columns=['Feature', 'Chi2 Statistic', 'p-value'])
exclude_cat_df = pd.DataFrame(columns=['Feature', 'Chi2 Statistic', 'p-value'])

In [None]:

# Screen every categorical (object-dtype) feature with a chi-square test of
# independence against 'Target'. Features with p < 0.05 are kept.
#
# The result tables are rebuilt from scratch on every run, so re-running this
# cell does not append duplicate rows, and the rows are collected in plain
# lists instead of growing a DataFrame with pd.concat inside the loop.
keep_records, exclude_records = [], []
for col in object_cols:
    contingency_table = pd.crosstab(df[col], df['Target'])
    chi2, p, dof, _ = chi2_contingency(contingency_table)
    if p < 0.05:
        # Kept features report the p-value with 5 decimals ...
        keep_records.append({'Feature': col, 'Chi2 Statistic': chi2, 'p-value': np.round(p, 5)})
    else:
        # ... excluded ones with 3 decimals.
        exclude_records.append({'Feature': col, 'Chi2 Statistic': chi2, 'p-value': np.round(p, 3)})

result_columns = ['Feature', 'Chi2 Statistic', 'p-value']
keep_cat_df = pd.DataFrame(keep_records, columns=result_columns)
exclude_cat_df = pd.DataFrame(exclude_records, columns=result_columns)

In [None]:
# Show both result tables of the categorical-feature screening.
print(
    "Features to keep:",
    keep_cat_df,
    "\nFeatures to exclude:",
    exclude_cat_df,
    sep="\n",
)

In [None]:

# Report every int / float / object column in which a single value accounts
# for more than half of the rows.
for col in df.select_dtypes(include=['int', 'float', 'object']).columns:
    top_value = df[col].value_counts(normalize=True).nlargest(1)
    if not top_value.iloc[0] > 0.5:
        continue
    print(f'Column {col}:\n\t top value = {top_value.index[0]}\n\t frequency = {top_value.iloc[0]:.2f}\n\t type: {df[col].dtype}\n')
        

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns, one row per column.
df.describe(include='object').T

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Final dtype overview before modelling.
df.info()

In [None]:
# First rows of the frame that goes into modelling.
df.head()

In [None]:
# Note: the final data still has to be normalized; scaling is applied by the preprocessing pipelines below.

# Preprocessing - Training - Evaluation

In [None]:
# Features that passed the screening: numeric ones first, then categorical ones.
keep_features = [*keep_num_df.Feature, *keep_cat_df.Feature]

In [None]:
# Features rejected by the screening: categorical ones first, then numeric ones.
ignore_features = [*exclude_cat_df.Feature, *exclude_num_df.Feature]

In [None]:
# Feature matrix (only the features kept by the screening) and target vector.
targ_var = 'Target'
X = df[keep_features]
y = df[targ_var]

In [None]:
# Split the data into training and testing sets
# Hold out 20 % of the rows as a test set. Stratifying on y keeps the class
# ratio in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Dtypes of the selected features.
df[keep_features].info()

In [None]:
# Dtype and non-null count of the target column.
df[targ_var].info()

In [None]:

# Column groups handed to the ColumnTransformer: `categorical_cols` are
# one-hot encoded, `numeric_cols` are scaled.
# NOTE(review): both lists are hard-coded — presumably copied from the
# screening output; confirm they match `keep_features`, because any selected
# column missing from both lists is passed through untransformed.
# NOTE(review): `numeric_cols` re-uses the name of the Index of numeric
# columns built during feature selection and replaces it with this list.
categorical_cols = ['Protocol', 'PSH Flag Count', 'ACK Flag Count']
numeric_cols = ['Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Fwd Packet Length Max',
                'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Mean',
                'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
                'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Max', 'Fwd Header Length', 'Bwd Header Length',
                'Min Packet Length', 'Packet Length Std', 'Packet Length Variance', 'Down/Up Ratio', 'Avg Fwd Segment Size',
                'Avg Bwd Segment Size', 'Fwd Header Length.1', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets',
                'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
                'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Max', 'Idle Min',
                'Source IP Decimal', 'Destination IP Decimal']

In [None]:

# Candidate models as (display name, estimator) pairs; every estimator uses
# the same fixed seed.
classifiers = [
    ('RandomForestClassifier', RandomForestClassifier(random_state=42)),
    ('XGBClassifier', XGBClassifier(random_state=42)),
    ('LGBMClassifier', LGBMClassifier(random_state=42))
]


In [None]:
# Candidate preprocessors as (display name, ColumnTransformer) pairs. They
# differ only in the scaler applied to the numeric columns; the categorical
# columns are always one-hot encoded (categories unseen during fit are
# ignored) and all remaining columns are passed through unchanged.
preprocessors = [
    (
        type(scaler).__name__,
        ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
                ('num', scaler, numeric_cols),
            ],
            remainder='passthrough',
        ),
    )
    for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler())
]

# Candidate resampling strategies as (display name, sampler) pairs.
samplers = [('SMOTE', SMOTE(random_state=42))]


In [None]:

# State of the model search: the best pipeline found so far, its score, and
# one result record per evaluated candidate.
best_pipeline = None
best_score = -np.inf
results = []

In [None]:

# Two-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

In [None]:

# Metrics collected for every candidate pipeline during cross-validation.
# F-beta (beta=2), precision, recall and F1 are computed as weighted averages
# over the classes.
# accuracy_score has no `average` parameter, so its scorer is created without
# it; passing average='weighted' makes that scorer fail as soon as it is called.
scoring = {
    'fbeta': make_scorer(fbeta_score, beta=2, average='weighted'),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted'),
    'accuracy': make_scorer(accuracy_score)
}


In [None]:

# Evaluate every classifier x preprocessor x sampler combination with
# cross-validation on the training split.
for clf_name, classifier in tqdm(classifiers, desc='Classifier', leave=False):
    for pre_name, preprocessor in tqdm(preprocessors, desc='Preprocessor', leave=False):
        for samp_name, sampler in tqdm(samplers, desc='Sampler', leave=False):
            # Chain preprocessing -> resampling -> model into one pipeline.
            pipeline = make_pipeline(preprocessor, sampler, classifier)

            # Score the pipeline on all metrics defined in `scoring`.
            cv_scores = cross_validate(pipeline, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)

            # Average each metric over the folds once and reuse the values.
            fold_means = {metric: np.mean(cv_scores[f'test_{metric}']) for metric in scoring}

            results.append({
                'Classifier': clf_name,
                'Preprocessor': pre_name,
                'Sampler': samp_name,
                'CV f2 Score': fold_means['fbeta'],
                'CV F1': fold_means['f1'],
                'CV Recall': fold_means['recall'],
                'CV Precision': fold_means['precision'],
                'CV Accuracy': fold_means['accuracy'],
            })

            # Remember the pipeline with the highest mean F2 score
            # (the 'fbeta' scorer uses beta=2).
            if fold_means['fbeta'] > best_score:
                best_score = fold_means['fbeta']
                best_pipeline = pipeline
        


In [None]:

# Fit the selected pipeline on the whole training split.
best_pipeline.fit(X_train, y_train)


# Persist the fitted pipeline to disk.
joblib.dump(best_pipeline, 'best_pipeline.pkl')


# One row per evaluated candidate, with its mean cross-validation scores.
results_df = pd.DataFrame(results)

display(results_df)

In [None]:
from imblearn.metrics import classification_report_imbalanced

# `best_pipeline` is the pipeline chosen by the model search.
# NOTE(review): it was already fitted on the same data when it was saved;
# this second fit looks redundant — confirm before removing.
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)

# Per-class report on the held-out test set.
class_report_imbalanced = classification_report_imbalanced(y_test, y_pred)
print(class_report_imbalanced)


In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated F2 score of a random-forest pipeline.

    The preprocessing and resampling steps are taken from the global
    ``best_pipeline``; only the RandomForestClassifier hyper-parameters are
    sampled from ``trial``. The assembled (unfitted) pipeline is attached to
    the trial as user attribute ``"pipeline"`` so it can be retrieved later.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean weighted F-beta (beta=2) score over the folds of the global ``cv``.
    """
    # Steps are looked up by the names make_pipeline generated for them.
    steps = best_pipeline.named_steps
    preprocessor, sampler = steps['columntransformer'], steps['smote']

    # Search space for the random forest.
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 25, 34),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 1),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }
    classifier = RandomForestClassifier(**rf_params, random_state=42, verbose=1, n_jobs=-1)

    pipeline = make_pipeline(preprocessor, sampler, classifier)

    f2_scorer = make_scorer(fbeta_score, beta=2, average='weighted')
    cv_scores = cross_validate(pipeline, X_train, y_train, scoring=f2_scorer, cv=cv, n_jobs=-1)

    # Keep the pipeline with the trial so the best one can be fetched afterwards.
    trial.set_user_attr("pipeline", pipeline)

    return np.mean(cv_scores['test_score'])


# Maximise the objective over 10 trials. The sampler is seeded so that the
# same hyper-parameter candidates are proposed on every run of the notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)


# The pipeline built in the best trial was attached to it by `objective`.
best_trial = study.best_trial
best_pipeline_optimized = best_trial.user_attrs["pipeline"]

print("Best hyperparameters:", best_trial.params)
print("Best F2 score:", best_trial.value)

# Fit the tuned pipeline on the entire training set
best_pipeline_optimized.fit(X_train, y_train)

# Save the fitted, tuned pipeline to a pickle file
joblib.dump(best_pipeline_optimized, 'best_pipeline_optimized.pkl')

print('Done')

In [None]:

# Pull the random-forest step out of the baseline and the tuned pipeline.
# NOTE(review): the step 'randomforestclassifier' only exists in
# `best_pipeline` if the model search selected the random forest — confirm.
rfc_base = best_pipeline.named_steps['randomforestclassifier']
rfc_optimized = best_pipeline_optimized.named_steps['randomforestclassifier']


In [None]:
# Step names of both in-memory pipelines (the objects that were saved to the .pkl files).
print("Steps in best_pipeline.pkl:", best_pipeline.named_steps.keys())
print("Steps in best_pipeline_optimized.pkl:", best_pipeline_optimized.named_steps.keys())


In [None]:
from imblearn.metrics import classification_report_imbalanced


# Refit the baseline pipeline and predict the held-out test set.
# NOTE(review): the tuned pipeline shares its preprocessor and sampler objects
# with `best_pipeline`; this refit presumably restores the baseline state
# after tuning — confirm.
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)


# Per-class report of the baseline pipeline on the test set.
class_report_imbalanced = classification_report_imbalanced(y_test, y_pred)
print(class_report_imbalanced)

In [None]:

# Hyper-parameters of the baseline forest. max_depth is read from the fitted
# trees (deepest tree in the ensemble) rather than from the constructor argument.
n_estimators_base = rfc_base.n_estimators
min_samples_split_base = rfc_base.min_samples_split
min_samples_leaf_base = rfc_base.min_samples_leaf
max_features_base = rfc_base.max_features
max_depth_base = max([tree.tree_.max_depth for tree in rfc_base.estimators_])


# Same attributes for the tuned forest.
n_estimators_optimized = rfc_optimized.n_estimators
min_samples_split_optimized = rfc_optimized.min_samples_split
min_samples_leaf_optimized = rfc_optimized.min_samples_leaf
max_features_optimized = rfc_optimized.max_features
max_depth_optimized = max([tree.tree_.max_depth for tree in rfc_optimized.estimators_])

# Find the row of the best pipeline in the results DataFrame by matching its
# score exactly (the first match is used).
best_pipeline_index = results_df.loc[results_df['CV f2 Score'] == best_score].index[0]


# Mean cross-validated F2 score of the baseline pipeline and the best F2
# score reached during hyper-parameter tuning.
f2_score_base = results_df.loc[best_pipeline_index, 'CV f2 Score']
f2_score_optimized = best_trial.value


# One column per attribute, values ordered as [baseline, optimized].
attributes_dict = {
    'f2_score_training': [f2_score_base, f2_score_optimized],
    'max_depth': [max_depth_base, max_depth_optimized],
    'n_estimators': [n_estimators_base, n_estimators_optimized],
    'min_samples_split': [min_samples_split_base, min_samples_split_optimized],
    'min_samples_leaf': [min_samples_leaf_base, min_samples_leaf_optimized],
    'max_features': [max_features_base, max_features_optimized],
}

# Convert the dictionary into a DataFrame with one row per model.
comparison_df = pd.DataFrame(attributes_dict, index=['rfc_base', 'rfc_optimized'])

# Display the DataFrame
comparison_df

In [None]:
# Results DataFrame after hyperparameter tuning
results_after_tuning = {
    'Classifier': ['RandomForestClassifier'],
    'Preprocessor': ['Optimized'],
    'Sampler': ['Optimized'],
    'CV f2 Score': [best_trial.value],
    'CV F1': [np.nan],  # Replace with actual F1 score after tuning
    'CV Recall': [np.nan],  # Replace with actual recall after tuning
    'CV Precision': [np.nan],  # Replace with actual precision after tuning
    'CV Accuracy': [np.nan],  # Replace with actual accuracy after tuning
}

In [None]:
# Results DataFrame before hyperparameter tuning
results_df_before_tuning = pd.DataFrame(results)

# Print or display the results before tuning
print("Results Before Hyperparameter Tuning:")
display(results_df_before_tuning)

# ... (rest of your code)

# Results DataFrame after hyperparameter tuning
results_after_tuning = {
    'Classifier': ['RandomForestClassifier'],
    'Preprocessor': ['Optimized'],
    'Sampler': ['Optimized'],
    'CV f2 Score': [best_trial.value],
    'CV F1': [np.nan],  # Replace with actual F1 score after tuning
    'CV Recall': [np.nan],  # Replace with actual recall after tuning
    'CV Precision': [np.nan],  # Replace with actual precision after tuning
    'CV Accuracy': [np.nan],  # Replace with actual accuracy after tuning
}

results_df_after_tuning = pd.DataFrame(results_after_tuning)

# Compare the results before and after tuning
comparison_df = pd.concat([results_df_before_tuning, results_df_after_tuning], ignore_index=True)

# Display the comparison DataFrame
print("Comparison of Results Before and After Hyperparameter Tuning:")
display(comparison_df)


In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features and predicts labels at random according
# to the class distribution it was fitted on.
dummy_clf = DummyClassifier(strategy='stratified', random_state=42)

# Transform X_train and X_test with the fitted preprocessor of best_pipeline_optimized
preprocessor = best_pipeline_optimized.named_steps['columntransformer']
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Resample the transformed training data with the pipeline's SMOTE step
sampler = best_pipeline_optimized.named_steps['smote']
X_train_preprocessed_resampled, y_train_resampled = sampler.fit_resample(X_train_preprocessed, y_train)

# Fit the DummyClassifier on the resampled training data and predict the test set
dummy_clf.fit(X_train_preprocessed_resampled, y_train_resampled)
y_pred_dummy = dummy_clf.predict(X_test_preprocessed)

# Predict the test set with the tuned pipeline (raw features; the pipeline transforms them itself)
y_pred_optimized = best_pipeline_optimized.predict(X_test)

# Weighted F2 scores on the test set.
# Note: this reassigns `f2_score_optimized`, which previously held the
# cross-validation score of the best trial.
f2_score_dummy = fbeta_score(y_test, y_pred_dummy, beta=2, average='weighted')
f2_score_optimized = fbeta_score(y_test, y_pred_optimized, beta=2, average='weighted')

# Create a DataFrame with the F2 scores
f2_test_scores_df = pd.DataFrame({
    'Classifier': ['Dummy Classifier', 'rfc_optimized'],
    'F2 Score': [f2_score_dummy, f2_score_optimized]
})

# Display the DataFrame
display(f2_test_scores_df)


In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_curve, average_precision_score


def display_values(ax, cm, fontsize=16):
    """Write every entry of ``cm`` as a percentage into the matching cell of ``ax``.

    Parameters
    ----------
    ax : object with a matplotlib-style ``text(x, y, s, **kwargs)`` method
        Axes on which the matrix is drawn.
    cm : 2-D array
        Matrix of fractions; entry ``cm[i, j]`` is written at x=j, y=i as
        ``cm[i, j] * 100`` with one decimal and a trailing '%'.
    fontsize : int, default 16
        Font size of the labels.
    """
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    text_style = dict(ha='center', va='center', color='black', fontsize=fontsize)
    cells = ((row, col) for row in range(n_rows) for col in range(n_cols))
    for row, col in cells:
        label = f'{cm[row, col] * 100:.1f}%'
        # Columns run along the x axis, rows along the y axis.
        ax.text(col, row, label, **text_style)

# Confusion matrices as fractions of all test samples, with class order [1, 0].
# They are transposed so that rows hold the predicted class and columns the
# actual class.
cm_dummy = confusion_matrix(y_test, y_pred_dummy, normalize='all', labels=[1, 0]).T
cm_optimized = confusion_matrix(y_test, y_pred_optimized, normalize='all', labels=[1, 0]).T

# One panel per model, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
fig.suptitle("Comparing Baseline with Model", fontsize=25, x=.5, y=1.05)

# Shared colour scale from 0 to 1 for both panels.
cmap = plt.get_cmap('Spectral')
norm = plt.Normalize(vmin=0, vmax=1)

class_labels = ['Malware (1)', 'Benign (0)']

# (axes, matrix, title, x position of the highlight frame) for each panel.
panels = [
    (ax1, cm_dummy, 'Dummy Classifier Confusion Matrix', -.485),
    (ax2, cm_optimized, 'Optimized Pipeline Confusion Matrix', -.48),
]

images = []
for ax, cm, title, frame_x in panels:
    images.append(ax.imshow(cm, cmap=cmap, norm=norm))
    ax.set_title(title)
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    # Put the 'Actual' axis label and tick labels at the top.
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both')
    ax.xaxis.set_label_position('top')
    ax.set_xlabel('Actual')
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)
    ax.set_ylabel('Predicted')
    # Frame the bottom-left cell (actual malware, predicted benign).
    ax.add_patch(Rectangle((frame_x, .5), .98, 1, fill=False, edgecolor='black', lw=4))
    ax.annotate('False Negative', xy=(0.02, .9), xycoords='data', fontsize=12, ha='center', va='bottom', color='black')
    ax.annotate('False Positive', xy=(1, -.1), xycoords='data', fontsize=12, ha='center', va='bottom', color='black')

# The colour bar is driven by the first (dummy classifier) image.
im = images[0]

# Write the percentages into the cells.
display_values(ax1, cm_dummy)
display_values(ax2, cm_optimized)

plt.subplots_adjust(wspace=0.3)

# Dedicated axes on the right edge for the shared colour bar.
cbar_ax = fig.add_axes([0.93, 0.15, 0.03, 0.7])
fig.colorbar(im, cax=cbar_ax, format='%.2f')
plt.savefig('confusion_matrices.png')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Class probabilities on the test set. The dummy classifier takes the
# already transformed features, the tuned pipeline takes the raw ones.
y_pred_dummy_prob = dummy_clf.predict_proba(X_test_preprocessed)
y_pred_optimized_prob = best_pipeline_optimized.predict_proba(X_test)

# Precision-recall curves from the probability of class 1 (second column)
precision_dummy, recall_dummy, _ = precision_recall_curve(y_test, y_pred_dummy_prob[:, 1])
precision_optimized, recall_optimized, _ = precision_recall_curve(y_test, y_pred_optimized_prob[:, 1])

# F2 score of the baseline classifier.
# NOTE(review): this value is not used anywhere in this cell, and it relies on
# fbeta_score's default averaging whereas the other F2 scores in this notebook
# pass average='weighted' — confirm whether it is still needed.
f2_score_baseline = fbeta_score(y_test, y_pred_dummy, beta=2)

# Average Precision (AP) summarises each curve in one number
ap_dummy = average_precision_score(y_test, y_pred_dummy_prob[:, 1])
ap_optimized = average_precision_score(y_test, y_pred_optimized_prob[:, 1])

# Plot both Precision-Recall curves; the baseline is drawn dashed
plt.figure(figsize=(8, 6))
plt.plot(recall_dummy, precision_dummy, label=f'Dummy Classifier (AP = {ap_dummy:.2f})', linestyle='--')
plt.plot(recall_optimized, precision_optimized, label=f'Optimized Model (AP = {ap_optimized:.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='best')
plt.grid()
plt.savefig('precision_recall_curve.png')
plt.show()

In [None]:
# Get the feature importances from the Random Forest model
rfc = best_pipeline_optimized.named_steps['randomforestclassifier']
feature_importances = rfc.feature_importances_

# Feature names as produced by the fitted preprocessor (one name per output column)
preprocessor_feature_importance = best_pipeline_optimized.named_steps['columntransformer']
feature_names = preprocessor_feature_importance.get_feature_names_out()

# Pair each output feature name with its importance
importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Most important features first
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Bar chart of the 10 most important features; the 'num__' prefix that the
# ColumnTransformer adds to numeric columns is removed from the labels
top_n = 10
plt.figure(figsize=(12, 8))
plt.bar(importances_df['Feature'].str.replace('num__','')[:top_n], importances_df['Importance'][:top_n])
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Top 10 Most Important Features')
plt.xticks(rotation=45)

# Explanatory note on Flow IAT, placed inside the axes
annotation = ("Flow IAT (Inter-Arrival Time) measures the time between the arrival of data packets in a network.\n")
plt.annotate(annotation, xy=(0.2, 0.85), xycoords='axes fraction', fontsize=10, ha='left', va='bottom', wrap=True)

plt.subplots_adjust(bottom=0.2)  # Leave room below the axes for the rotated tick labels
plt.savefig('important_features.png')

plt.show()

In [None]:
# Create a new column with the desired class labels
# Human-readable class label for the plot legend.
# Note: this adds a 'Class' column to `df` itself.
df['Class'] = df['Target'].map({0: 'Benign', 1: 'Malware'})

# Scatter plot of the two Flow IAT features over all rows, coloured by the actual class
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Flow IAT Max', y='Flow IAT Min', hue='Class', palette='viridis', alpha=0.7)
plt.title("Most Important Features (Actual Labels)")
plt.savefig('best_features_original.png')
plt.show()

In [None]:
# Get the feature indices for Flow IAT Max and Flow IAT Min from the preprocessor
# Positions of the two Flow IAT features among the preprocessor's output columns
flow_iat_max_index = list(feature_names).index('num__Flow IAT Max')
flow_iat_min_index = list(feature_names).index('num__Flow IAT Min')

# Take both features from the transformed test data.
# Note: these are the scaled values produced by the preprocessor, not the raw ones.
flow_iat_max_preprocessed = X_test_preprocessed[:, flow_iat_max_index]
flow_iat_min_preprocessed = X_test_preprocessed[:, flow_iat_min_index]

# Combine both features with the labels predicted by the tuned pipeline
flow_iat_predicted_df = pd.DataFrame({
    'Flow IAT Max': flow_iat_max_preprocessed,
    'Flow IAT Min': flow_iat_min_preprocessed,
    'Predicted Label': y_pred_optimized
})

# Scatter plot of the test rows, coloured by the predicted label
plt.figure(figsize=(10, 6))
sns.scatterplot(data=flow_iat_predicted_df, x='Flow IAT Max', y='Flow IAT Min', hue='Predicted Label', palette='viridis', alpha=0.7)
plt.title("Scatterplot of 'Flow IAT Max' vs 'Flow IAT Min' (Predicted Labels)")
plt.show()

In [None]:
# Dark-themed preview of the first 10 rows and first 10 columns, with the
# index column hidden.
original_df = df.head(10).iloc[:, :10].style.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(odd)', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#303030'), ('color', '#f0f0f0')]},
    ]).hide(axis="index")

In [None]:
# Model-search results ordered by F2 score (best first), dark-themed, with a
# colour gradient on the F2 column and the index hidden.
model_selection_df = results_df.sort_values(by='CV f2 Score',ascending=False)
model_selection_df = model_selection_df.style.background_gradient(subset='CV f2 Score', cmap='viridis').set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(odd)', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#303030'), ('color', '#f0f0f0')]},
    ]).hide(axis="index")
model_selection_df

In [None]:
# Dark-themed baseline-vs-tuned table with a colour gradient on 'max_depth'
# and 'n_estimators'.
# NOTE(review): `comparison_df` must be the hyper-parameter table indexed by
# 'rfc_base' / 'rfc_optimized'; a frame without the 'max_depth' and
# 'n_estimators' columns makes this cell fail — confirm that no other cell
# has reassigned the name.
base_tune_model_comparison_df = comparison_df.style.background_gradient(subset=['max_depth','n_estimators'], cmap='viridis').set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(odd)', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#303030'), ('color', '#f0f0f0')]},
    ])
base_tune_model_comparison_df

In [None]:
# Dark-themed test-set F2 table with the index hidden; the highest F2 score is
# highlighted in green.
f2_test_scores_df.style.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(odd)', 'props': [('background-color', '#424242'), ('color', '#f0f0f0')]},
        {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#303030'), ('color', '#f0f0f0')]},
    ]).hide(axis="index").highlight_max(subset=['F2 Score'], color='#78C257')

In [None]:
# Predict the label of a single unseen sample.
# Replace the selection below with the row(s) you actually want to classify.
#
# Slicing with a list ([[0]]) returns a one-row DataFrame directly and keeps
# the column dtypes, instead of going through an object-dtype Series.
# Re-selecting `keep_features` guarantees the column set and order used in training.
new_data = X_test.iloc[[0]][keep_features]

# The pipeline already contains the fitted ColumnTransformer, so it must be
# given the raw features. Transforming them first and then calling the
# pipeline would run the preprocessing twice, and the second pass fails
# because the transformed array has no column names.
predicted_target = best_pipeline_optimized.predict(new_data)

# Map the numeric prediction of the (only) row to its label
predicted_label = 'Malware' if predicted_target[0] == 1 else 'Benign'

# Display the result
print(f"The predicted label for the given row is: {predicted_label}")


In [None]:
# Diagnostic: rebuild a one-row frame from the first test row (taken as a
# Series) so that its missing values and dtypes can be inspected below.
f_row=X_test.iloc[0, :]
new_data = pd.DataFrame([f_row])

In [None]:
# Missing values per column in the one-row frame.
new_data.isnull().sum()

In [None]:
# Column dtypes of the one-row frame.
print(new_data.dtypes)