In [1]:
import pandas as pd
df = pd.read_csv('blackhole.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404134 entries, 0 to 404133
Data columns (total 18 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   time                                 404134 non-null  float64
 1   source                               404134 non-null  int64  
 2   destination                          404134 non-null  int64  
 3   length                               404134 non-null  float64
 4   info                                 404134 non-null  float64
 5   transmission_rate_per_1000_ms        404134 non-null  float64
 6   reception_rate_per_1000_ms           404134 non-null  float64
 7   transmission_average_per_sec         404134 non-null  float64
 8   reception_average_per_sec            404134 non-null  float64
 9   transmission_count_per_sec           404134 non-null  float64
 10  reception_count_per_sec              404134 non-null  float64
 11  transmission_

In [2]:
df.head()

Unnamed: 0,time,source,destination,length,info,transmission_rate_per_1000_ms,reception_rate_per_1000_ms,transmission_average_per_sec,reception_average_per_sec,transmission_count_per_sec,reception_count_per_sec,transmission_total_duration_per_sec,reception_total_duration_per_sec,dao,dis,dio,category,label
0,0.037,39,9999,0.0,1.0,0.0,0.671176,0.0,0.499879,0.0,0.671176,0.539313,0.570032,0.0,0.0,0.0,Normal,0
1,0.037,39,9999,0.0,1.0,0.0,0.649873,0.0,0.505234,0.0,0.649873,0.264704,0.530547,0.0,0.0,0.0,Normal,0
2,0.038,39,9999,0.0,1.0,0.671176,0.652361,0.462516,0.501327,0.671768,0.652361,0.546376,1.0,0.0,0.0,0.690115,Blackhole,1
3,0.045,39,9999,0.0,1.0,0.0,0.633786,0.0,0.517346,0.0,0.634105,0.585425,0.553276,0.0,0.0,0.0,Normal,0
4,0.046,39,9999,0.0,1.0,0.0,0.630378,0.0,0.538789,0.0,0.630378,0.443171,0.615377,0.0,0.0,0.0,Normal,0


In [3]:
# Check if 'label' is in the columns
if 'label' in df.columns:
    print(df['label'].value_counts())
else:
    print("'label' column does not exist in the DataFrame.")


0    269852
1    134282
Name: label, dtype: int64


data preprocessing

**Using stadard scalar**

In [44]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
import pandas as pd

scalar = StandardScaler()

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Store column names
column_names = X.columns.tolist()

X = scalar.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

X_scaled = scalar.fit_transform(X)

# Convert the scaled data back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=column_names)

# Convert the original data to DataFrame
X_df = pd.DataFrame(X, columns=column_names)

# Display the first 10 rows of the original and scaled data side by side
comparison_df = pd.concat([X_df.head(10), X_scaled_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in column_names] + [('Scaled', col) for col in column_names])
print(comparison_df)


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
   Original                                            \
       time    source destination    length      info   
0 -1.423329 -0.464165    1.394319 -1.921765  2.136806   
1 -1.423329 -0.464165    1.394319 -1.921765  2.136806   
2 -1.423326 -0.464165    1.394319 -1.921765  2.136806   
3 -1.423300 -0.464165    1.394319 -1.921765  2.136806   
4 -1.423296 -0.464165    1.394319 -1.921765  2.136806   
5 -1.423277 -0.464165    1.394319 -1.921765  2.136806   
6 -1.423251 -0.464165    1.394319 -1.921765  2.136806   
7 -1.423240 -0.464165    1.394319 -1.921765  2.136806   
8 -1.423229 -0.464165    1.394319 -1.921765  2.136806   
9 -1.423222 -0.464165    1.394319 -1.921765  2.136806   

                                                            \
  transmission_rate_per_1000_ms reception_rate_

**Using Gaussian transformation**

In [21]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import PowerTransformer
import pandas as pd

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Apply Gaussian transformation
transformer = PowerTransformer(method='yeo-johnson')
X_transformed = transformer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())


# Convert the transformed data back to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns)

# Display the first 10 rows of the original and transformed data side by side
comparison_df = pd.concat([X.head(10), X_transformed_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in X.columns] + [('Transformed', col) for col in X.columns])
print(comparison_df)

Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
  Original                                                               \
      time source destination length info transmission_rate_per_1000_ms   
0    0.037     39        9999    0.0  1.0                      0.000000   
1    0.037     39        9999    0.0  1.0                      0.000000   
2    0.038     39        9999    0.0  1.0                      0.671176   
3    0.045     39        9999    0.0  1.0                      0.000000   
4    0.046     39        9999    0.0  1.0                      0.000000   
5    0.051     39        9999    0.0  1.0                      0.671176   
6    0.058     39        9999    0.0  1.0                      0.671176   
7    0.061     39        9999    0.0  1.0                      0.671176   
8    0.064     39        9999    0.0  1.0   

**Min max follwed by Gaussian**

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Apply Min-Max scaling
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Apply Gaussian transformation
transformer = PowerTransformer(method='yeo-johnson')
X_transformed = transformer.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Convert the transformed data back to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns)

# Display the first 10 rows of the original and transformed data side by side
comparison_df = pd.concat([X.head(10), X_transformed_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in X.columns] + [('Transformed', col) for col in X.columns])
print(comparison_df)


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
  Original                                                               \
      time source destination length info transmission_rate_per_1000_ms   
0    0.037     39        9999    0.0  1.0                      0.000000   
1    0.037     39        9999    0.0  1.0                      0.000000   
2    0.038     39        9999    0.0  1.0                      0.671176   
3    0.045     39        9999    0.0  1.0                      0.000000   
4    0.046     39        9999    0.0  1.0                      0.000000   
5    0.051     39        9999    0.0  1.0                      0.671176   
6    0.058     39        9999    0.0  1.0                      0.671176   
7    0.061     39        9999    0.0  1.0                      0.671176   
8    0.064     39        9999    0.0  1.0   

**Standardization followed by Gaussian Transformation**

In [27]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import PowerTransformer, StandardScaler
import pandas as pd

# Assuming 'df' is your DataFrame containing the data

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Apply Standardization
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Apply Gaussian transformation
transformer = PowerTransformer(method='yeo-johnson')
X_transformed = transformer.fit_transform(X_standardized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Convert the transformed data back to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns)

# Display the first 10 rows of the original and transformed data side by side
comparison_df = pd.concat([X.head(10), X_transformed_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in X.columns] + [('Transformed', col) for col in X.columns])
print(comparison_df)


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
  Original                                                               \
      time source destination length info transmission_rate_per_1000_ms   
0    0.037     39        9999    0.0  1.0                      0.000000   
1    0.037     39        9999    0.0  1.0                      0.000000   
2    0.038     39        9999    0.0  1.0                      0.671176   
3    0.045     39        9999    0.0  1.0                      0.000000   
4    0.046     39        9999    0.0  1.0                      0.000000   
5    0.051     39        9999    0.0  1.0                      0.671176   
6    0.058     39        9999    0.0  1.0                      0.671176   
7    0.061     39        9999    0.0  1.0                      0.671176   
8    0.064     39        9999    0.0  1.0   

**Robust scaling followed by Gaussian transformation**

10* 48 column and others are 10* 32

In [30]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import RobustScaler, PowerTransformer
import pandas as pd

# Assuming df is your dataframe
# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Apply Robust scaling
scaler = RobustScaler()
X_robust_scaled = scaler.fit_transform(X)

# Apply Gaussian transformation to the robust scaled data
transformer = PowerTransformer(method='yeo-johnson')
X_transformed = transformer.fit_transform(X_robust_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Convert the transformed data back to a DataFrame
X_robust_scaled_df = pd.DataFrame(X_robust_scaled, columns=X.columns)
X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns)

# Display the first 10 rows of the original, robust scaled, and transformed data side by side
comparison_df = pd.concat([X.head(10), X_robust_scaled_df.head(10), X_transformed_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in X.columns] + [('RobustScaled', col) for col in X.columns] + [('Transformed', col) for col in X.columns])
print(comparison_df)


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
  Original                                                               \
      time source destination length info transmission_rate_per_1000_ms   
0    0.037     39        9999    0.0  1.0                      0.000000   
1    0.037     39        9999    0.0  1.0                      0.000000   
2    0.038     39        9999    0.0  1.0                      0.671176   
3    0.045     39        9999    0.0  1.0                      0.000000   
4    0.046     39        9999    0.0  1.0                      0.000000   
5    0.051     39        9999    0.0  1.0                      0.671176   
6    0.058     39        9999    0.0  1.0                      0.671176   
7    0.061     39        9999    0.0  1.0                      0.671176   
8    0.064     39        9999    0.0  1.0   

**Normalization followed by Gaussian transformation**

In [35]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import PowerTransformer, Normalizer
import pandas as pd

# Assume df is your DataFrame containing the dataset

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Apply Normalization
scaler = Normalizer()
X_normalized = scaler.fit_transform(X)

# Apply Gaussian transformation on normalized data
transformer = PowerTransformer(method='yeo-johnson')
X_normalized_transformed = transformer.fit_transform(X_normalized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized_transformed, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Convert the transformed data back to a DataFrame
X_transformed_df = pd.DataFrame(X_normalized_transformed, columns=X.columns)

# Display the first 10 rows of the original and transformed data side by side
comparison_df = pd.concat([X.head(10), X_transformed_df.head(10)], axis=1)
comparison_df.columns = pd.MultiIndex.from_tuples([('Original', col) for col in X.columns] + [('Transformed', col) for col in X.columns])
print(comparison_df)


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
  Original                                                               \
      time source destination length info transmission_rate_per_1000_ms   
0    0.037     39        9999    0.0  1.0                      0.000000   
1    0.037     39        9999    0.0  1.0                      0.000000   
2    0.038     39        9999    0.0  1.0                      0.671176   
3    0.045     39        9999    0.0  1.0                      0.000000   
4    0.046     39        9999    0.0  1.0                      0.000000   
5    0.051     39        9999    0.0  1.0                      0.671176   
6    0.058     39        9999    0.0  1.0                      0.671176   
7    0.061     39        9999    0.0  1.0                      0.671176   
8    0.064     39        9999    0.0  1.0   

In [39]:
from sklearn.preprocessing import StandardScaler

# Drop the non-numeric columns if necessary
numeric_columns = ['time', 'source', 'destination', 'length', 'info', 'transmission_rate_per_1000_ms', 
                   'reception_rate_per_1000_ms', 'transmission_average_per_sec', 'reception_average_per_sec', 
                   'transmission_count_per_sec', 'reception_count_per_sec', 'transmission_total_duration_per_sec', 
                   'reception_total_duration_per_sec']

# Apply Z-score Standardization
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
import pandas as pd

# Assuming df is your DataFrame containing the dataset

# Define the columns you want to print
columns_to_print = df.columns

# Print the first 5 rows of each column
for column in columns_to_print:
    print(f"First 5 rows of column '{column}':")
    print(df[column].head())
    print("\n")


First 5 rows of column 'time':
0   -1.423329
1   -1.423329
2   -1.423326
3   -1.423300
4   -1.423296
Name: time, dtype: float64


First 5 rows of column 'source':
0   -0.464165
1   -0.464165
2   -0.464165
3   -0.464165
4   -0.464165
Name: source, dtype: float64


First 5 rows of column 'destination':
0    1.394319
1    1.394319
2    1.394319
3    1.394319
4    1.394319
Name: destination, dtype: float64


First 5 rows of column 'length':
0   -1.921765
1   -1.921765
2   -1.921765
3   -1.921765
4   -1.921765
Name: length, dtype: float64


First 5 rows of column 'info':
0    2.136806
1    2.136806
2    2.136806
3    2.136806
4    2.136806
Name: info, dtype: float64


First 5 rows of column 'transmission_rate_per_1000_ms':
0   -1.510623
1   -1.510623
2    1.117926
3   -1.510623
4   -1.510623
Name: transmission_rate_per_1000_ms, dtype: float64


First 5 rows of column 'reception_rate_per_1000_ms':
0    1.120446
1    1.038417
2    1.047998
3    0.976475
4    0.963351
Name: reception_rate_per_

**To incorporate t-Distributed Stochastic Neighbor Embedding (t-SNE) into Standard scalar**

In [4]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE  # Import t-SNE
import pandas as pd

# Assuming df is your DataFrame containing the dataset

# Instantiate StandardScaler
scalar = StandardScaler()

# Drop the 'category' column
X = df.drop(['label', 'category'], axis=1)  # Drop both the target variable and the categorical column
y = df['label']  # Target variable

# Store column names
column_names = X.columns.tolist()

# Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=23)  # Assuming 2 components for visualization
X_embedded = tsne.fit_transform(X)

# Split the embedded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.2, random_state=23)

# Check value counts of 'label' column before undersampling
print("Value counts of 'label' column before undersampling:")
print(y_train.value_counts())

# Instantiate the random sampler for undersampling
undersampler = RandomUnderSampler(random_state=42)

# Balance only the training set
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Check value counts of 'label' column after undersampling
print("\nValue counts of 'label' column after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Scale the embedded data
X_scaled = scalar.fit_transform(X_embedded)

# Convert the scaled data back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=['TSNE_Component_1', 'TSNE_Component_2'])

# Display the first 10 rows of the scaled data
print("First 10 rows of the scaled data:")
print(X_scaled_df.head(10))


Value counts of 'label' column before undersampling:
0    215842
1    107465
Name: label, dtype: int64

Value counts of 'label' column after undersampling:
0    107465
1    107465
Name: label, dtype: int64
First 10 rows of the scaled data:
   TSNE_Component_1  TSNE_Component_2
0          0.570986         -1.186515
1          0.571021         -1.186688
2          0.565037         -1.035912
3          0.570986         -1.186459
4          0.570950         -1.186703
5          0.565056         -1.035949
6          0.565074         -1.035969
7          0.565088         -1.035997
8          0.504988         -1.008883
9          0.504912         -1.009299


Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()

# Train the model
dt_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

checking Accuracy

In [6]:
# Accuracy on the training set
train_accuracy = dt_model.score(X_train_resampled, y_train_resampled)
print("Training Accuracy:", train_accuracy)

# Accuracy on the test set
test_accuracy = dt_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 1.0
Test Accuracy: 0.9604092691798533
