In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import os

In [9]:
# Load and preprocess data
data = []
labels = []
for i, folder in enumerate(['Google Search', 'Google Drive', 'Google Music', 'YouTube', 'Google Doc']):
    for file in os.listdir(f'dataset/{folder}'):
        file_data = pd.read_csv(f'dataset/{folder}/{file}', header=None, sep='\t', names=['timestamp', 'relativetime', 'packetsize', 'packetdirection'])
        data.extend(file_data.values)
        labels.extend([i] * len(file_data))

data = pd.DataFrame(data, columns=['timestamp', 'relativetime', 'packetsize', 'packetdirection'])

In [10]:
print(data.head())

      timestamp  relativetime  packetsize  packetdirection
0  1.522714e+09      0.000000       315.0              1.0
1  1.522714e+09      0.012437        74.0              0.0
2  1.522714e+09      0.032781       140.0              0.0
3  1.522714e+09      0.032784       104.0              0.0
4  1.522714e+09      0.032786       112.0              0.0


In [11]:
X = data[['relativetime', 'packetsize', 'packetdirection']]
y = labels

X

Unnamed: 0,relativetime,packetsize,packetdirection
0,0.000000,315.0,1.0
1,0.012437,74.0,0.0
2,0.032781,140.0,0.0
3,0.032784,104.0,0.0
4,0.032786,112.0,0.0
...,...,...,...
42838195,116.772000,247.0,1.0
42838196,116.792000,67.0,0.0
42838197,116.920000,239.0,0.0
42838198,116.920000,128.0,0.0


In [14]:
y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [15]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Set LightGBM parameters
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 5,
    'metric': 'multi_logloss',
    'max_depth': 10,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [17]:
# Train the LightGBM model with cross-validation
lgbm = lgb.LGBMClassifier(**lgbm_params)
scores = cross_val_score(lgbm, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean():.2f} +/- {scores.std():.2f}')

Cross-validation accuracy: 0.83 +/- 0.00


In [18]:
# Fit the model on the entire training set
lgbm.fit(X_train, y_train)



In [19]:
# Evaluate the model
y_pred = lgbm.predict(X_test)
print(f'Test accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(classification_report(y_test, y_pred))

Test accuracy: 0.83
              precision    recall  f1-score   support

           0       0.80      0.91      0.85    255362
           1       0.90      0.84      0.87   3455902
           2       0.55      0.84      0.66    839718
           3       0.85      0.80      0.83   3402444
           4       0.91      0.81      0.86    614214

    accuracy                           0.83   8567640
   macro avg       0.80      0.84      0.81   8567640
weighted avg       0.84      0.83      0.83   8567640



In [1]:
import joblib

In [2]:
# Save the trained model
joblib.dump(lgbm, 'traffic_classifier.pkl')

NameError: name 'lgbm' is not defined

In [21]:
from collections import Counter

# Load the saved model
loaded_model = joblib.load('traffic_classifier.pkl')

test_cases = ['GoogleDoc-3.txt', 'GoogleDrive-test1.txt', 'GoogleMusic-8.txt', 'GoogleSearch-12.txt', 'Youtube-20.txt']

for i in test_cases:
    
    # Load the test data
    test_data = pd.read_csv(i, header=None, sep='\t', names=['timestamp', 'relativetime', 'packetsize', 'packetdirection'])

    # Prepare the test data
    X_test = test_data[['relativetime', 'packetsize', 'packetdirection']]

    # Make predictions
    predictions = loaded_model.predict(X_test)

    # Print the predictions
    classes_result = [] 
    print('Predictions:')
    for j, pred in enumerate(predictions):
        class_name = ['Google Search', 'Google Drive', 'Google Music', 'YouTube', 'Google Docs'][pred]
        classes_result.append(class_name)
        # print(f'Sample {i}: {class_name}')
    print(f'{i} results: {class_name}\t',Counter(classes_result))

Predictions:
GoogleDoc-3.txt results: Google Docs	 Counter({'Google Docs': 1136, 'Google Drive': 280, 'YouTube': 167, 'Google Music': 37, 'Google Search': 22})
Predictions:
GoogleDrive-test1.txt results: Google Drive	 Counter({'Google Drive': 13481, 'Google Docs': 82, 'YouTube': 58, 'Google Search': 53, 'Google Music': 35})
Predictions:
GoogleMusic-8.txt results: Google Music	 Counter({'Google Music': 4184, 'YouTube': 193, 'Google Drive': 6, 'Google Docs': 4, 'Google Search': 1})
Predictions:
GoogleSearch-12.txt results: Google Docs	 Counter({'Google Drive': 428, 'YouTube': 330, 'Google Docs': 98, 'Google Search': 19, 'Google Music': 4})
Predictions:
Youtube-20.txt results: YouTube	 Counter({'YouTube': 15456, 'Google Music': 5685, 'Google Drive': 4071, 'Google Docs': 254, 'Google Search': 14})


In [16]:
print(output)

["0 results: Counter({'Google Drive': 1})", "1 results: Counter({'Google Drive': 1, 'Google Search': 1})", "2 results: Counter({'Google Drive': 1, 'Google Search': 1, 'YouTube': 1})", "3 results: Counter({'YouTube': 2, 'Google Drive': 1, 'Google Search': 1})", "4 results: Counter({'YouTube': 3, 'Google Drive': 1, 'Google Search': 1})", "5 results: Counter({'YouTube': 4, 'Google Drive': 1, 'Google Search': 1})", "6 results: Counter({'YouTube': 4, 'Google Drive': 2, 'Google Search': 1})", "7 results: Counter({'YouTube': 4, 'Google Drive': 2, 'Google Search': 1, 'Google Docs': 1})", "8 results: Counter({'YouTube': 5, 'Google Drive': 2, 'Google Search': 1, 'Google Docs': 1})", "9 results: Counter({'YouTube': 6, 'Google Drive': 2, 'Google Search': 1, 'Google Docs': 1})", "10 results: Counter({'YouTube': 6, 'Google Drive': 3, 'Google Search': 1, 'Google Docs': 1})", "11 results: Counter({'YouTube': 6, 'Google Drive': 4, 'Google Search': 1, 'Google Docs': 1})", "12 results: Counter({'YouTube'

In [30]:
from collections import Counter
import numpy as np

# Load the saved model
loaded_model = joblib.load('traffic_classifier.pkl')

test_cases = ['GoogleDoc-3.txt', 'GoogleDrive-test1.txt', 'GoogleMusic-8.txt', 'GoogleSearch-7.txt', 'Youtube-20.txt']

for i in test_cases:
    # Load the test data
    test_data = pd.read_csv(i, header=None, sep='\t', names=['timestamp', 'relativetime', 'packetsize', 'packetdirection'])
    
    # Prepare the test data
    X_test = test_data[['relativetime', 'packetsize', 'packetdirection']]
    
    # Make predictions
    predictions = loaded_model.predict(X_test)
    
    # Calculate prediction accuracy rates
    total_predictions = len(predictions)
    class_counts = Counter(predictions)
    class_accuracy_rates = {class_name: count / total_predictions for class_name, count in class_counts.items()}
    
    # Print the prediction accuracy rates
    print(f'{i} prediction accuracy rates:')
    for j, accuracy_rate in class_accuracy_rates.items():
        class_n = ['Google Search', 'Google Drive', 'Google Music', 'YouTube', 'Google Docs'][j]
        print(f'{class_n}: {accuracy_rate:.2f}')


GoogleDoc-3.txt prediction accuracy rates:
Google Docs: 0.69
Google Drive: 0.17
YouTube: 0.10
Google Search: 0.01
Google Music: 0.02
GoogleDrive-test1.txt prediction accuracy rates:
Google Drive: 0.98
Google Music: 0.00
YouTube: 0.00
Google Search: 0.00
Google Docs: 0.01
GoogleMusic-8.txt prediction accuracy rates:
Google Drive: 0.00
YouTube: 0.04
Google Music: 0.95
Google Docs: 0.00
Google Search: 0.00
GoogleSearch-7.txt prediction accuracy rates:
Google Search: 0.94
YouTube: 0.04
Google Music: 0.00
Google Docs: 0.01
Google Drive: 0.01
Youtube-20.txt prediction accuracy rates:
Google Drive: 0.16
Google Search: 0.00
YouTube: 0.61
Google Docs: 0.01
Google Music: 0.22
