Importando as bibliotecas principais:

In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score

Instalando o gdown para baixar arquivos do Google Drive (neste caso, vamos fazer o download do arquivo csv):

In [2]:
!pip install gdown



In [3]:
import gdown

Download do arquivo:

In [4]:
url = "https://drive.google.com/u/1/uc?id=1DEJpczRY0AtIDcVjCo8FZXIi3sarxgwv&export=download"
output = 'df_total.csv'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/u/1/uc?id=1DEJpczRY0AtIDcVjCo8FZXIi3sarxgwv&export=download
To: /content/df_total.csv
100%|██████████| 55.5M/55.5M [00:00<00:00, 82.0MB/s]


'df_total.csv'

Mostrando as primeiras linhas do dataset:

In [5]:
df = pd.read_csv('df_total.csv')
df.head()

Unnamed: 0,timestamp,mac_src,mac_dst,len,device_src_name
0,1474553000.0,d0:52:a8:00:67:5e,14:cc:20:51:33:ea,1.0,Device1
1,1474553000.0,70:ee:50:18:34:43,14:cc:20:51:33:ea,54.0,Camera1
2,1474553000.0,14:cc:20:51:33:ea,18:b7:9e:02:20:44,2.0,Gateway
3,1474553000.0,14:cc:20:51:33:ea,70:ee:50:18:34:43,9.0,Gateway
4,1474553000.0,14:cc:20:51:33:ea,70:ee:50:18:34:43,213.0,Gateway


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815815 entries, 0 to 815814
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   timestamp        815815 non-null  float64
 1   mac_src          815815 non-null  object 
 2   mac_dst          815815 non-null  object 
 3   len              815815 non-null  float64
 4   device_src_name  815815 non-null  object 
dtypes: float64(2), object(3)
memory usage: 31.1+ MB


Transformando os valores da coluna device_src_name em inteiros:

In [18]:
le = LabelEncoder()

for i in df.columns:
  le = LabelEncoder()
  df[i] = le.fit_transform(df[i].values)

df

Unnamed: 0,timestamp,mac_src,mac_dst,len,device_src_name
0,0,16,8,0,7
1,1,12,8,53,3
2,2,4,10,1,8
3,3,4,16,8,8
4,4,4,16,212,8
...,...,...,...,...,...
815810,815810,7,8,89,6
815811,815811,0,8,121,5
815812,815812,4,0,137,8
815813,815813,3,8,90,19


Separando a coluna que vamos classificar em um dataset isolado, contendo apenas esta coluna:

In [19]:
df_x = df.drop('device_src_name', axis=1)
df_y = df['device_src_name']

df_y

0          7
1          3
2          8
3          8
4          8
          ..
815810     6
815811     5
815812     8
815813    19
815814     6
Name: device_src_name, Length: 815815, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, random_state=0, test_size=0.3)
print('Número de casos de treino: ', X_train.shape[0])
print('Número de casos de teste: ', X_test.shape[0])

Número de casos de treino:  571070
Número de casos de teste:  244745


Usando classificador Decision Tree:

In [21]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier().fit(X_train, y_train)

dt_predictions = dt_model.predict(X_test)
print('Acurácia do modelo: ', accuracy_score(y_test, dt_predictions))

Acurácia do modelo:  1.0


In [22]:
print('Classification Report'.center(70, '='))
print(classification_report(y_test, dt_predictions))

print('Matriz de confusão'.center(70, '='))
print(confusion_matrix(y_test, dt_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9348
           1       1.00      1.00      1.00       556
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00      5505
           4       1.00      1.00      1.00      1907
           5       1.00      1.00      1.00     16750
           6       1.00      1.00      1.00     58473
           7       1.00      1.00      1.00      5893
           8       1.00      1.00      1.00    117585
           9       1.00      1.00      1.00      8022
          10       1.00      1.00      1.00      4233
          11       1.00      1.00      1.00      1277
          12       1.00      1.00      1.00        60
          13       1.00      1.00      1.00       836
          14       1.00      1.00      1.00      1011
          15       1.00      1.00      1.00        70
          16       1.00      1.00      1.00        54
          17       1.00    

Usando MLP:

In [23]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=1000, random_state=0)

mlp.fit(X_train, y_train)
mlp_predictions = mlp.predict(X_test)

print('Acurácia: ', accuracy_score(y_test, mlp_predictions))

Acurácia:  0.4809618174017855
