In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import math
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib
import seaborn as sns

import copy
import os
import pprint
import time

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Starter functions

# Load data

In [2]:
df_train = pd.read_json(os.path.join('dataset', 'train.json'), lines=True)

In [3]:
df_test = pd.read_json(os.path.join('dataset', 'test.json'), lines=True)

In [4]:
df_train

Unnamed: 0,device_class,device_id,dhcp,ip,mac,mdns_services,services,ssdp,upnp
0,IP_PHONE,5347ada9-925c-400e-8a7c-9aedd3c142f6,,192.168.0.147,80:5e:c0:41:ad:39,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,
1,MEDIA_BOX,2717684b-3937-4644-a33a-33f4226c43ec,,192.168.1.109,44:e1:37:a2:ec:c1,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...","[{'st': '', 'nt': 'upnp:rootdevice', 'location...","[{'model_name': 'Verizon Media Server', 'model..."
2,AUDIO,f1fc42f4-c794-4cc5-ac13-a5097d722d92,,192.168.1.121,00:00:00:00:00:00,"[_airplay._tcp.local., _spotify-connect._tcp.l...","[{'port': 80, 'protocol': 'tcp'}, {'port': 80,...","[{'st': '', 'nt': 'urn:schemas-upnp-org:servic...","[{'model_name': 'Denon AVR-X2400H', 'manufactu..."
3,GAME_CONSOLE,74ab1b5b-3cb6-4aee-8362-f3b2f016574c,,192.168.0.7,40:99:22:41:ac:a6,[_spotify-connect._tcp.local.],"[{'port': 5353, 'protocol': 'udp'}, {'port': 4...",,
4,GENERIC_IOT,1fe43d89-329d-40fe-b948-d1cbe0fe6c96,"[{'paramlist': '1,28,2,3,15,6,119,12,44,47,26,...",192.168.15.26,00:07:25:15:3a:fb,,,,
5,VOICE_ASSISTANT,8bb8971c-5983-4baa-9753-f0ac21faf162,,192.168.1.80,ac:63:be:a5:50:43,"[_workstation._tcp.local., _ssh._tcp.local., _...","[{'port': 22, 'protocol': 'tcp'}, {'port': 407...",,
6,AUDIO,0acd15a9-c52a-4980-bf30-a8c8f5042412,,192.168.254.24,00:0e:58:fc:40:60,[_spotify-connect._tcp.local.],"[{'port': 1400, 'protocol': 'tcp'}, {'port': 1...","[{'st': '', 'nt': 'uuid:RINCON_000E58FC4084014...","[{'model_name': 'Sonos Play:3', 'model_descrip..."
7,VOICE_ASSISTANT,6d559ddb-84ac-4e47-83bb-e3bff6e53afe,,192.168.178.29,40:b4:cd:41:f0:cc,"[_workstation._tcp.local., _ssh._tcp.local., _...","[{'port': 22, 'protocol': 'tcp'}, {'port': 407...",,
8,IP_PHONE,c69cc92a-a813-4312-9993-7816994bc04d,,172.26.17.234,00:08:5d:22:fd:c5,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,
9,IP_PHONE,db9bdc04-0322-424a-a3d1-57432f1c7326,,172.26.17.238,00:08:5d:22:fd:d2,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,


# Feature lookup

Interesting features:
- services: list of used ports & protocols
- upnp: dict of device info --> names, types, manufactuer, ...
- mdns_services: can directly point on device type

Examples:

In [5]:
df_train['ssdp'][2]

[{'st': '',
  'nt': 'urn:schemas-upnp-org:service:RenderingControl:1',
  'location': 'http://192.168.1.121:60006/upnp/desc/aios_device/aios_device.xml',
  'server': 'LINUX UPnP/1.0 Denon-Heos/139046',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'urn:schemas-upnp-org:service:AVTransport:1',
  'location': 'http://192.168.1.121:60006/upnp/desc/aios_device/aios_device.xml',
  'server': 'LINUX UPnP/1.0 Denon-Heos/139046',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'uuid:cf2583ce-f896-14cc-0080-0005cde2625e',
  'location': 'http://192.168.1.121:60006/upnp/desc/aios_device/aios_device.xml',
  'server': 'LINUX UPnP/1.0 Denon-Heos/139046',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'urn:schemas-denon-com:device:AiosDevice:1',
  'location': 'http://192.168.1.121:60006/upnp/desc/aios_device/aios_device.xml',
  'server': 'LINUX UPnP/1.0 Denon-Heos/139046',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'uuid:5cdaa5bc-2db2-1022-0080-0005cde2625e',


## services col

Just make cols for each port and protocol.

In [6]:
ports = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    if not isinstance(row['services'], list): continue
    for d in row['services']:
        if d['port'] not in ports:
            ports.append(d['port'])
ports.sort()
print(len(ports))  # Count of unique ports
print(ports)  # Unique port list

13379
[3, 7, 9, 13, 16, 17, 19, 20, 21, 22, 23, 24, 25, 30, 32, 44, 45, 49, 50, 51, 53, 67, 68, 69, 70, 71, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 92, 94, 96, 99, 100, 101, 102, 110, 111, 114, 123, 125, 127, 134, 135, 137, 138, 139, 161, 162, 182, 199, 200, 201, 202, 209, 230, 245, 256, 311, 319, 333, 389, 427, 440, 441, 443, 444, 445, 465, 500, 501, 514, 515, 520, 546, 548, 554, 555, 559, 591, 594, 623, 625, 631, 660, 666, 777, 800, 801, 802, 806, 807, 808, 809, 811, 812, 815, 819, 832, 843, 873, 885, 960, 964, 990, 993, 1000, 1004, 1005, 1020, 1021, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096,

In the port cell:
0 = no port,
1 = tcp protocol,
2 = udp protocol

## upnp col

Get manufactuer, if possible.

In [7]:
df_train['upnp'][1]

[{'model_name': 'Verizon Media Server',
  'model_description': 'Media Server',
  'manufacturer': 'ARRIS',
  'device_type': 'urn:schemas-upnp-org:device:MediaServer:1',
  'services': ['urn:upnp-org:serviceId:ContentDirectory',
   'urn:upnp-org:serviceId:ConnectionManager']},
 {'model_name': 'VMS1100',
  'model_description': 'Device Management',
  'manufacturer': 'ARRIS',
  'device_type': 'urn:schemas-upnp-org:device:ManageableDevice:2',
  'services': ['urn:upnp-org:serviceId:BasicManagement',
   'urn:upnp-org:serviceId:ConfigurationManagement',
   'urn:upnp-org:serviceId:DeviceProtection']},
 {'model_name': 'VMS1100',
  'model_description': 'DIAL Server',
  'manufacturer': 'ARRIS',
  'device_type': 'urn:schemas-upnp-org:device:dial:1',
  'services': ['urn:upnp-org:serviceId:dial']}]

In [8]:
def get_manufacturer(upnp):
    return upnp[0]['manufacturer'] if isinstance(upnp, list) and 'manufacturer' in upnp[0].keys() else "Unknown"

manufacturers = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    m = get_manufacturer(row['upnp'])
    if m not in manufacturers:
        manufacturers.append(m)
manufacturers.sort()
print(len(manufacturers))
pprint.pprint(manufacturers)

237
[' ',
 '     ',
 ' DUAL ',
 'ABUS Security-Center',
 'ACCESS CO., LTD.',
 'ARCAM',
 'ARRIS',
 'ASUS Corporation',
 'AUNA',
 'AVTECH Corporation.',
 'AXIS',
 'Aastra Telecom Inc',
 'AirBeamTV BV',
 'Amazon',
 'Amlogic Corporation',
 'Arcadyan',
 'Argon Audio',
 'Arris Inc.',
 'Atlantis',
 'Audioblock GmbH',
 'Azureus Software, Inc.',
 'Belkin International Inc.',
 'Belkin International, Inc.',
 'BiglyBT',
 'Bitron Video',
 'BlackBerry',
 'Bose',
 'Bose Corporation',
 'Brickcom Corporation',
 'BridgeCo AG, Switzerland',
 'Broadcom Corporation',
 'Bush',
 'CANON INC.',
 'Cabasse',
 'Cambridge Audio',
 'Canon',
 'Champ',
 'Como Audio',
 'Connected Object',
 'CyberLink Corporation',
 'D-LINK',
 'D-Link',
 'D-Link Corporation',
 'D-Stream',
 'DENON',
 'DIRECTV',
 'DNT',
 'DSG Retail Ltd',
 'DUAL',
 'DVR',
 'Dell',
 'Dell Electronics',
 'Dell Inc.',
 'Denon',
 'Devialet',
 'DivX',
 'Dual',
 'ES',
 'EShare Technology Corporation',
 'EZVIZ',
 'Elgato Systems GmbH',
 'Emby',
 'Freebox',
 'Fr

In [9]:
def get_model(upnp):
    return upnp[0]['model_name'] if isinstance(upnp, list) and 'model_name' in upnp[0].keys() else "Unknown"

models = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    m = get_model(row['upnp'])
    if m not in models:
        models.append(m)
#     else:
#         models[m] += 1
models.sort()
print(len(models))
pprint.pprint(models)

4740
['\t',
 ' ',
 '     ',
 '  Dual IR 2A',
 ' Samsung C406x Series ',
 ' Samsung CLX-8640 8650 Series ',
 ' Samsung CLX-9252 9352 Series ',
 ' Samsung CLX-92x1 93x1 Series ',
 ' Samsung CLX-981x Series ',
 ' Samsung K7600 Series ',
 ' Samsung M408x Series ',
 ' Samsung M4370 5370 Series ',
 ' Samsung M458x Series ',
 ' Samsung M536x Series ',
 ' Samsung SCX-5835_5935X Series ',
 ' Samsung SCX-6545X Series ',
 ' Samsung SCX-6x55X Series ',
 ' Samsung SCX-8030 8040 Series ',
 ' Samsung SCX-8230 8240 Series ',
 ' Samsung X4300 Series ',
 ' Samsung X7600 Series ',
 ' Seagate Personal Cloud',
 ' Seagate Personal Cloud 2 Bay',
 '*AV7701',
 '*AV7702',
 '*AV7702mkII',
 '*AV8802',
 '*AVR-1613',
 '*AVR-1713',
 '*AVR-1913',
 '*AVR-2113',
 '*AVR-2313',
 '*AVR-3313',
 '*AVR-4520',
 '*AVR-E300',
 '*AVR-E400',
 '*AVR-S700W',
 '*AVR-S710W',
 '*AVR-S720W',
 '*AVR-S900W',
 '*AVR-S910W',
 '*AVR-S920W',
 '*AVR-X1000',
 '*AVR-X1100W',
 '*AVR-X1200W',
 '*AVR-X1300W',
 '*AVR-X2000',
 '*AVR-X2100W',
 '*AVR-

 'KDL-50W650A',
 'KDL-50W655A',
 'KDL-50W656A',
 'KDL-50W685A',
 'KDL-50W700A',
 'KDL-50W700B',
 'KDL-50W704A',
 'KDL-50W705B',
 'KDL-50W706B',
 'KDL-50W755C',
 'KDL-50W756C',
 'KDL-50W790B',
 'KDL-50W800B',
 'KDL-50W800C',
 'KDL-50W805B',
 'KDL-50W805C',
 'KDL-50W807B',
 'KDL-50W807C',
 'KDL-50W808C',
 'KDL-50W809C',
 'KDL-50W815B',
 'KDL-50W828B',
 'KDL-50W829B',
 'KDL-50W900B',
 'KDL-52LX905',
 'KDL-52W5500     ',
 'KDL-55EX505',
 'KDL-55EX710',
 'KDL-55EX720',
 'KDL-55EX723',
 'KDL-55EX725',
 'KDL-55HX729',
 'KDL-55HX750',
 'KDL-55HX755',
 'KDL-55HX820',
 'KDL-55HX850',
 'KDL-55HX853',
 'KDL-55HX920',
 'KDL-55NX720',
 'KDL-55NX725',
 'KDL-55NX810',
 'KDL-55W700B',
 'KDL-55W755C',
 'KDL-55W756C',
 'KDL-55W790B',
 'KDL-55W800B',
 'KDL-55W800C',
 'KDL-55W802A',
 'KDL-55W805A',
 'KDL-55W805B',
 'KDL-55W805C',
 'KDL-55W807A',
 'KDL-55W808A',
 'KDL-55W808C',
 'KDL-55W809C',
 'KDL-55W815B',
 'KDL-55W817B',
 'KDL-55W828B',
 'KDL-55W829B',
 'KDL-55W900A',
 'KDL-55W905A',
 'KDL-55W920A',
 'K

 'UA55NU7100',
 'UA55NU8000',
 'UA55NU8500',
 'UA60J6200',
 'UA65KU7000',
 'UA65MU6100',
 'UA65MU7000',
 'UA78JU7500',
 'UBD-M7500',
 'UBD-M8500',
 'UE22H5600',
 'UE32D5500',
 'UE32D6100',
 'UE32D6500',
 'UE32EH5300',
 'UE32EH5307',
 'UE32ES5500',
 'UE32ES6100',
 'UE32ES6300',
 'UE32ES6307',
 'UE32ES6500',
 'UE32ES6580',
 'UE32ES6710',
 'UE32F4500',
 'UE32F5300',
 'UE32F5500',
 'UE32F6330',
 'UE32F6400',
 'UE32F6510',
 'UE32H4500',
 'UE32H5303',
 'UE32H5500',
 'UE32H6200',
 'UE32H6410',
 'UE32J4500',
 'UE32J4570',
 'UE32J5200',
 'UE32J5500',
 'UE32K5500',
 'UE32K5572',
 'UE32M5500',
 'UE32M5503',
 'UE32M5505',
 'UE32M5520',
 'UE32M5522',
 'UE32M5550',
 'UE32M5590',
 'UE32M5602',
 'UE32M5672',
 'UE37ES5500',
 'UE37ES6300',
 'UE37ES6307',
 'UE39F5500',
 'UE40D5000',
 'UE40D5500',
 'UE40D5700',
 'UE40D6100',
 'UE40D6500',
 'UE40D7000',
 'UE40D8000',
 'UE40EH5300',
 'UE40ES5500',
 'UE40ES5507',
 'UE40ES5537',
 'UE40ES5700',
 'UE40ES6100',
 'UE40ES6200',
 'UE40ES6300',
 'UE40ES6307',
 'UE40

In [10]:
def get_premodel(upnp):
    return upnp[0]['model_name'].split(' ')[0] if isinstance(upnp, list) and 'model_name' in upnp[0].keys() else "Unknown"

premodels = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    m = get_premodel(row['upnp'])
    if m not in premodels:
        premodels.append(m)
#     else:
#         premodels[m] += 1
premodels.sort()
print(len(premodels))
pprint.pprint(premodels)

3524
['',
 '\t',
 '*AV7701',
 '*AV7702',
 '*AV7702mkII',
 '*AV8802',
 '*AVR-1613',
 '*AVR-1713',
 '*AVR-1913',
 '*AVR-2113',
 '*AVR-2313',
 '*AVR-3313',
 '*AVR-4520',
 '*AVR-E300',
 '*AVR-E400',
 '*AVR-S700W',
 '*AVR-S710W',
 '*AVR-S720W',
 '*AVR-S900W',
 '*AVR-S910W',
 '*AVR-S920W',
 '*AVR-X1000',
 '*AVR-X1100W',
 '*AVR-X1200W',
 '*AVR-X1300W',
 '*AVR-X2000',
 '*AVR-X2100W',
 '*AVR-X2200W',
 '*AVR-X2300W',
 '*AVR-X3000',
 '*AVR-X3100W',
 '*AVR-X3200W',
 '*AVR-X3300W',
 '*AVR-X4000',
 '*AVR-X4100W',
 '*AVR-X4200W',
 '*AVR-X5200W',
 '*AVR-X6200W',
 '*AVR-X7200W',
 '*DNP-730',
 '*DRA-N4',
 '*M-CR510',
 '*M-CR511',
 '*M-CR610',
 '*M-CR611',
 '*MX122',
 '*NA6005',
 '*NA8005',
 '*NR1504',
 '*NR1506',
 '*NR1603',
 '*NR1604',
 '*NR1605',
 '*NR1606',
 '*NR1607',
 '*RCD-N9',
 '*SR5007',
 '*SR5009',
 '*SR5010',
 '*SR5011',
 '*SR6007',
 '*SR6008',
 '*SR6009',
 '*SR6010',
 '*SR6011',
 '*SR7007',
 '*SR7009',
 '*SR7010',
 '-',
 '-------',
 '1.3',
 '192.168.0.53',
 '192.168.1.102',
 '2-Way',
 '2.4GHz

 'SEC30CDA73256F6',
 'SEC30CDA733B665',
 'SEC30CDA733FED1',
 'SEC30CDA734F32C',
 'SEC30CDA7362A69',
 'SEC30CDA7364A9D',
 'SEC30CDA736A4E2',
 'SEC30CDA736E5EB',
 'SEC30CDA736F88A',
 'SEC30CDA737D6D1',
 'SEC30CDA738208D',
 'SEC30CDA738912D',
 'SEC30CDA738D743',
 'SEC30CDA738E7B1',
 'SEC30CDA739231D',
 'SEC30CDA73934A4',
 'SEC30CDA73A6129',
 'SEC30CDA73B2DBE',
 'SEC30CDA73B845C',
 'SEC30CDA73C1FF3',
 'SEC30CDA73C623C',
 'SEC30CDA73CEBA7',
 'SEC30CDA73D4B96',
 'SEC30CDA73DAE3F',
 'SEC30CDA73DCD90',
 'SEC30CDA75D2DAC',
 'SEC30CDA75D437B',
 'SEC30CDA75D4BBF',
 'SEC30CDA75D548E',
 'SEC30CDA75D6E3C',
 'SEC30CDA75D8D4C',
 'SEC30CDA75D93FC',
 'SEC30CDA75D9950',
 'SEC30CDA75DED91',
 'SEC30CDA75E11EE',
 'SEC30CDA75E1FC4',
 'SEC30CDA75E4107',
 'SEC30CDA75E84D4',
 'SEC30CDA75EBCB2',
 'SEC30CDA75ED239',
 'SEC30CDA763BE68',
 'SEC30CDA76489B3',
 'SEC30CDA764949D',
 'SEC30CDA764BBD0',
 'SEC30CDA764CC56',
 'SEC30CDA764F65A',
 'SEC30CDA76503B9',
 'SEC30CDA7651E07',
 'SEC30CDA7652073',
 'SEC30CDA76539BF',


 'XBR-65X850D',
 'XBR-65X850E',
 'XBR-65X900E',
 'XBR-65X900F',
 'XBR-65X905E',
 'XBR-65X905F',
 'XBR-65X930D',
 'XBR-65Z9D',
 'XBR-75X850D',
 'XBR-75X850E',
 'XBR-75X850F',
 'XBR-75X855D',
 'XBR-75X900E',
 'XBR-75X900F',
 'XBR-75X905E',
 'XBR-75X940C',
 'XBR-85X850D',
 'XDA-QS5400',
 'XEROX_SEGURANCA',
 'XRX0000AA97BCEB',
 'XRX0000AA97F811',
 'XRX0000AAB9DCC0',
 'XRX0000AABC5A2C',
 'XRX0000AABD7C63',
 'XRX0000AABEDFD6',
 'XRX0000AABF5788',
 'XRX0000AAC07E03',
 'XRX0000AAC95DB7',
 'XRX0000AAC964B1',
 'XRX0000AACD7033',
 'XRX0000AACD88C1',
 'XRX0000AACEE912',
 'XRX0000AAD8E2CF',
 'XRX0000AAD8EA0E',
 'XRX0000AAD95507',
 'XRX0000AADB9DDD',
 'XRX0000AADBC750',
 'XRX0000AADBC995',
 'XRX0000AADBE2C5',
 'XRX0000AADE327B',
 'XRX0000AAF2A444',
 'XRX3315-979',
 'XRX9C934E08414C',
 'XRX9C934E088031',
 'XRX9C934E08AE4D',
 'XRX9C934E08EB1E',
 'XRX9C934E093F8B',
 'XRX9C934E09B732',
 'XRX9C934E09D4FD',
 'XRX9C934E0A6D22',
 'XRX9C934E17B655',
 'XRX9C934E1801BB',
 'XRX9C934E189A48',
 'XRX9C934E212145',

In [11]:
def get_upnp_services(upnp):
    if isinstance(upnp, list) and 'services' in upnp[0].keys():
        ss = []
        for d in upnp:
            ss += [x.split(':')[-1] for x in d['services']] 
        return list(filter(lambda x: not x.isdigit(), ss))
    return ["Unknown"]

upnp_services = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    xs = get_upnp_services(row['upnp'])
    for x in xs:
        if x not in upnp_services:
            upnp_services.append(x)
#             upnp_services[x] = 1
#         else:
#             upnp_services[x] += 1
upnp_services.sort()
print(len(upnp_services))
pprint.pprint(upnp_services)

148
['',
 '(null)',
 'ABControl',
 'ACT',
 'AVTransport',
 'AccessControlServer',
 'AlarmClock',
 'AudioIn',
 'BasicManagement',
 'BasicServiceId',
 'CDS_0-99',
 'CMGR_0-99',
 'ConfigService',
 'ConfigurationManagement',
 'ConnectionManager',
 'ConnectionManager1',
 'ConnectionManager_E114A616-9B1B-4904-B2D4-A1DA02E4C439',
 'ContentDirectory',
 'ContentDirectory1',
 'DPSConnectionManager',
 'DeviceProperties',
 'DeviceProtection',
 'DigitalSecurityCameraSettings',
 'DigitalSecurityCameraStillImage',
 'DiscoveryService',
 'EmbeddedNetDeviceControl',
 'EmptyService',
 'ErrorHandler',
 'EventLogService',
 'FileTransfer',
 'Group',
 'GroupControl',
 'GroupManagement',
 'GroupRenderingControl',
 'GsensorControlServer',
 'HTControl',
 'HisenseLanControl',
 'INETTV',
 'IPChange',
 'IPControlService',
 'IRCC',
 'L3Forwarding1',
 'LG_SIGNAGE',
 'LibraryServer',
 'LnvConnectService1.0',
 'Log',
 'MainTVAgent2',
 'MediaControl1',
 'MirrorControlServer',
 'Monitor',
 'MultiChannel',
 'MultiScreenS

## mdns_services col

In [12]:
services = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    if not isinstance(row['mdns_services'], list): continue
    for s in row['mdns_services']:
        if s not in services:
            services.append(s)
services.sort()
print(len(services))
pprint.pprint(services)

445
['.',
 '_13230._tcp.local.',
 '_176562606B454C0._tcp.local.',
 '_1password4._tcp.local.',
 '_501SVC02VC0TPJ1GN._tcp.local.',
 '_501SVC07SH21CG1HW._tcp.local.',
 '_AcctsCfgSrvc._tcp.local.',
 '_Astropad._tcp.local.',
 '_CGI._tcp.local.',
 '_DistantRouter_asy._tcp.local.',
 '_ExtensisSN4._tcp.local.',
 '_FullControlServer._tcp.local.',
 '_HippoRemote._tcp.local.',
 '_JB2XiOSApp._tcp.local.',
 '_JBRemoteServer._tcp.local.',
 '_JUDO-dp._tcp.local.',
 '_JeppFDRouteXfr._tcp.local.',
 '_KeynoteControl._tcp.local.',
 '_LifeLineDevice._tcp.local.',
 '_MacRCServer._tcp.local.',
 '_Naim-Updater._tcp.local.',
 '_OZOmniFocus2._udp.local.',
 '_OZOmniFocus3._udp.local.',
 '_PRO17MP2W._tcp.local.',
 '_SSC02VC0TPJ1GN._tcp.local.',
 '_SSC07SH21CG1HW._tcp.local.',
 '_STVADevSettings._tcp.local.',
 '_SVC02VC0TPJ1GN._tcp.local.',
 '_SVC07SH21CG1HW._tcp.local.',
 '_WECB_AEI._tcp.local.',
 '_aastra-cfg._tcp.local.',
 '_acdseems._tcp.local.',
 '_acer._tcp.local.',
 '_acestreamcast._tcp.local.',
 '_aci._tc

We will need to put similar name companies into one.

## dhcp col

In [13]:
dhcps = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    if not isinstance(row['dhcp'], list) or 'paramlist' not in row['dhcp'][0]: continue
    for p in row['dhcp']:
        for n in p['paramlist'].split(','):
            if n not in dhcps:
                dhcps.append(n)
dhcps = list(map(lambda x: 'p' + x, dhcps))
dhcps.sort()
print(len(dhcps))
pprint.pprint(dhcps)

108
['p1',
 'p10',
 'p100',
 'p101',
 'p11',
 'p119',
 'p12',
 'p120',
 'p121',
 'p125',
 'p128',
 'p129',
 'p13',
 'p130',
 'p131',
 'p132',
 'p133',
 'p134',
 'p135',
 'p14',
 'p142',
 'p143',
 'p144',
 'p15',
 'p150',
 'p151',
 'p153',
 'p154',
 'p156',
 'p157',
 'p159',
 'p16',
 'p160',
 'p161',
 'p17',
 'p170',
 'p18',
 'p19',
 'p190',
 'p191',
 'p2',
 'p20',
 'p204',
 'p21',
 'p22',
 'p23',
 'p24',
 'p244',
 'p245',
 'p249',
 'p25',
 'p252',
 'p255',
 'p26',
 'p27',
 'p28',
 'p29',
 'p3',
 'p30',
 'p31',
 'p32',
 'p33',
 'p34',
 'p35',
 'p36',
 'p37',
 'p38',
 'p39',
 'p4',
 'p40',
 'p41',
 'p42',
 'p43',
 'p44',
 'p45',
 'p46',
 'p47',
 'p48',
 'p49',
 'p5',
 'p50',
 'p51',
 'p52',
 'p53',
 'p54',
 'p55',
 'p56',
 'p57',
 'p58',
 'p59',
 'p6',
 'p60',
 'p61',
 'p66',
 'p67',
 'p69',
 'p7',
 'p70',
 'p72',
 'p78',
 'p79',
 'p8',
 'p81',
 'p88',
 'p9',
 'p95',
 'p97',
 'p99']


## First half of MAC (HW manufacturer)

In [14]:
def get_mac(mac):
    return mac[:8]

macs = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    m = get_mac(row['mac'])
    if m not in macs:
        macs.append(m)
macs.sort()
print(len(macs))
pprint.pprint(macs)

2890
['00:00:00',
 '00:00:48',
 '00:00:74',
 '00:00:85',
 '00:00:aa',
 '00:00:c0',
 '00:00:f0',
 '00:01:db',
 '00:01:e6',
 '00:02:9b',
 '00:02:fd',
 '00:04:00',
 '00:04:4b',
 '00:04:f2',
 '00:05:cd',
 '00:06:78',
 '00:07:25',
 '00:08:02',
 '00:08:5d',
 '00:08:9b',
 '00:09:b0',
 '00:09:df',
 '00:0a:eb',
 '00:0c:76',
 '00:0c:8a',
 '00:0d:4b',
 '00:0d:88',
 '00:0e:2e',
 '00:0e:58',
 '00:0e:7f',
 '00:0f:ea',
 '00:10:19',
 '00:10:75',
 '00:10:c6',
 '00:11:09',
 '00:11:0a',
 '00:11:24',
 '00:11:2f',
 '00:11:32',
 '00:11:43',
 '00:11:5b',
 '00:11:a9',
 '00:11:f6',
 '00:12:3f',
 '00:12:ab',
 '00:12:fb',
 '00:13:02',
 '00:13:20',
 '00:13:21',
 '00:13:46',
 '00:13:72',
 '00:13:77',
 '00:13:8f',
 '00:13:d3',
 '00:13:e8',
 '00:14:22',
 '00:14:2a',
 '00:14:38',
 '00:14:51',
 '00:14:78',
 '00:14:85',
 '00:14:c2',
 '00:14:d1',
 '00:14:ee',
 '00:14:fd',
 '00:15:00',
 '00:15:0d',
 '00:15:17',
 '00:15:5d',
 '00:15:60',
 '00:15:65',
 '00:15:94',
 '00:15:99',
 '00:15:af',
 '00:15:c5',
 '00:15:f2',
 '00:16

 '84:25:19',
 '84:29:99',
 '84:2b:2b',
 '84:2e:27',
 '84:34:97',
 '84:38:35',
 '84:39:be',
 '84:3a:4b',
 '84:41:67',
 '84:4b:f5',
 '84:61:a0',
 '84:78:8b',
 '84:7b:eb',
 '84:80:2d',
 '84:85:06',
 '84:89:ad',
 '84:8e:0c',
 '84:8f:69',
 '84:98:66',
 '84:9c:a6',
 '84:9f:b5',
 '84:a1:34',
 '84:a1:d1',
 '84:a4:66',
 '84:a6:c8',
 '84:a9:3e',
 '84:b5:41',
 '84:ba:3b',
 '84:be:52',
 '84:c0:ef',
 '84:c9:b2',
 '84:d6:d0',
 '84:ef:18',
 '84:f3:eb',
 '84:fc:ac',
 '86:16:f9',
 '86:4b:f5',
 '88:03:55',
 '88:07:4b',
 '88:11:96',
 '88:19:08',
 '88:1f:a1',
 '88:28:b3',
 '88:30:8a',
 '88:36:5f',
 '88:3d:24',
 '88:43:e1',
 '88:44:77',
 '88:51:fb',
 '88:53:2e',
 '88:53:95',
 '88:63:df',
 '88:66:a5',
 '88:6b:6e',
 '88:71:b1',
 '88:71:e5',
 '88:75:98',
 '88:78:73',
 '88:79:7e',
 '88:82:79',
 '88:83:22',
 '88:87:17',
 '88:9f:6f',
 '88:9f:fa',
 '88:ad:d2',
 '88:ae:07',
 '88:ae:1d',
 '88:b1:11',
 '88:b4:a6',
 '88:bd:45',
 '88:c9:d0',
 '88:cb:87',
 '88:d5:0c',
 '88:d7:f6',
 '88:de:a9',
 '88:e8:7f',
 '88:e9:fe',

## ssdp col

In [15]:
ssdp_nts = []
for _, row in pd.concat([df_train, df_test], axis=0, sort=False).iterrows():
    if not isinstance(row['ssdp'], list): continue
    for d in row['ssdp']:
        if 'nt' not in d: continue
        nts = d['nt'].split(':')
        if nts[0] != 'urn' and nts[0] != 'upnp': continue
        for nt in nts:
            if nt.replace('.','',1).isdigit(): continue
            if 's' + nt not in ssdp_nts:
                ssdp_nts.append('s' + nt)
ssdp_nts.sort()
print(len(ssdp_nts))
pprint.pprint(ssdp_nts)

394
['s',
 'sABControl',
 'sACT',
 'sACT-Denon',
 'sAVTransport',
 'sAccessControlServer',
 'sAiosDevice',
 'sAiosServices',
 'sAirBorne',
 'sAlarmClock',
 'sAlphanetworks',
 'sAquosIPC',
 'sAudioIn',
 'sBANAS',
 'sBasic',
 'sBasicManagement',
 'sBasicService',
 'sBboxTV',
 'sBelkin',
 'sBinaryLight',
 'sBroadCaster',
 'sCAService',
 'sCameraConnectedMobileService',
 'sChangeIP',
 'sClientImageManager',
 'sClientOverride',
 'sConfigDevice',
 'sConfigService',
 'sConfigurationManagement',
 'sConnectionMalager',
 'sConnectionManager',
 'sContentDirectory',
 'sD-LinkCamera-8100lh',
 'sDCS-2100',
 'sDCS-3220',
 'sDCS-3420',
 'sDCS-5300',
 'sDCS-5300G',
 'sDCS-8200LH',
 'sDCS-935L',
 'sDCS-935LH',
 'sDCS-960L',
 'sDPSConnectionManager',
 'sDPSPrinter',
 'sDVR',
 'sDeviceProperties',
 'sDeviceProtection',
 'sDigitalSecurityCamera',
 'sDigitalSecurityCameraSettings',
 'sDigitalSecurityCameraStillImage',
 'sDiscoveryService',
 'sEmbeddedNetDevice',
 'sEmbeddedNetDeviceControl',
 'sEmptyService

# Feature preprocessing

Prepare data for ML

In [16]:
le = preprocessing.LabelEncoder()
classes = list(set(df_train['device_class']))
encoded_classes = le.fit_transform(classes)
classes_indexes = dict(zip(classes, encoded_classes))
index_classes = dict(zip(encoded_classes, classes))
pprint.pprint(classes_indexes)

{'AUDIO': 0,
 'GAME_CONSOLE': 1,
 'GENERIC_IOT': 2,
 'HOME_AUTOMATION': 3,
 'IP_PHONE': 4,
 'MEDIA_BOX': 5,
 'MOBILE': 6,
 'NAS': 7,
 'PC': 8,
 'PRINTER': 9,
 'SURVEILLANCE': 10,
 'TV': 11,
 'VOICE_ASSISTANT': 12}


Process input data into df.

## Data to vectors

In [17]:
def protocol2index(protocol):
    if protocol == 'tcp': return 1
    elif protocol == 'udp': return 2
    else: return 0

def get_ports(services):
    return list(map(lambda x: (x['port'], protocol2index(x['protocol'])), services)) if isinstance(services, list) else []

# def get_manufacturer()
# def get_model()
# def get_premodel()
# def get_upnp_services()

def get_services(mdns_services):
    return mdns_services if isinstance(mdns_services, list) else []

def get_dhcps(dhcp):
    return list(map(lambda x: 'p' + x, dhcp[0]['paramlist'].split(','))) if isinstance(dhcp, list) and 'paramlist' in dhcp[0] else []

# def get_mac()

def get_ssdp_nt(ssdp):
    if not isinstance(ssdp, list): return []
    ret_nts = []
    for d in ssdp:
        if 'nt' not in d: continue
        nts = d['nt'].split(':')
        if nts[0] != 'urn' and nts[0] != 'upnp': continue
        for nt in nts:
            if nt.replace('.','',1).isdigit(): continue
            if 's' + nt not in ret_nts:
                ret_nts.append('s' + nt)
    return ret_nts

# def get_upnp_services()

mac2index = dict(zip(macs, le.fit_transform(macs)))
manufacturer2index = dict(zip(manufacturers, le.fit_transform(manufacturers)))
model2index = dict(zip(models, le.fit_transform(models)))
premodel2index = dict(zip(premodels, le.fit_transform(premodels)))

In [18]:
def input2df(input_df, ports, manufacturers, services, macs, dhcps, models, premodels, ssdp_nts, upnp_services, is_train=True):
    t_start = time.time()
    col_names = ports + ['manufacturer'] + services + dhcps + ['mac'] + ['model'] + ['premodel'] + ssdp_nts + upnp_services
#     col_names = ports + services + ['mac'] + ['model'] + ['premodel'] + upnp_services
#     col_names = ports + ['manufacturer'] + services + ['mac'] + ['model'] + ['premodel'] + upnp_services
    if is_train:
        col_names += ['CLASS']
    col2index_dict = dict(zip(col_names, range(len(col_names))))
    in_dict = dict((k, []) for k in col_names)

#     c = 0

    for _, row in input_df.iterrows():
#         if(c == 7000): break
        new_row = np.zeros(len(col_names))

        curr_ports = get_ports(row['services'])
        curr_manufacturer = get_manufacturer(row['upnp'])
        curr_services = get_services(row['mdns_services'])
        curr_mac = get_mac(row['mac'])
        curr_dhcp = get_dhcps(row['dhcp'])
        curr_model = get_model(row['upnp'])
        curr_premodel = get_premodel(row['upnp'])
        curr_ssdp_nts = get_ssdp_nt(row['ssdp'])
        curr_upnp_services = get_upnp_services(row['upnp'])

        for p, val in curr_ports:
            if p in col2index_dict.keys():
                new_row[col2index_dict[p]] = val
        if curr_manufacturer in manufacturer2index.keys():
            new_row[col2index_dict['manufacturer']] = manufacturer2index[curr_manufacturer]
#             new_row[col2index_dict[curr_manufacturer]] = 1
        for s in curr_services:
            if s in col2index_dict.keys():
                new_row[col2index_dict[s]] = 1
        if curr_mac in mac2index.keys():
            new_row[col2index_dict['mac']] = mac2index[curr_mac]
        for p in curr_dhcp:
            if p in col2index_dict.keys():
                new_row[col2index_dict[p]] = 1
        if curr_model in model2index.keys():
            new_row[col2index_dict['model']] = model2index[curr_model]
        if curr_premodel in premodel2index.keys():
            new_row[col2index_dict['premodel']] = premodel2index[curr_premodel]
        for nt in curr_ssdp_nts:
            if nt in col2index_dict.keys():
                new_row[col2index_dict[nt]] = 1
        for us in curr_upnp_services:
            if us in col2index_dict.keys():
                new_row[col2index_dict[us]] = 1
        if is_train:
            new_row[-1] = classes_indexes[row['device_class']]

        for n, k in enumerate(in_dict.keys()):
            in_dict[k].append(new_row[n])

#         c += 1
    
    output = pd.DataFrame(in_dict)
    print("Seconds: {}".format(round(time.time() - t_start)))
    return output

In [19]:
get_ports(df_train['services'][0])

[(80, 1), (443, 1), (1434, 2), (5060, 1), (5351, 2), (5353, 2), (5683, 2)]

In [20]:
get_manufacturer(df_train['upnp'][2])

'Denon'

In [21]:
get_services(df_train['mdns_services'][2])

['_airplay._tcp.local.',
 '_spotify-connect._tcp.local.',
 '_raop._tcp.local.',
 '_http._tcp.local.']

In [22]:
get_mac(df_train['mac'][0])

'80:5e:c0'

In [23]:
get_dhcps(df_train['dhcp'][4])

['p1',
 'p28',
 'p2',
 'p3',
 'p15',
 'p6',
 'p119',
 'p12',
 'p44',
 'p47',
 'p26',
 'p121',
 'p42']

In [24]:
get_ssdp_nt(df_train['ssdp'][1])

['supnp',
 'srootdevice',
 'surn',
 'sschemas-upnp-org',
 'sservice',
 'sBasicManagement',
 'sdevice',
 'sManageableDevice',
 'sDeviceProtection',
 'sConfigurationManagement',
 'sMediaServer',
 'sConnectionManager',
 'sContentDirectory']

In [25]:
get_upnp_services(df_train['upnp'][1])

['ContentDirectory',
 'ConnectionManager',
 'BasicManagement',
 'ConfigurationManagement',
 'DeviceProtection',
 'dial']

## Splitting data

In [26]:
df = input2df(df_train, ports, manufacturers, services, macs, dhcps, models, premodels, ssdp_nts, upnp_services)

Seconds: 3638


In [27]:
# A) Use train data only
# X_train, X_test, y_train, y_test = train_test_split(df.drop('CLASS', axis=1), df['CLASS'], test_size=0.2)

# B) Perform regular ML
X_train = df.drop('CLASS', axis=1)
y_train = df['CLASS']

# Clasification method

## Method experiments

(In case A)

In [24]:
t_start = time.time()
knn = KNeighborsClassifier(n_neighbors=10, n_jobs=4, p=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.8866666666666667
Time: 11.554763555526733


In [25]:
t_start = time.time()
knn = KNeighborsClassifier(n_neighbors=10, n_jobs=4, p=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.8683333333333333
Time: 12.049591302871704


In [102]:
t_start = time.time()
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)



F1-score: 0.8975
Time: 2.624030590057373


In [27]:
t_start = time.time()
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)



F1-score: 0.24333333333333335
Time: 411.25162148475647


In [28]:
t_start = time.time()
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.7716666666666666
Time: 2.0739998817443848


In [103]:
t_start = time.time()
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.969
Time: 27.08111023902893


In [32]:
# for n in range(100, 410, 10):
#     t_start = time.time()
#     random_forest = RandomForestClassifier(n_estimators=n)
#     random_forest.fit(X_train, y_train)
#     y_pred = random_forest.predict(X_test)
#     t_end = time.time()
#     print("n =", n)
#     print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
#     print("Time:", t_end - t_start, "\n")
    
#     t_start = time.time()
#     random_forest = RandomForestClassifier(n_estimators=n, max_features=None)
#     random_forest.fit(X_train, y_train)
#     y_pred = random_forest.predict(X_test)
#     t_end = time.time()
#     print("n =", n)
#     print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
#     print("Time:", t_end - t_start, "\n")

## Chosen method application

**(In case B)**

In [54]:
X_test = input2df(df_test, ports, manufacturers, services, macs, dhcps, models, premodels, ssdp_nts, upnp_services, is_train=False)

Seconds: 5532


# Train data eval


In [58]:
classifier = RandomForestClassifier(n_estimators=50, n_jobs=-1)
# classifier = LogisticRegression(n_jobs=-1)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [59]:
y_pred = classifier.predict(X_test)

In [60]:
pred = list(map(lambda x: index_classes[x], y_pred))
output_df = pd.DataFrame({'Id': df_test['device_id'], 'Predicted': pred})
output_df

Unnamed: 0,Id,Predicted
0,a3529c63-8d5f-4a90-85f0-33a10b9c5ca4,MOBILE
1,addb3142-6b4a-4aef-9d00-ce7ab250c05c,GAME_CONSOLE
2,939102ad-730b-48e5-8f36-438a68192aa9,TV
3,e8b4c2b1-8820-475d-8f75-0f1dda325539,TV
4,6e7e5101-9f82-45c8-a0f0-17f58f77d82c,HOME_AUTOMATION
5,89197990-6549-4463-89e0-929c3d8c988a,TV
6,655699d3-e035-4dcd-b5be-4d55280ac41b,TV
7,1cbfb119-74b0-488e-99fe-6f4aac53248a,IP_PHONE
8,1a5d561a-2e87-4e0f-8610-b7c39293f169,GAME_CONSOLE
9,9e574f6a-8a96-48e8-b3fd-1da1f0c2965c,GAME_CONSOLE


In [61]:
output_df.to_csv('submission.csv', sep=',', index=False)