In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import math
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib
import seaborn as sns

import copy
import os
import pprint
import time

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Starter functions

# Load data

In [2]:
df_train = pd.read_json(os.path.join('dataset', 'train.json'), lines=True)

In [3]:
df_test = pd.read_json(os.path.join('dataset', 'test.json'), lines=True)

In [3]:
df_train

Unnamed: 0,device_class,device_id,dhcp,ip,mac,mdns_services,services,ssdp,upnp
0,IP_PHONE,5347ada9-925c-400e-8a7c-9aedd3c142f6,,192.168.0.147,80:5e:c0:41:ad:39,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,
1,MEDIA_BOX,2717684b-3937-4644-a33a-33f4226c43ec,,192.168.1.109,44:e1:37:a2:ec:c1,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...","[{'st': '', 'nt': 'upnp:rootdevice', 'location...","[{'model_name': 'Verizon Media Server', 'model..."
2,AUDIO,f1fc42f4-c794-4cc5-ac13-a5097d722d92,,192.168.1.121,00:00:00:00:00:00,"[_airplay._tcp.local., _spotify-connect._tcp.l...","[{'port': 80, 'protocol': 'tcp'}, {'port': 80,...","[{'st': '', 'nt': 'urn:schemas-upnp-org:servic...","[{'model_name': 'Denon AVR-X2400H', 'manufactu..."
3,GAME_CONSOLE,74ab1b5b-3cb6-4aee-8362-f3b2f016574c,,192.168.0.7,40:99:22:41:ac:a6,[_spotify-connect._tcp.local.],"[{'port': 5353, 'protocol': 'udp'}, {'port': 4...",,
4,GENERIC_IOT,1fe43d89-329d-40fe-b948-d1cbe0fe6c96,"[{'paramlist': '1,28,2,3,15,6,119,12,44,47,26,...",192.168.15.26,00:07:25:15:3a:fb,,,,
5,VOICE_ASSISTANT,8bb8971c-5983-4baa-9753-f0ac21faf162,,192.168.1.80,ac:63:be:a5:50:43,"[_workstation._tcp.local., _ssh._tcp.local., _...","[{'port': 22, 'protocol': 'tcp'}, {'port': 407...",,
6,AUDIO,0acd15a9-c52a-4980-bf30-a8c8f5042412,,192.168.254.24,00:0e:58:fc:40:60,[_spotify-connect._tcp.local.],"[{'port': 1400, 'protocol': 'tcp'}, {'port': 1...","[{'st': '', 'nt': 'uuid:RINCON_000E58FC4084014...","[{'model_name': 'Sonos Play:3', 'model_descrip..."
7,VOICE_ASSISTANT,6d559ddb-84ac-4e47-83bb-e3bff6e53afe,,192.168.178.29,40:b4:cd:41:f0:cc,"[_workstation._tcp.local., _ssh._tcp.local., _...","[{'port': 22, 'protocol': 'tcp'}, {'port': 407...",,
8,IP_PHONE,c69cc92a-a813-4312-9993-7816994bc04d,,172.26.17.234,00:08:5d:22:fd:c5,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,
9,IP_PHONE,db9bdc04-0322-424a-a3d1-57432f1c7326,,172.26.17.238,00:08:5d:22:fd:d2,,"[{'port': 80, 'protocol': 'tcp'}, {'port': 443...",,


# Feature lookup

Interesting features:
- services: list of used ports & protocols
- upnp: dict of device info --> names, types, manufactuer, ...
- mdns_services: can directly point on device type

Examples:

In [35]:
df_train['mdns_services'][2]

['_airplay._tcp.local.',
 '_spotify-connect._tcp.local.',
 '_raop._tcp.local.',
 '_http._tcp.local.']

In [4]:
df_train['services'][27]

[{'port': 135, 'protocol': 'tcp'},
 {'port': 1900, 'protocol': 'udp'},
 {'port': 2869, 'protocol': 'tcp'},
 {'port': 5357, 'protocol': 'tcp'}]

In [5]:
df_train['upnp'][2]

[{'model_name': 'Denon AVR-X2400H',
  'manufacturer': 'Denon',
  'device_type': 'urn:schemas-denon-com:device:AiosDevice:1',
  'services': []},
 {'model_name': 'Denon AVR-X2400H',
  'manufacturer': 'Denon',
  'device_type': 'urn:schemas-upnp-org:device:MediaRenderer:1',
  'services': ['urn:upnp-org:serviceId:AVTransport',
   'urn:upnp-org:serviceId:ConnectionManager',
   'urn:upnp-org:serviceId:RenderingControl']},
 {'model_name': 'Denon AVR-X2400H',
  'manufacturer': 'Denon',
  'device_type': 'urn:schemas-denon-com:device:AiosServices:1',
  'services': ['urn:denon-com:serviceId:ErrorHandler',
   'urn:denon-com:serviceId:ZoneControl',
   'urn:denon-com:serviceId:GroupControl']},
 {'model_name': 'Denon AVR-X2400H',
  'manufacturer': 'Denon',
  'device_type': 'urn:schemas-denon-com:device:ACT-Denon:1',
  'services': ['urn:denon-com:serviceId:ACT']},
 {'model_name': 'Denon AVR-X2400H',
  'model_description': 'Shares User defined folders and files to other Universal Plug and Play media dev

In [6]:
df_train['upnp'][27]

[{'model_name': 'Windows Media Player Sharing',
  'manufacturer': 'Microsoft Corporation',
  'device_type': 'urn:schemas-upnp-org:device:MediaServer:1',
  'services': ['urn:upnp-org:serviceId:ConnectionManager',
   'urn:upnp-org:serviceId:ContentDirectory',
   'urn:microsoft.com:serviceId:X_MS_MediaReceiverRegistrar']}]

In [7]:
df_train['dhcp'][4]

[{'paramlist': '1,28,2,3,15,6,119,12,44,47,26,121,42'}]

In [8]:
df_train['ssdp'][1]

[{'st': '',
  'nt': 'upnp:rootdevice',
  'location': 'http://192.168.1.109:9098/device_description.xml',
  'server': 'ARRIS DIAL/1.7.2 UPnP/1.0 ARRIS Settop Box',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'upnp:rootdevice',
  'location': 'http://192.168.1.109:8091/XD/21e13e66-1dd2-11b2-9b87-44e137a2ec6a',
  'server': 'Allegro-Software-RomPager/5.41 UPnP/1.0 ARRIS Settop Box',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'urn:schemas-upnp-org:service:BasicManagement:2',
  'location': 'http://192.168.1.109:8091/XD/21e13e66-1dd2-11b2-9b87-44e137a2ec6a',
  'server': 'Allegro-Software-RomPager/5.41 UPnP/1.0 ARRIS Settop Box',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'urn:schemas-upnp-org:device:ManageableDevice:2',
  'location': 'http://192.168.1.109:8091/XD/21e13e66-1dd2-11b2-9b87-44e137a2ec6a',
  'server': 'Allegro-Software-RomPager/5.41 UPnP/1.0 ARRIS Settop Box',
  'user_agent': '',
  'method': ''},
 {'st': '',
  'nt': 'uuid:21e13e66-1dd2-11b2-

## services col

Just make cols for each port and protocol.

In [9]:
ports = []
for _, row in df_train.iterrows():
    if not isinstance(row['services'], list): continue
    for d in row['services']:
        if d['port'] not in ports:
            ports.append(d['port'])
ports.sort()
print(len(ports))  # Count of unique ports
print(ports)  # Unique port list

7854
[3, 7, 9, 13, 17, 19, 20, 21, 22, 23, 25, 32, 45, 50, 53, 67, 68, 69, 71, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 96, 99, 101, 110, 111, 123, 135, 137, 138, 139, 161, 162, 182, 200, 202, 256, 311, 389, 427, 440, 443, 445, 465, 500, 514, 515, 520, 546, 548, 554, 555, 559, 623, 625, 631, 777, 800, 801, 811, 819, 832, 873, 960, 964, 990, 993, 1004, 1005, 1021, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1

In the port cell:
0 = no port,
1 = tcp protocol,
2 = udp protocol

## upnp col

Get manufactuer, if possible.

In [10]:
def get_manufacturer(upnp):
    return upnp[0]['manufacturer'] if isinstance(upnp, list) and 'manufacturer' in upnp[0].keys() else "Unknown"

manufacturers = []
for _, row in df_train.iterrows():
    m = get_manufacturer(row['upnp'])
    if m not in manufacturers:
        manufacturers.append(m)
manufacturers.sort()
print(len(manufacturers))
pprint.pprint(manufacturers)

171
[' ',
 '     ',
 'ABUS Security-Center',
 'ACCESS CO., LTD.',
 'ARRIS',
 'AUNA',
 'AXIS',
 'AirBeamTV BV',
 'Amazon',
 'Amlogic Corporation',
 'Arcadyan',
 'Arris Inc.',
 'Azureus Software, Inc.',
 'Belkin International Inc.',
 'Belkin International, Inc.',
 'Bitron Video',
 'BlackBerry',
 'Bose',
 'Bose Corporation',
 'Brickcom Corporation',
 'BridgeCo AG, Switzerland',
 'Broadcom Corporation',
 'CANON INC.',
 'Cabasse',
 'Champ',
 'Connected Object',
 'CyberLink Corporation',
 'D-LINK',
 'D-Link',
 'D-Link Corporation',
 'D-Stream',
 'DENON',
 'DIRECTV',
 'DNT',
 'DUAL',
 'Dell',
 'Dell Electronics',
 'Denon',
 'Dual',
 'ES',
 'EZVIZ',
 'Emby',
 'Freebox',
 'Freebox SAS',
 'Freecom',
 'Freecom Technologies',
 'Frontier Silicon Ltd',
 'Fujitsu Technology Solutions GmbH',
 'Gestetner',
 'Google Inc.',
 'Grundig',
 'HIKVISION',
 'HP',
 'Hama',
 'Hama GmbH & Co KG',
 'Harman Kardon',
 'Hinston Electronics Co., Ltd',
 'Hitachi W100 AllPlay',
 'INSPUR',
 'Integra',
 'Intelbras',
 'JBL'

We will need to put similar name companies into one.

## mdns_services col

In [11]:
services = []
for _, row in df_train.iterrows():
    if not isinstance(row['mdns_services'], list): continue
    for s in row['mdns_services']:
        if s not in services:
            services.append(s)
services.sort()
print(len(services))
pprint.pprint(services)

359
['_13230._tcp.local.',
 '_176562606B454C0._tcp.local.',
 '_1password4._tcp.local.',
 '_501SVC02VC0TPJ1GN._tcp.local.',
 '_AcctsCfgSrvc._tcp.local.',
 '_CGI._tcp.local.',
 '_DistantRouter_asy._tcp.local.',
 '_JB2XiOSApp._tcp.local.',
 '_JBRemoteServer._tcp.local.',
 '_JUDO-dp._tcp.local.',
 '_KeynoteControl._tcp.local.',
 '_LifeLineDevice._tcp.local.',
 '_MacRCServer._tcp.local.',
 '_Naim-Updater._tcp.local.',
 '_OZOmniFocus3._udp.local.',
 '_PRO17MP2W._tcp.local.',
 '_SSC02VC0TPJ1GN._tcp.local.',
 '_SVC02VC0TPJ1GN._tcp.local.',
 '_WECB_AEI._tcp.local.',
 '_acdseems._tcp.local.',
 '_acer._tcp.local.',
 '_acestreamcast._tcp.local.',
 '_aci._tcp.local.',
 '_acp-sync._tcp.local.',
 '_acrobatSRV._tcp.local.',
 '_adb._tcp.local.',
 '_adisk._tcp.local.',
 '_adobe-vc._tcp.local.',
 '_afpovertcp._tcp.1062242935.members.btmm.icloud.com.',
 '_afpovertcp._tcp.1098945996.members.btmm.icloud.com.',
 '_afpovertcp._tcp.local.',
 '_air-keynote._tcp.local.',
 '_airdrop._tcp.local.',
 '_airkan._tcp.l

## dhcp col

In [12]:
dhcps = []
for _, row in df_train.iterrows():
    if not isinstance(row['dhcp'], list) or 'paramlist' not in row['dhcp'][0]: continue
    for p in row['dhcp']:
        for n in p['paramlist'].split(','):
            if n not in dhcps:
                dhcps.append(n)
dhcps = list(map(lambda x: int(x), dhcps))
dhcps.sort()
print(len(dhcps))
pprint.pprint(dhcps)

77
[1,
 2,
 3,
 4,
 5,
 6,
 7,
 9,
 11,
 12,
 13,
 15,
 16,
 17,
 18,
 22,
 23,
 26,
 28,
 29,
 31,
 33,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 54,
 58,
 59,
 60,
 66,
 67,
 69,
 70,
 72,
 78,
 79,
 81,
 88,
 95,
 97,
 100,
 101,
 119,
 120,
 121,
 125,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 142,
 143,
 144,
 150,
 151,
 153,
 154,
 159,
 160,
 161,
 190,
 204,
 249,
 252]


## First half of MAC (HW manufacturer)

In [13]:
def get_mac(mac):
    return mac[:8]

macs = []
for _, row in df_train.iterrows():
    m = get_mac(row['mac'])
    if m not in macs:
        macs.append(m)
macs.sort()
print(len(macs))
pprint.pprint(macs)

1649
['00:00:00',
 '00:00:74',
 '00:00:aa',
 '00:00:f0',
 '00:01:db',
 '00:01:e6',
 '00:02:9b',
 '00:04:00',
 '00:04:4b',
 '00:05:cd',
 '00:06:78',
 '00:07:25',
 '00:08:5d',
 '00:08:9b',
 '00:09:b0',
 '00:0d:4b',
 '00:0d:88',
 '00:0e:2e',
 '00:0e:58',
 '00:0e:7f',
 '00:0f:ea',
 '00:10:19',
 '00:10:75',
 '00:10:c6',
 '00:11:0a',
 '00:11:2f',
 '00:11:32',
 '00:11:f6',
 '00:13:21',
 '00:13:46',
 '00:13:77',
 '00:13:8f',
 '00:14:2a',
 '00:14:38',
 '00:14:51',
 '00:14:78',
 '00:14:85',
 '00:14:c2',
 '00:14:d1',
 '00:14:fd',
 '00:15:5d',
 '00:15:60',
 '00:15:65',
 '00:15:94',
 '00:15:99',
 '00:15:af',
 '00:16:17',
 '00:16:41',
 '00:16:cb',
 '00:16:e6',
 '00:16:ec',
 '00:17:08',
 '00:17:88',
 '00:17:a4',
 '00:17:c8',
 '00:17:f2',
 '00:18:4d',
 '00:18:71',
 '00:18:f3',
 '00:18:fe',
 '00:19:21',
 '00:19:66',
 '00:19:99',
 '00:19:bb',
 '00:19:db',
 '00:19:e0',
 '00:1a:4b',
 '00:1a:4d',
 '00:1a:6b',
 '00:1a:92',
 '00:1b:38',
 '00:1b:63',
 '00:1b:78',
 '00:1b:9e',
 '00:1b:a9',
 '00:1b:b9',
 '00:1b

 'a8:88:08',
 'a8:96:75',
 'a8:9f:ec',
 'a8:a6:48',
 'a8:a7:95',
 'a8:bb:cf',
 'a8:be:27',
 'a8:d3:f7',
 'a8:fa:d8',
 'ac:16:2d',
 'ac:1f:74',
 'ac:22:0b',
 'ac:29:3a',
 'ac:3a:7a',
 'ac:3c:0b',
 'ac:63:be',
 'ac:81:12',
 'ac:84:c6',
 'ac:87:a3',
 'ac:89:95',
 'ac:9b:0a',
 'ac:9e:17',
 'ac:ae:19',
 'ac:af:b9',
 'ac:b5:7d',
 'ac:bc:32',
 'ac:c1:ee',
 'ac:c3:3a',
 'ac:cc:8e',
 'ac:cf:5c',
 'ac:d0:74',
 'ac:d1:b8',
 'ac:e0:10',
 'ac:e2:d3',
 'ac:e4:b5',
 'ae:84:c6',
 'b0:05:94',
 'b0:10:41',
 'b0:19:c6',
 'b0:2a:43',
 'b0:34:95',
 'b0:39:56',
 'b0:48:1a',
 'b0:48:7a',
 'b0:4e:26',
 'b0:52:16',
 'b0:5a:da',
 'b0:6e:bf',
 'b0:70:2d',
 'b0:72:bf',
 'b0:7f:b9',
 'b0:93:5b',
 'b0:98:2b',
 'b0:a7:37',
 'b0:b2:8f',
 'b0:b9:8a',
 'b0:c0:90',
 'b0:ca:68',
 'b0:e2:35',
 'b0:ee:7b',
 'b0:fc:0d',
 'b2:48:1a',
 'b2:4e:26',
 'b2:6e:bf',
 'b2:be:76',
 'b2:c5:54',
 'b4:0e:dc',
 'b4:18:d1',
 'b4:2e:99',
 'b4:74:9f',
 'b4:75:0e',
 'b4:7c:9c',
 'b4:82:fe',
 'b4:99:ba',
 'b4:9c:df',
 'b4:a3:82',
 'b4:b5:2f',

## ssdp col

# Feature preprocessing

Prepare data for ML

In [14]:
le = preprocessing.LabelEncoder()
classes = list(set(df_train['device_class']))
encoded_classes = le.fit_transform(classes)
classes_indexes = dict(zip(classes, encoded_classes))
index_classes = dict(zip(encoded_classes, classes))
pprint.pprint(classes_indexes)

{'AUDIO': 0,
 'GAME_CONSOLE': 1,
 'GENERIC_IOT': 2,
 'HOME_AUTOMATION': 3,
 'IP_PHONE': 4,
 'MEDIA_BOX': 5,
 'MOBILE': 6,
 'NAS': 7,
 'PC': 8,
 'PRINTER': 9,
 'SURVEILLANCE': 10,
 'TV': 11,
 'VOICE_ASSISTANT': 12}


Process input data into df.

## Data to vectors

In [15]:
def protocol2index(protocol):
    if protocol == 'tcp': return 1
    elif protocol == 'udp': return 2
    else: return 0

def get_ports(services):
    return list(map(lambda x: (x['port'], protocol2index(x['protocol'])), services)) if isinstance(services, list) else []

# def get_manufacturer()

def get_services(mdns_services):
    return mdns_services if isinstance(mdns_services, list) else []

def get_dhcps(dhcp):
    return list(map(lambda x: int(x), dhcp[0]['paramlist'].split(','))) if isinstance(dhcp, list) and 'paramlist' in dhcp[0] else []

# def get_mac()

mac2index = dict(zip(macs, le.fit_transform(macs)))

In [25]:
def input2df(input_df, ports, manufacturers, services, macs, dhcps, is_train=True):
    col_names = ports + manufacturers + services + dhcps + ['mac']
    if is_train:
        col_names = col_names + ['CLASS']
    col2index_dict = dict(zip(col_names, range(len(col_names))))
    in_dict = dict((k, []) for k in col_names)

#     c = 0

    for _, row in input_df.iterrows():
#         if(c == 10000): break
        new_row = np.zeros(len(col_names))

        curr_ports = get_ports(row['services'])
        curr_manufacturer = get_manufacturer(row['upnp'])
        curr_services = get_services(row['mdns_services'])
        curr_mac = get_mac(row['mac'])
        curr_dhcp = get_dhcps(row['dhcp'])

        for p, val in curr_ports:
            if p in col2index_dict.keys():
                new_row[col2index_dict[p]] = val
        if curr_manufacturer in col2index_dict.keys():
            new_row[col2index_dict[curr_manufacturer]] = 1
        for s in curr_services:
            if s in col2index_dict.keys():
                new_row[col2index_dict[s]] = 1
        if curr_mac in mac2index.keys():
            new_row[col2index_dict['mac']] = mac2index[curr_mac]
        for p in curr_dhcp:
            if p in col2index_dict.keys():
                new_row[col2index_dict[p]] = 1
        if is_train:
            new_row[-1] = classes_indexes[row['device_class']]

        for n, k in enumerate(in_dict.keys()):
            in_dict[k].append(new_row[n])

#         c += 1

    return pd.DataFrame(in_dict)

In [26]:
get_ports(df_train['services'][0])

[(80, 1), (443, 1), (1434, 2), (5060, 1), (5351, 2), (5353, 2), (5683, 2)]

In [27]:
get_manufacturer(df_train['upnp'][2])

'Denon'

In [28]:
get_services(df_train['mdns_services'][2])

['_airplay._tcp.local.',
 '_spotify-connect._tcp.local.',
 '_raop._tcp.local.',
 '_http._tcp.local.']

In [29]:
get_mac(df_train['mac'][0])

'80:5e:c0'

In [30]:
get_dhcps(df_train['dhcp'][4])

[1, 28, 2, 3, 15, 6, 119, 12, 44, 47, 26, 121, 42]

## Splitting data

In [31]:
df = input2df(df_train, ports, manufacturers, services, macs, dhcps)

# A) Use train data only
X_train, X_test, y_train, y_test = train_test_split(df.drop('CLASS', axis=1), df['CLASS'], test_size=0.2)

# B) Perform regular ML
# X_train = df.drop('CLASS', axis=1)
# y_train = df['CLASS']

# Clasification method

## Method experiments

(In case A)

In [24]:
t_start = time.time()
knn = KNeighborsClassifier(n_neighbors=10, n_jobs=4, p=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.8866666666666667
Time: 11.554763555526733


In [25]:
t_start = time.time()
knn = KNeighborsClassifier(n_neighbors=10, n_jobs=4, p=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.8683333333333333
Time: 12.049591302871704


In [33]:
t_start = time.time()
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)



F1-score: 0.9803142807805215
Time: 7.177955627441406


In [27]:
t_start = time.time()
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)



F1-score: 0.24333333333333335
Time: 411.25162148475647


In [28]:
t_start = time.time()
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.7716666666666666
Time: 2.0739998817443848


In [34]:
t_start = time.time()
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
t_end = time.time()
print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
print("Time:", t_end - t_start)

F1-score: 0.9802279399067518
Time: 138.42386674880981


In [32]:
# for n in range(100, 410, 10):
#     t_start = time.time()
#     random_forest = RandomForestClassifier(n_estimators=n)
#     random_forest.fit(X_train, y_train)
#     y_pred = random_forest.predict(X_test)
#     t_end = time.time()
#     print("n =", n)
#     print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
#     print("Time:", t_end - t_start, "\n")
    
#     t_start = time.time()
#     random_forest = RandomForestClassifier(n_estimators=n, max_features=None)
#     random_forest.fit(X_train, y_train)
#     y_pred = random_forest.predict(X_test)
#     t_end = time.time()
#     print("n =", n)
#     print("F1-score:", metrics.f1_score(y_test, y_pred, average='micro'))
#     print("Time:", t_end - t_start, "\n")

## Chosen method application

(In case B)

In [18]:
classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

# Train data eval

In [19]:
X_test = input2df(df_test, ports, manufacturers, services, is_train=False)

In [20]:
y_pred = classifier.predict(X_test)

In [21]:
output_df = pd.DataFrame({'Id': df_test['device_id'], 'Predicted': list(map(lambda x: index_classes[x], y_pred))})
output_df

Unnamed: 0,Id,Predicted
0,a3529c63-8d5f-4a90-85f0-33a10b9c5ca4,GENERIC_IOT
1,addb3142-6b4a-4aef-9d00-ce7ab250c05c,GENERIC_IOT
2,939102ad-730b-48e5-8f36-438a68192aa9,SURVEILLANCE
3,e8b4c2b1-8820-475d-8f75-0f1dda325539,TV
4,6e7e5101-9f82-45c8-a0f0-17f58f77d82c,HOME_AUTOMATION
5,89197990-6549-4463-89e0-929c3d8c988a,TV
6,655699d3-e035-4dcd-b5be-4d55280ac41b,TV
7,1cbfb119-74b0-488e-99fe-6f4aac53248a,IP_PHONE
8,1a5d561a-2e87-4e0f-8610-b7c39293f169,GAME_CONSOLE
9,9e574f6a-8a96-48e8-b3fd-1da1f0c2965c,GENERIC_IOT


In [22]:
output_df.to_csv('submission.csv', sep=',', index=False)