In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [52]:
# Load the raw inputs. Missing values in the label, DNS and feature tables are
# replaced by empty strings; the port-scan table is kept exactly as read.
labels_df = pd.read_parquet('./clean_device_names.parquet')
labels_df = labels_df.fillna('')
dns_df = pd.read_csv('./dns.csv')
dns_df = dns_df.fillna('')
feature_df = pd.read_csv('./device.csv')
feature_df = feature_df.fillna('')
port_df = pd.read_csv('./syn_scan_ports.csv')

In [53]:
# make testing csv file
# index=False: feature_df already carries a saved index as its 'Unnamed: 0'
# column, so writing the RangeIndex again would add a second unnamed column and
# the test file would no longer have the same columns as feature_df when re-read.
feature_df.head(1000).to_csv('device_test.csv', index=False)

### Data Analysis
* sample number
* feature analysis
  * how many classes
  * how many samples are not `null`

In [54]:
# Preview the first five feature rows
feature_df.iloc[:5]

Unnamed: 0,device_id,device_name,device_type,device_vendor,device_oui,dhcp_hostname,netdisco_device_info,user_key
0,s2f7a763479,Pixel 2,Phone,Google,HTC Corporation,,{},269aa83b6b844cf11be44d6dbea28868
1,s069dfcecf1,ZP80,Music server,Sonos,"Sonos, Inc.",,{'ssdp_description': 'http://192.168.1.106:140...,121d5ed5e06b42f68bc2e621e76aa1f3
2,s5391507682,camera,Camera,WLAN,B-Link Electronic Limited,,{},121d5ed5e06b42f68bc2e621e76aa1f3
3,scbc253fdbc,DB2BCD,Irrigation controller,Rachio,AzureWave Technology Inc.,,{},121d5ed5e06b42f68bc2e621e76aa1f3
4,s0e81252086,Laserjet P1102W,Printer,HP,"Hon Hai Precision Ind. Co.,Ltd.",,{},121d5ed5e06b42f68bc2e621e76aa1f3


In [55]:
# Row count of the feature table
print('Feature dataframe samples:', feature_df.shape[0])

Feature dataframe samples: 15932


In [56]:
# device_oui: number of samples per distinct OUI string
feature_df['device_oui'].value_counts()

Apple, Inc.                            1463
                                       1417
Google, Inc.                            903
Amazon Technologies Inc.                880
Sonos, Inc.                             515
                                       ... 
Savant Systems LLC                        1
PLUS  Corporation                         1
vivo Mobile Communication Co., Ltd.       1
ConversDigital Co., Ltd.                  1
Universal Remote Control, Inc.            1
Name: device_oui, Length: 592, dtype: int64

In [57]:
# device_vendor: number of samples per vendor label
vendor_col = feature_df['device_vendor']
vendor_col.value_counts()

Apple      1356
Google     1293
Amazon      945
Samsung     524
Sonos       397
           ... 
verizon       1
ASUSTOR       1
sling         1
GAGN          1
Lenox         1
Name: device_vendor, Length: 1730, dtype: int64

In [58]:
# lower-case vendor names so differently-capitalised spellings share one label
# (vectorised string accessor instead of a per-element Python call)
feature_df['device_vendor'] = feature_df['device_vendor'].str.lower()

In [59]:
# Preview the first five rows of the clean label table
labels_df.iloc[:5]

Unnamed: 0,device_id,device_vendor,device_name
0,s67fbda64ef,amazon,echo
1,sf97a17203e,amazon,echo
2,s0c49160109,amazon,echo
3,s7711dea8a0,amazon,echo
4,sc8d6c3e4a7,amazon,echo


In [60]:
# number of samples per vendor in the clean label table
vendor_labels = labels_df['device_vendor']
vendor_labels.value_counts()

google      1871
amazon      1492
sonos        527
philips      434
tplink       415
            ... 
vmware        10
vera          10
trendnet      10
sky           10
sharp         10
Name: device_vendor, Length: 112, dtype: int64

In [61]:
# Row counts of the three tables, printed in a fixed order
for caption, frame in (
    ('Feature dataframe samples:', feature_df),
    ('Label dataframe samples:', labels_df),
    ('DNS dataframe samples:', dns_df),
):
    print(caption, len(frame))

Feature dataframe samples: 15932
Label dataframe samples: 10920
DNS dataframe samples: 412841


The feature dataframe and the labels dataframe do not contain the same number of samples (15932 vs. 10920), so check how many device IDs they actually share.

In [62]:
# Device ids present in BOTH tables. NOTE: despite its name, `union` holds the
# set intersection.
feature_ids = set(feature_df['device_id'])
label_ids = set(labels_df['device_id'])
union = feature_ids.intersection(label_ids)
print('Actual samples in both features and lables df:', len(union))

Actual samples in both features and lables df: 9534


In [63]:
# number of distinct device ids appearing in the DNS table
n_dns_devices = dns_df['device_id'].nunique(dropna=False)
print('Unique dns devices:', n_dns_devices)

Unique dns devices: 37219


In [64]:
# Preview the first five DNS rows (one row per device/hostname pair)
dns_df.iloc[:5]

Unnamed: 0,device_id,hostname
0,sb728a609e3,google.com
1,s929c53aa73,googleusercontent.com
2,sb728a609e3,gstatic.com
3,s929c53aa73,gstatic.com
4,s929c53aa73,google.com


In [65]:
# number of distinct device ids appearing in the port-scan table
port_device_ids = set(port_df['device_id'])
print('Unique port devices:', len(port_device_ids))

Unique port devices: 15304


In [66]:
# Device ids present in all three tables. NOTE: `union` is a set intersection.
id_sets = [set(frame['device_id']) for frame in (feature_df, labels_df, dns_df)]
union = set.intersection(*id_sets)
print('Actual samples in features, label, and dns df:', len(union))

Actual samples in features, label, and dns df: 6442


In [67]:
# valid Netdisco entries in feature
# An entry carries information only if it is neither the empty-dict literal '{}'
# nor the '' placeholder that fillna('') substitutes for missing values when
# feature_df is loaded. The previous `where(...).notnull()` form counted ''
# rows as valid.
netdisco_raw = feature_df['netdisco_device_info']
# Boolean Series aligned with feature_df's index; True = row has Netdisco info
net_info = ~netdisco_raw.isin(['', '{}'])
print('Feature samples with Netdisco info:', net_info.sum())

Feature samples with Netdisco info: 4615


In [68]:
# `net_info` is a boolean Series sharing feature_df's index, so it can be used
# directly as a row mask instead of collecting positions in a Python loop.
net_df = feature_df.loc[net_info]
print('Dataframe with netdisco device info:')
net_df.head(5)

Dataframe with netdisco device info:


Unnamed: 0,device_id,device_name,device_type,device_vendor,device_oui,dhcp_hostname,netdisco_device_info,user_key
1,s069dfcecf1,ZP80,Music server,sonos,"Sonos, Inc.",,{'ssdp_description': 'http://192.168.1.106:140...,121d5ed5e06b42f68bc2e621e76aa1f3
5,s53728fe968,Koogeek Switch,Smart Plug,koogeek,China Dragon Technology Limited,,"{'name': 'Koogeek-P1-77519D', 'hostname': 'hap...",d0e65960db07486ef774ec9de2739470
7,sf7c4b01cd5,TRADFRI gateway,IOT gateway,ikea,"Murata Manufacturing Co., Ltd.",,"{'name': 'TRADFRI gateway', 'hostname': 'TRADF...",d0e65960db07486ef774ec9de2739470
9,sa25da41547,Lyric,Thermostat,honeywell,ADEMCO,,"{'name': 'LyricStatA1F7CD', 'hostname': 'Lyric...",121d5ed5e06b42f68bc2e621e76aa1f3
19,sc4a556b8eb,Apple TV,TV Stick,apple,"Apple, Inc.",,"{'name': 'Apple TV Radcliffe', 'hostname': 'Ap...",fe510c48934c4861b6d1c8442d75cc57


In [69]:
# Labelled devices that also have Netdisco info. NOTE: `union` is a set intersection.
netdisco_ids = set(net_df['device_id'])
labelled_ids = set(labels_df['device_id'])
union = netdisco_ids.intersection(labelled_ids)
print('Actual samples in features, label, with netdisco info:', len(union))

Actual samples in features, label, with netdisco info: 3539


### Data Analysis Conclusion
* Total samples for use: 9534
* Total samples with dns data: 6442
* Total samples with netdisco info: 3539


Create dataframe with actual samples we can use

In [70]:
# Inner join on device_id: keeps only devices present in both tables. Columns
# that exist on both sides get the default suffixes (_x = features, _y = labels).
act_df = feature_df.merge(labels_df, on='device_id', how='inner')
act_df.iloc[:5]

Unnamed: 0,device_id,device_name_x,device_type,device_vendor_x,device_oui,dhcp_hostname,netdisco_device_info,user_key,device_vendor_y,device_name_y
0,scbc253fdbc,DB2BCD,Irrigation controller,rachio,AzureWave Technology Inc.,,{},121d5ed5e06b42f68bc2e621e76aa1f3,rachio,controller
1,s0e81252086,Laserjet P1102W,Printer,hp,"Hon Hai Precision Ind. Co.,Ltd.",,{},121d5ed5e06b42f68bc2e621e76aa1f3,hp,printer
2,s53728fe968,Koogeek Switch,Smart Plug,koogeek,China Dragon Technology Limited,,"{'name': 'Koogeek-P1-77519D', 'hostname': 'hap...",d0e65960db07486ef774ec9de2739470,koogeek,switch
3,s1de01f07f6,Amazon Fire Stick,TV Stick,amazon,Amazon Technologies Inc.,,{},d0e65960db07486ef774ec9de2739470,amazon,fire
4,sf7c4b01cd5,TRADFRI gateway,IOT gateway,ikea,"Murata Manufacturing Co., Ltd.",,"{'name': 'TRADFRI gateway', 'hostname': 'TRADF...",d0e65960db07486ef774ec9de2739470,ikea,tradfri


### Classification with only OUI

In [71]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [72]:
# Features: the raw OUI string of each device; target: the vendor from the
# clean label table (the `_y` column produced by the merge).
X = np.array(act_df['device_oui'])
y = np.array(act_df['device_vendor_y'])
# Fixed seed so the split, and every score derived from it, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# token_pattern='.+' turns each (single-line) OUI string into one token, so the
# vectorizer effectively one-hot encodes the lower-cased OUI.
vectorizer = CountVectorizer(analyzer = 'word', token_pattern='.+')

In [73]:
# NOTE: X_train is replaced by its vectorised form, so this cell must not be
# re-run without re-running the split cell first.
X_train = vectorizer.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted vocabulary
# is indexed in sorted term order, so sorting its keys gives the same list on
# every scikit-learn version.
sorted(vectorizer.vocabulary_)



['abocom',
 'ademco',
 'alertme.com limited',
 'alpha networks inc.',
 'amazon technologies inc.',
 'amcrest technologies',
 'american power conversion corp',
 'ampak technology, inc.',
 'apc by schneider electric',
 'apple, inc.',
 'arcadyan corporation',
 'arcadyan technology corporation',
 'arris group, inc.',
 'aruba networks',
 'asix electronics corp.',
 'askey computer corp',
 'asustek computer inc.',
 'atheros communications, inc.',
 'avm audiovisuelles marketing und computersysteme gmbh',
 'avm gmbh',
 'axis communications ab',
 'azurewave technology inc.',
 'b-link electronic limited',
 'beijing xiaomi electronics co., ltd.',
 'belkin international inc.',
 'bose corporation',
 'broadlink pty ltd',
 'brother industries, ltd.',
 'bskyb ltd',
 'canary connect, inc.',
 'canon inc.',
 'chameleon technology (uk) limited',
 'chicony electronics co., ltd.',
 'china dragon technology limited',
 'cisco meraki',
 'cisco spvtg',
 'cisco systems inc',
 'cisco systems, inc',
 'cisco-linksys

In [74]:
# train: multinomial Naive Bayes on the vectorised OUI features
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB()

In [75]:
# test: vectorise the held-out OUIs with the already-fitted vocabulary, then
# report mean accuracy
X_test = vectorizer.transform(X_test)
test_score = model.score(X_test, y_test)
test_score

0.7713686418458311

In [76]:
# Big Problem. Cannot classify Apple Inc
apple_query = vectorizer.transform(['Apple, Inc.'])
model.predict(apple_query)

array(['amazon'], dtype='<U17')

In [77]:
# Here's Why: compare how often the Apple OUI occurs before and after the
# join with the clean labels
apple_oui = 'Apple, Inc.'
b = feature_df[feature_df['device_oui'] == apple_oui]
print('There are {} devices with OUI "Apple, Inc." in original feature dataframe'.format(len(b)))
b = act_df[act_df['device_oui'] == apple_oui]
print('There are only {} devices with OUI "Apple, Inc." in actual dataframe'.format(len(b)))


There are 1463 devices with OUI "Apple, Inc." in original feature dataframe
There are only 12 devices with OUI "Apple, Inc." in actual dataframe


Alternative: stick to the feature dataframe instead, which has more samples.

In [78]:
# Features: OUI string; target: the (lower-cased) vendor column of the full
# feature table rather than the clean labels.
X = np.array(feature_df['device_oui'])
y = np.array(feature_df['device_vendor'])
# Fixed seed so the split, and the score derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# token_pattern='.+' turns each (single-line) OUI string into one token.
vectorizer = CountVectorizer(analyzer = 'word', token_pattern='.+')
# NOTE: X_train is replaced by its vectorised form; re-run the split cell
# before re-running this one.
X_train = vectorizer.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted vocabulary
# is indexed in sorted term order, so sorting its keys gives the same list on
# every scikit-learn version.
sorted(vectorizer.vocabulary_)



['2n telekomunikace a.s.',
 '2wire inc',
 '7hugs labs',
 'aaeon technology inc.',
 'abocom',
 'actions microelectronics co., ltd',
 'actiontec electronics, inc',
 'ademco',
 'adtran inc',
 'advanced digital broadcast sa',
 'advantech co., ltd.',
 'airties wireless networks',
 'airvana, inc.',
 'alertme.com limited',
 'alpha networks inc.',
 'alpha technologies, inc.',
 'amazon technologies inc.',
 'amcrest technologies',
 'american power conversion corp',
 'ampak technology, inc.',
 'andon health co.,ltd.',
 'apc by schneider electric',
 'apple, inc.',
 'arcadyan corporation',
 'arcadyan technology corporation',
 'arcelik a.s',
 'arris group, inc.',
 'aruba networks',
 'asia pacific microsystems , inc.',
 'asix electronics corp.',
 'askey computer corp',
 'asrock incorporation',
 'asustek computer inc.',
 'atheros communications, inc.',
 'avm audiovisuelles marketing und computersysteme gmbh',
 'avm gmbh',
 'axis communications ab',
 'azurewave technologies (shanghai) inc.',
 'azurewav

In [80]:
# train: multinomial Naive Bayes on the vectorised OUI features
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB()

In [81]:
# test: vectorise the held-out OUIs with the already-fitted vocabulary, then
# report mean accuracy
X_test = vectorizer.transform(X_test)
test_score = model.score(X_test, y_test)
test_score

0.5814245371823031

Terrible accuracy: there are too many classes. Get rid of classes with fewer than 10 samples.

In [82]:
# get rid of classes with less than 10 samples
# build Counter: vendor label -> number of rows carrying that label
from collections import Counter
class_set = Counter(feature_df['device_vendor'].tolist())
def check_value(key):
  """Return how many rows of feature_df carry vendor label `key` (0 if none)."""
  return class_set.get(key, 0)

In [83]:
# get rid of classes with less than 10 samples
# NOTE: this rebinds `act_df`, which previously held the feature/label merge.
vendor_sizes = feature_df['device_vendor'].map(check_value)
keep_rows = vendor_sizes >= 10
act_df = feature_df[keep_rows]

In [84]:
# check if successful: every remaining vendor should have at least 10 rows
kept_vendors = act_df['device_vendor']
kept_vendors.value_counts()

apple                1690
google               1331
amazon               1095
samsung               555
sonos                 490
                     ... 
withings               10
etekcity               10
texas instruments      10
google chromecast      10
rasberry pi            10
Name: device_vendor, Length: 176, dtype: int64

In [85]:
# Preview the first five rows that survived the filter
act_df.iloc[:5]

Unnamed: 0,device_id,device_name,device_type,device_vendor,device_oui,dhcp_hostname,netdisco_device_info,user_key
0,s2f7a763479,Pixel 2,Phone,google,HTC Corporation,,{},269aa83b6b844cf11be44d6dbea28868
1,s069dfcecf1,ZP80,Music server,sonos,"Sonos, Inc.",,{'ssdp_description': 'http://192.168.1.106:140...,121d5ed5e06b42f68bc2e621e76aa1f3
3,scbc253fdbc,DB2BCD,Irrigation controller,rachio,AzureWave Technology Inc.,,{},121d5ed5e06b42f68bc2e621e76aa1f3
4,s0e81252086,Laserjet P1102W,Printer,hp,"Hon Hai Precision Ind. Co.,Ltd.",,{},121d5ed5e06b42f68bc2e621e76aa1f3
5,s53728fe968,Koogeek Switch,Smart Plug,koogeek,China Dragon Technology Limited,,"{'name': 'Koogeek-P1-77519D', 'hostname': 'hap...",d0e65960db07486ef774ec9de2739470


In [86]:
# try again, now using only vendors with at least 10 samples
X = np.array(act_df['device_oui'])
y = np.array(act_df['device_vendor'])
# Fixed seed so the split, and the score derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
# token_pattern='.+' turns each (single-line) OUI string into one token.
vectorizer = CountVectorizer(analyzer = 'word', token_pattern='.+')
# NOTE: X_train is replaced by its vectorised form; re-run the split cell
# before re-running this one.
X_train = vectorizer.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted vocabulary
# is indexed in sorted term order, so sorting its keys gives the same list on
# every scikit-learn version.
sorted(vectorizer.vocabulary_)



['abocom',
 'action star enterprise co., ltd.',
 'ademco',
 'alertme.com limited',
 'alpha networks inc.',
 'amazon technologies inc.',
 'amcrest technologies',
 'american power conversion corp',
 'ampak technology, inc.',
 'apc by schneider electric',
 'apple, inc.',
 'arcadyan corporation',
 'arcadyan technology corporation',
 'arris group, inc.',
 'asix electronics corp.',
 'askey computer corp',
 'asrock incorporation',
 'asustek computer inc.',
 'atheros communications, inc.',
 'avm audiovisuelles marketing und computersysteme gmbh',
 'avm gmbh',
 'axis communications ab',
 'azurewave technology inc.',
 'b-link electronic limited',
 'beijing xiaomi electronics co., ltd.',
 'belkin international inc.',
 'binatone electronics international, ltd',
 'bizlink (kunshan) co.,ltd',
 'bose corporation',
 'broadlink pty ltd',
 'brother industries, ltd.',
 'bskyb ltd',
 'canary connect, inc.',
 'canon inc.',
 'chameleon technology (uk) limited',
 'chicony electronics co., ltd.',
 'china drag

In [88]:
# train: multinomial Naive Bayes on the vectorised OUI features
model = MultinomialNB().fit(X_train, y_train)
# test: vectorise the held-out OUIs with the fitted vocabulary and report
# mean accuracy
X_test = vectorizer.transform(X_test)
test_score = model.score(X_test, y_test)
test_score

0.6956521739130435

In [89]:
from sklearn.metrics import confusion_matrix
from pycm import *

In [100]:
# Build the confusion matrix for the held-out set and save it as an HTML report
y_pred = model.predict(X_test)
cm = ConfusionMatrix(actual_vector=y_test, predict_vector=y_pred, is_imbalanced=True)
cm.save_html('CM_OUI_FEATURE', normalize=True)

{'Status': True,
 'Message': 'c:\\Users\\willi\\Desktop\\iot-inspector\\device-identification\\testing\\CM_OUI_FEATURE.html'}

Danny specifically excluded Apple Inc. labels to focus on IoT devices.


Conclusion: Focus on the 9534 samples within the clean label df

In [101]:
# merge feature and label dfs (inner join: only devices that have a clean label)
act_df = pd.merge(feature_df, labels_df, how='inner', on=['device_id'])
# create X and y: OUI string -> vendor from the clean label table (`_y` column)
X = np.array(act_df['device_oui'])
y = np.array(act_df['device_vendor_y'])
# split (fixed seed so the reported score is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# bag of words: token_pattern='.+' keeps each (single-line) OUI as one token
vectorizer = CountVectorizer(analyzer = 'word', token_pattern='.+')
X_train = vectorizer.fit_transform(X_train)
# train
model = MultinomialNB()
model.fit(X_train, y_train)
# test: reuse the vocabulary fitted on the training split
X_test = vectorizer.transform(X_test)
model.score(X_test, y_test)

0.7829050865233351

In [102]:
# Build the confusion matrix for the held-out set and save it as an HTML report
y_pred = model.predict(X_test)
cm = ConfusionMatrix(actual_vector=y_test, predict_vector=y_pred, is_imbalanced=True)
cm.save_html('CM_OUI_CLEAN', normalize=True)

{'Status': True,
 'Message': 'c:\\Users\\willi\\Desktop\\iot-inspector\\device-identification\\testing\\CM_OUI_CLEAN.html'}

In [110]:
# get accuracy per class
# Rows of the confusion matrix are true classes, so diagonal / row-sum is the
# fraction of each class's test samples that were predicted correctly.
y_pred = model.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
support = matrix.sum(axis=1)
# By default confusion_matrix indexes rows by every label seen in y_test OR in
# the predictions. A label that is only ever predicted has an all-zero row: it
# would yield 0/0 = NaN and make `acc` longer than the list of classes present
# in y_test. Keep only rows with test samples so `acc` has exactly one entry
# per class in y_test, in sorted label order.
has_samples = support > 0
acc = matrix.diagonal()[has_samples] / support[has_samples]

In [113]:
# number of per-class accuracy entries
acc.shape[0]

106

In [121]:
# plot accuracy per class
import plotly.express as px

# x lists the classes present in y_test in sorted order; `acc` is expected to
# hold one value per such class in the same order.
px.bar(
    x=sorted(set(y_test)),
    y=acc,
    labels={'x': 'device vendor', 'y': 'per-class accuracy'},
    title='OUI-only Naive Bayes: accuracy per vendor',
)

### Bag of Words Model
* Create a bag of words model with sklearn