Import necessary libraries; DO NOT MODIFY

In [1]:
import pandas as pd
import hashlib

# Configuration
Change the values of these variables in order to configure this notebook to properly sanitize your wifi data. This script expects that your column labels will include the labels "SSID" and "MAC" in all caps. If your data does not contain these columns labeled in this manner, you will have to modify the code further down below. 

- `filename` - string describing the full path or relative path to your input data
- `columns_labeled` - boolean describing if your file already contains header values for your table
- `column_labels` - list of strings with your column labels; if your data already contains headers, set this to an empty list
- `key_file` - string describing the full or relative path to your file containing your salt value. Make sure this file does not get uploaded to your public Git repositories
- `output_name` - string giving the file name for the sanitized output; the file is written inside the `../data/` directory, so any path given here is interpreted relative to that directory

In [6]:
# Path (absolute or relative) to the raw WiFi capture log.
filename = '../../wap-20190612.log'
# False means the input file is read without a header row of its own.
columns_labeled = False
# Column names in upper case; "SSID" and "MAC" must be among them.
column_labels = [
    'MAC',
    'SSID',
    'RSSI',
    'CHANNEL',
    'SENSOR',
    'EPOCH',
]
# File holding the salt value that is mixed into every hash.
key_file = '../../key_file.txt'
# File name of the sanitized CSV produced by this notebook.
output_name = "clean_wap_log_20190612.csv"

# Data Loading
The following three cells will
- Load your secret key from a file
- Load the OUI data
- Load the WiFi data provided

In [7]:
# Read the salt from key_file. The context manager closes the file handle as
# soon as the read finishes instead of leaving it open for the kernel's lifetime.
# The content is kept verbatim (including any trailing newline) so the text
# fed into the hash is exactly what the file contains.
with open(key_file, 'r') as key_handle:
    secret_key = key_handle.read()

In [8]:
# Tab-separated OUI table with no header row; the column names are supplied here.
oui_df = pd.read_csv(
    '../data/oui.txt',
    sep='\t',
    header=None,
    names=['OUI', 'CO', 'COMPANY'],
)
# Lookup from each value in the OUI column to the matching value in the CO column.
oui_dict = dict(zip(oui_df['OUI'], oui_df['CO']))

In [9]:
if columns_labeled:
    # The file supplies a header row; read it as-is.
    df = pd.read_csv(filename, float_precision='high')
    if len(column_labels) > 0:
        # Configured labels, when given, replace the names read from the file.
        df.columns = column_labels
else:
    # No header row in the file: every line is data and names come from the config.
    df = pd.read_csv(
        filename,
        names=column_labels,
        header=None,
        float_precision='high',
    )
df.head()

Unnamed: 0,MAC,SSID,RSSI,CHANNEL,SENSOR,EPOCH
0,b0:b9:8a:cf:d8:28,NETGEAR36,0,1,Pi3B,1559696000.0
1,4c:01:43:0a:09:e5,,0,1,Pi3B,1559696000.0
2,94:8f:cf:0c:97:50,ATTXVFJQXi,0,1,Pi3B,1559696000.0
3,0c:ea:c9:a8:6e:b0,ATTsBATbaa,0,1,Pi3B,1559696000.0
4,88:96:4e:4e:bb:70,ATTDPjigDS,0,1,Pi3B,1559696000.0


# Enrichment
The following cell will enrich your data with
- The OUI of the observed WAP
- Company names based on the OUI
- A unique hash allowing you to anonymize the data

In [10]:
# OUI: the first three colon-separated octets of the MAC, upper-cased.
df['OUI'] = [':'.join(mac.split(':')[:3]).upper() for mac in df['MAC']]
# HASH: MD5 hex digest of MAC + SSID + secret key, encoded as UTF-8.
# str(ssid) renders a missing SSID (NaN) as the text 'nan'.
# NOTE(review): MD5 over a small input space such as MAC addresses is only as
# strong as the secrecy of the key; switching to a keyed construction would
# change every hash value, so confirm with consumers before altering it.
df['HASH'] = [
    hashlib.md5((mac + str(ssid) + secret_key).encode('UTF-8')).hexdigest()
    for mac, ssid in zip(df['MAC'], df['SSID'])
]
# CO: value from the OUI lookup; OUIs absent from the lookup get 'UNKNOWN'.
# dict.get does the membership check and the fetch in a single lookup.
df['CO'] = [oui_dict.get(oui, 'UNKNOWN') for oui in df['OUI']]
df.head()

Unnamed: 0,MAC,SSID,RSSI,CHANNEL,SENSOR,EPOCH,OUI,HASH,CO
0,b0:b9:8a:cf:d8:28,NETGEAR36,0,1,Pi3B,1559696000.0,B0:B9:8A,0af6b9c7ccb0ca7105303b3cad7ec22f,Netgear
1,4c:01:43:0a:09:e5,,0,1,Pi3B,1559696000.0,4C:01:43,172b5824dbe65036406783398672bf7a,Eero
2,94:8f:cf:0c:97:50,ATTXVFJQXi,0,1,Pi3B,1559696000.0,94:8F:CF,ac31484895c5705d38fc292605778dbc,ArrisGro
3,0c:ea:c9:a8:6e:b0,ATTsBATbaa,0,1,Pi3B,1559696000.0,0C:EA:C9,eab56c71ae77fae1653f486aa3b3a93b,ArrisGro
4,88:96:4e:4e:bb:70,ATTDPjigDS,0,1,Pi3B,1559696000.0,88:96:4E,f4b963655e9bf59f89dcc09396468861,ArrisGro


# Clean and Output
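The next two cells drop the individually identifying columns (`MAC` and `SSID`) and write the cleaned dataframe to your output file. The final cell additionally saves the full, unsanitized dataframe (still containing `MAC` and `SSID`) to a separate local file; keep that file out of public repositories.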

In [11]:
# Build a copy without the directly identifying columns; df itself keeps them.
clean = df.drop(['MAC', 'SSID'], axis='columns')
clean.head()

Unnamed: 0,RSSI,CHANNEL,SENSOR,EPOCH,OUI,HASH,CO
0,0,1,Pi3B,1559696000.0,B0:B9:8A,0af6b9c7ccb0ca7105303b3cad7ec22f,Netgear
1,0,1,Pi3B,1559696000.0,4C:01:43,172b5824dbe65036406783398672bf7a,Eero
2,0,1,Pi3B,1559696000.0,94:8F:CF,ac31484895c5705d38fc292605778dbc,ArrisGro
3,0,1,Pi3B,1559696000.0,0C:EA:C9,eab56c71ae77fae1653f486aa3b3a93b,ArrisGro
4,0,1,Pi3B,1559696000.0,88:96:4E,f4b963655e9bf59f89dcc09396468861,ArrisGro


In [12]:
# Save the sanitized frame as CSV inside ../data/ under the configured file name.
clean.to_csv(f"../data/{output_name}")

In [13]:
# Save the enriched frame that still contains the MAC and SSID columns.
# This file holds identifying data and is written two directories above the notebook.
df.to_csv(path_or_buf="../../wap_20190612_full.csv")