Anomaly Detector: Reusing the multi-threaded dataset emitter from the previous question, design an algorithm that uses streaming frequency estimation to build a representation of feature distributions. Using this information, you will be able not only to flag incoming records as either normal or anomalous, but also to update your anomaly detector over time so that it adapts to gradual changes in the input streams.

Doing this across all spatial locations in the dataset clearly will not work; data points from Florida would be considered anomalous if they were found in a stream describing Montana. Come up with a strategy to avoid this problem.
Track each feature in the dataset and produce an anomaly score: the more features in a given observation that are considered anomalous, the higher the score. One approach could be 0% = no anomalies, 100% = all features were flagged as anomalous.

In [34]:
import datetime
import math
import geohash
# Per-location running statistics, keyed by the 6-character geohash prefix
# computed in parseLine. Each value is a list laid out as
#   [count, pmw, ps, pt, hum, sd, ts, tt, prec, vs, vis]
# where count is the number of records folded in so far and every other slot
# holds a (mean, M2) running aggregate for one feature.
gh_features_dict = {}

# NOTE(review): not referenced anywhere in the visible code -- confirm whether
# this is still needed.
ano_dict = {}

def update_mean(newValue, count, existingAggregate):
    """Fold one observation into a running (mean, M2) aggregate.

    Args:
        newValue: The observation to add.
        count: Number of observations already folded into the aggregate,
            i.e. the count *before* this call.
        existingAggregate: Pair ``(mean, M2)`` where ``M2`` is the running
            sum of squared deviations from the mean.

    Returns:
        The updated ``(mean, M2)`` pair. Dividing ``M2`` by ``count`` (the
        value passed in, once at least two observations exist) yields the
        sample variance.
    """
    running_mean, sum_sq_dev = existingAggregate
    n = count + 1
    # Deviation measured against the mean before it is updated...
    dev_before = newValue - running_mean
    running_mean = running_mean + dev_before / n
    # ...and against the mean after; their product is the M2 increment.
    dev_after = newValue - running_mean
    return (running_mean, sum_sq_dev + dev_before * dev_after)

def is_anomalous(value, mean, sampleVarience):
    """Tell whether value lies more than three standard deviations from mean.

    Args:
        value: The observation being tested.
        mean: Running mean of the feature.
        sampleVarience: Sample variance of the feature; must be
            non-negative because its square root is taken.

    Returns:
        True when value is strictly above ``mean + 3*sd`` or strictly below
        ``mean - 3*sd``; False otherwise (values exactly on a bound are
        treated as normal).
    """
    three_sigma = 3 * math.sqrt(sampleVarience)
    above_band = value > mean + three_sigma
    below_band = value < mean - three_sigma
    return above_band or below_band
    
    
def parseLine(line):
    """Score one tab-separated record for anomalies, then learn from it.

    The record is assigned to a location bucket (the first six characters of
    the geohash of its latitude/longitude) so that each record is only
    compared against statistics gathered for the same bucket. Every tracked
    feature that falls outside three standard deviations of the bucket's
    running mean adds 10 to the score, giving a 0-100 scale across the ten
    features. Records scoring above 30 are printed. Whatever the score, the
    record is then folded into the bucket's running statistics so the model
    keeps adapting.

    Args:
        line: One raw line of the input file. Header rows (starting with
            ``'1_'``) and blank lines are ignored.

    Returns:
        None. Results are reported by printing and by updating the
        module-level ``gh_features_dict``.

    Raises:
        ValueError: If a latitude, longitude or tracked feature column is not
            numeric (for example the literal ``null``).
        IndexError: If the line has fewer columns than the ones read here.
    """
    # Header rows and empty lines carry no observation.
    if line.startswith('1_') or not line.strip():
        return
    variables = line.split("\t")

    lat = float(variables[1])
    lon = float(variables[2])

    # Columns of the tracked features, in slot order: pressure at maximum
    # wind, surface pressure, tropopause pressure, humidity, snow depth,
    # surface temperature, tropopause temperature, precipitation, vegetation,
    # visibility.
    feature_columns = (5, 6, 7, 8, 9, 10, 11, 13, 14, 15)
    # Slot 0 mirrors the count slot of the stored aggregate so that feature
    # indices line up between the two lists.
    newValues = [1] + [float(variables[col]) for col in feature_columns]
    num_slots = len(newValues)

    gh = geohash.encode(lat, lon)[0:6]

    anomalous_score = 0
    if gh in gh_features_dict:
        features = gh_features_dict[gh]
        cnt = features[0]
        # Wait for a few observations before trusting the variance estimate.
        if cnt > 3:
            # Iterate over every feature slot, including the last one
            # (visibility), which a hard-coded range(1, 10) left out.
            for i in range(1, num_slots):
                (mean, M2) = features[i]
                sampleVariance = M2 / (cnt - 1)
                if is_anomalous(newValues[i], mean, sampleVariance):
                    anomalous_score += 10
    else:
        features = [0] + [(0, 0)] * (num_slots - 1)

    # Learn from the record after scoring it, so a record is never compared
    # against statistics that already include itself.
    cnt = features[0]
    features[0] = cnt + 1
    for i in range(1, num_slots):
        features[i] = update_mean(newValues[i], cnt, features[i])
    gh_features_dict[gh] = features

    if anomalous_score > 30:
        print(line)
    
                

                    

In [35]:
# Feed the sample file through the detector one record at a time. The
# with-block closes the file handle even if a record raises, and iterating
# the file object directly avoids loading every line into memory first.
with open('sample.txt', 'r') as f:
    for line in f:
        parseLine(line)


    

1450299600000	51.12617467345033	-91.65211896209196	25.5	null	33323.61	95117.0	30119.004	85.0	0.24399999	264.51862	221.40625	null	2.125	1.0	1621.8237	0.0475	null

1450299600000	47.06974253786414	-96.99679013665887	67.5	null	13323.611	96636.0	34719.004	88.0	0.12719999	268.76862	226.53125	null	0.125	1.0	6021.8237	0.12	null

1450299600000	37.796356518329965	-103.72296470055775	66.25	null	17923.611	85352.0	29319.004	80.0	0.0448	273.01862	223.03125	null	0.0	7.0	24221.824	0.083749995	null

1450299600000	45.08247255624026	-113.43216886254368	43.0	null	29523.611	77470.0	38519.004	88.0	0.28	268.39362	227.90625	null	2.375	1.0	3221.8237	0.06625	null

1450299600000	43.7406732146168	-102.95182169302863	70.0	null	29723.611	90353.0	27519.004	89.0	0.14999999	271.39362	220.03125	null	0.0	7.0	24221.824	0.0275	null

1450299600000	42.34764178618597	-95.91195493224608	23.5	null	18523.611	95996.0	31519.004	74.0	0.0052	272.39362	223.65625	null	0.0	6.0	24221.824	0.083749995	null

1450299600000	48.1035717889267