In [41]:
#
# TRAIN
#
#%%writefile anomaly_detection_rf.py

import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import hashlib
import forestci as fci

from sklearn.model_selection import train_test_split
from sklearn.ensemble.forest import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.externals import joblib

PROJECT_PATH = "/mnt/ebs/nfs/home/lucaminello/Projects/edw-monitoring/"


# Load File
def import_data(file_name):
    #data = np.loadtxt(f, delimiter=",")
    data = pd.io.parsers.read_csv(file_name)
    data = data.rename(columns={'dateKey': 'DateKey', 'hourKey': 'TimeKey'})
    data = data.sort_values(['DateKey', 'TimeKey'], ascending= [True,True]).reset_index(drop = True)
    print(len(data))
    data.dropna(inplace = True)
    print(len(data))
    #sel = VarianceThreshold(.001 )
    #sel.fit(data)
    #data = data[sel.get_support(indices=True)]
    
    return(data)


def build_dataset(data, lag, drop_first = 0):
    """
    lag: how many times you want to lag every features
    drop_first: minimum lag 
    """
    # Add new features
    data_shift = data
     
    for i in range((1+drop_first),lag):
        data_shift_t = data.shift(i).drop('TimeKey', axis=1).drop('DateKey', axis=1)
        data_shift_t = data_shift_t.rename(columns = lambda x: x + '__lagged__' + str(i) )
        data_shift = pd.concat([data_shift,data_shift_t], axis =1)

    # Remove first MAX_LAGS observations, because NaNs
    data_shift = data_shift.ix[(lag-1):]
    
    dateKey = data_shift['DateKey'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d00'))
    data_shift['DayOfWeek'] = dateKey.apply(lambda x: x.weekday())
    data_shift['Day'] = dateKey.apply(lambda x: x.day)
    data_shift = data_shift.drop('DateKey', axis=1)
     
    return(data_shift)


def split_dataset(data, target):
    """
        Create train and validation set
        Validation is the last 10% of the data
        Train the first 90%
    """
    
    X = data.drop(target, axis=1)
    y = data[target]

    # Build train set and test set
    size = X.shape[0]
    X_validation = X[(size - round(size/10)):]
    y_validation = y[(size - round(size/10)):]
    #X_train, _, y_train, _ = train_test_split(
    #    X[:(size - round(size/10))], 
    #    y[:(size - round(size/10))], 
    #    test_size=0.33, 
    #    random_state=42)
    X_train = X[:(size - round(size/10))]
    y_train = y[:(size - round(size/10))]
        
    return((X_train, X_validation, y_train, y_validation))


def train(X_train, y_train):
    # Regression Tree
    regr_rf = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=1234, n_jobs = 4)
    regr_rf.fit(X_train, y_train)
    return(regr_rf)


def measure_error(regr_rf, X, y):
    y_pred = regr_rf.predict(X)
    return(mean_absolute_error(y, y_pred))

    
def save_model(model, features, target):
    joblib.dump({'model': model, 'features': features , 'target': target } , 
                    'GAEnquiry/Model_' +  hashlib.md5(target.encode('utf-8')).hexdigest() +'.pkl') 
    
def train_all(dataset, targets, use_present_features = False):

    for target in targets:
        print(target)
        # Build train, validation and test
        X_train, X_validation, y_train, y_validation = split_dataset(dataset, target)
        
        # feature selection
        features = X_train.columns.tolist()
        
        ##
        ## Removed t = 0 cols (current value depends only on past values)
        ##       
        if not use_present_features:
            features = [feature for feature in features if '__lagged__' in feature]
            X_train = X_train[features]
        
        ##
        ## Remove features identical to the target        
        ##        
        
        #features = [feature for feature in features if abs(pearsonr(X_train[feature], y_train)[0]) < 0.95]
        #plot(X_train[target])
        
        X_train = X_train[features]
        
        # Select 100 most important
        sel = SelectKBest(f_regression, k=100)
        sel.fit(X_train,y_train)
        selected_features = [features[i] for i in sel.get_support(indices=True).tolist()]
       
        # Train model
        model =  train(X_train[selected_features], y_train)     
                       
        save_model(model, selected_features, target)
        
        # Measure accuracy
        error = measure_error(model, X_validation[selected_features], y_validation)        
        print("%s: (MAE %f) (Median %f) (STD %f) " % (target, error, np.median(y_validation), np.std(y_validation)))   
        
        gc.collect()
        
    return([])
        

def predict_all(data, targets):
    results = []
    data_features = set(data.columns.tolist())
    
    for target in targets:
        print("Load Model")
        model = load_model(target)

        if model:                        
            # Measure accuracy          
            if set(model['features']) <= data_features :
                x_test = np.asarray(data[model['features']])
                y_test = np.asarray(data[target])
            
                y_predict = model['model'].predict(x_test)      
                        
                error = measure_error(model['model'], x_test, y_test) 
                print(target)
                results += [{'target': target, 
                             'MAE': error['MAE'], 
                             'Median': np.median(data[target]), 
                             'STD': np.std(data[target]), 
                             'y_predict': y_predict,
                             'y_real' : y_test,
                             'err_down': error['err_down'],
                             'err_up': error['err_up']}]
                
                print("%s: (MAE %f) (Median %f) (STD %f) " % (target, error['MAE'], np.median(y_test), np.std(y_test)))    
            else:
                print("Model for %s contains unknown columns" % (target))
        else:
            print("Model for %s not found" % (target))
            
        
        gc.collect()
        
    return(results)
    

In [37]:
### LOAD DATA

gc.enable()
print("Load Data")
dataset = import_data("/mnt/ebs/nfs/home/lucaminello/Projects/edw-monitoring/data/GAEnquiry/carQSEvents_20150101_20170709.csv")

#dataset = dataset[(dataset.DateKey >= 2016110500) & (dataset.DateKey <= 2017060500) ].reset_index(drop = True)

print("Build dataset for regression")
dataset = build_dataset(dataset, lag=49, drop_first=0)

Load Data
20203
20166
Build dataset for regression


In [39]:
dataset

Unnamed: 0,TimeKey,carQsetSessions,page1ToPage2Conv,totPage1Hits,totPage2Hits,regnumBlankPerSession,regnumErrorPerSession,makeSelectedPerSession,makeOtherPerSession,modelSelectedPerSession,...,additionalDriverRemovePerSession__lagged__48,prefTopXPerSession__lagged__48,prefEmailPerSession__lagged__48,prefPhonePerSession__lagged__48,prefTextPerSession__lagged__48,phoneNumberPerSession__lagged__48,phoneNumberBlankPerSession__lagged__48,phoneNumberErrorPerSession__lagged__48,DayOfWeek,Day
48,18,120,0.783333,2421,1430,0.000000,0.000000,0.172414,0.163793,0.103448,...,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,2.000000,0.000000,3,12
49,19,131,0.770992,2298,1548,0.000000,0.000000,0.088000,0.056000,0.072000,...,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,0.000000,0.000000,3,12
50,20,133,0.887218,2867,1802,0.000000,0.000000,0.062016,0.054264,0.046512,...,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,0.000000,0.000000,3,12
51,21,119,0.873950,2352,1308,0.000000,0.000000,0.129310,0.181034,0.129310,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3,12
52,22,84,0.797619,1666,963,0.000000,0.000000,0.256098,0.182927,0.158537,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,3,12
53,23,41,0.731707,660,381,0.000000,0.000000,0.128205,0.102564,0.076923,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.500000,0.000000,3,12
54,0,54,0.814815,932,550,0.000000,0.000000,0.333333,0.254902,0.313725,...,0.000000,0.000000,0.100000,0.000000,0.200000,0.400000,0.000000,0.000000,4,13
55,1,23,0.869565,661,455,0.000000,0.000000,0.608696,0.086957,0.043478,...,0.000000,0.000000,0.200000,0.000000,0.000000,1.000000,0.400000,0.000000,4,13
56,2,11,0.909091,300,175,0.000000,0.000000,0.363636,0.363636,0.363636,...,0.000000,0.000000,0.000000,0.000000,0.500000,0.500000,0.000000,0.000000,4,13
57,3,11,0.818182,171,143,0.000000,0.000000,0.545455,0.181818,0.181818,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4,13


In [None]:
### BUILD MODEL

target_columns = [col for col in dataset.columns if col not in ('DateKey','TimeKey')]
#target_columns = target_columns[0:50]
#print(len(target_columns))

print("Train all targets")
models = train_all(dataset, target_columns, True) # target_columns)


In [42]:
### PREDICT

#targets = [col for col in dataset.columns if col not in ('DateKey','TimeKey')]

print("Predict All")
results = predict_all(dataset, ["claimSavePerSession"]) #targets)

# Build panda data frame
df_results = pd.DataFrame(results)


Predict All
Load Model
claimSavePerSession


IndexError: invalid index to scalar variable.

In [None]:
#dataset = import_data(PROJECT_PATH + "data/FctVehicleEnquiry/FctVehicleEnquiry.csv")

#dataset = dataset[dataset.DateKey >= 2017020500 ]
#features = [feature for feature in features if abs(pearsonr(X_train[feature], y_train)[0]) < 0.95]
dataset = import_data(PROJECT_PATH + "data/FctVehicleEnquiry/FctVehicleEnquiry.csv")
#dataset.describe()#
dataset[(dataset.DateKey >= 2016110500) & (dataset.DateKey <= 2017030500) ].reset_index(0)

In [23]:
#
# PREDICT
#
#%%writefile anomaly_detection_rf_predict.py

import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os.path
import hashlib

from sklearn.model_selection import train_test_split
from sklearn.ensemble.forest import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

from sklearn.externals import joblib

PROJECT_PATH = "/mnt/ebs/nfs/home/lucaminello/Projects/edw-monitoring/"

# Load File
def import_data(file_name):
    data = pd.io.parsers.read_csv(file_name)
    data = data.sort_values(['DateKey', 'TimeKey'], ascending= [True,True]).reset_index(drop = True)        
    return(data)
    #return(data.ix[6000:].reset_index(drop = True))


def load_model(target):
    if os.path.isfile('GAEnquiry/Model_' +  hashlib.md5(target.encode('utf-8')).hexdigest() +'.pkl'):
        model = joblib.load('GAEnquiry/Model_' +  hashlib.md5(target.encode('utf-8')).hexdigest() +'.pkl')
        return(model)
    else:
        return({})
    

def measure_error(regr_rf, X, y):
    def pred_ints(model, X, percentile=95):
        err_down = []
        err_up = []
        for x in range(len(X)):        
            preds = []
            for pred in model.estimators_:
                preds.append(pred.predict(X[x].reshape(1, -1))[0])
            err_down.append(np.percentile(preds, (100 - percentile) / 2. ))
            err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
        return err_down, err_up

    y_pred = regr_rf.predict(X)
    
    err_down, err_up = pred_ints(regr_rf, X, percentile=95)

    
    return({'MAE' : mean_absolute_error(y, y_pred), 'err_down' : err_down, 'err_up' : err_up})


def build_dataset(data, lag, drop_first = 0):
    # Add new features
    data_shift = data
     
    for i in range((1+drop_first),lag):
        data_shift_t = data.shift(i).drop('TimeKey', axis=1).drop('DateKey', axis=1)
        data_shift_t = data_shift_t.rename(columns = lambda x: x + '__lagged__' + str(i) )
        data_shift = pd.concat([data_shift,data_shift_t], axis =1)

    # Remove first MAX_LAGS observations, because NaNs
    data_shift = data_shift.ix[(lag-1):]
    
    dateKey = data_shift['DateKey'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d00'))
    data_shift['DayOfWeek'] = dateKey.apply(lambda x: x.weekday())
    data_shift['Day'] = dateKey.apply(lambda x: x.day)
    #data_shift = data_shift.drop('DateKey', axis=1)
     
    return(data_shift)



gc.enable()
print("Load Data")
#dataset = import_data(PROJECT_PATH + "data/FctVehicleEnquiry/FctVehicleEnquiry.csv")
targets = [col for col in dataset.columns if col not in ('DateKey','TimeKey')]

print("Build dataset for regression")
dataset = build_dataset(dataset, lag=49, drop_first=0)

print("Predict All")
results = predict_all(dataset, ["claimSavePerSession"]) #targets)

# Build panda data frame
df_results = pd.DataFrame(results)

Load Data
Build dataset for regression
Predict All
Load Model
claimSavePerSession: (MAE 0.012236) (Median 0.098039) (STD 0.077954) 


In [35]:
dataset

Unnamed: 0,DateKey,TimeKey,carQsetSessions,page1ToPage2Conv,totPage1Hits,totPage2Hits,regnumBlankPerSession,regnumErrorPerSession,makeSelectedPerSession,makeOtherPerSession,...,additionalDriversPerSession,additionalDriverAddPerSession,additionalDriverRemovePerSession,prefTopXPerSession,prefEmailPerSession,prefPhonePerSession,prefTextPerSession,phoneNumberPerSession,phoneNumberBlankPerSession,phoneNumberErrorPerSession
0,2015030500,12,2,0.500000,121,8,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,2.000000,0.000000
1,2015030500,14,1,1.000000,26,12,0.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,0.000000,0.000000
2,2015030500,15,1,1.000000,59,88,0.000000,0.000000,0.000000,0.000000,...,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,0.000000,0.000000
3,2015030500,18,1,1.000000,13,6,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2015030500,19,1,1.000000,23,10,0.000000,0.000000,0.000000,0.000000,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
5,2015030500,21,2,1.000000,50,26,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.500000,0.000000
6,2015031000,0,14,0.714286,198,147,0.000000,0.000000,0.214286,0.142857,...,0.300000,0.000000,0.000000,0.000000,0.100000,0.000000,0.200000,0.400000,0.000000,0.000000
7,2015031000,1,6,0.833333,117,92,0.000000,0.000000,0.000000,0.000000,...,0.400000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,1.000000,0.400000,0.000000
8,2015031000,2,2,1.000000,55,51,0.000000,0.000000,5.500000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.500000,0.000000,0.000000
9,2015031000,3,2,0.500000,25,5,0.000000,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
target_name = "NA_FctVehicleEnquiry_TotalNumberOfClaims_CATEGORICAL_0"
model = load_model(target_name)
#selected_features = [feature for feature in  dataset.columns.tolist() if feature.startswith("NA_FctHomeEnquiry_EnquiryKey_COUNT")] 
print("Top features for %s "  % (target_name))
get_top_features(model['model'], model["features"], dataset, target_name, n = 10) 

def find_anomaly(results, i):
    delta = abs(results.iloc[i].y_real - results.iloc[i].y_predict)
    anomaly_position = (((results.iloc[i].y_real - np.asarray(results.iloc[i].err_up)) > 0) + \
      ((np.asarray(results.iloc[i].err_down) - results.iloc[i].y_real) > 0))    
    
    anomaly_strenth = delta#/max(results.iloc[i].y_real) * anomaly_position
    
    return(anomaly_strenth)

anomaly_pos = 224
i=0
#plt.plot(np.asarray(dataset.iloc[(anomaly_pos-24*1): (anomaly_pos + 24*1)][target_name]))
#err = find_anomaly(df_results,i)
#plt.plot(np.asarray(df_results.iloc[i].y_real[(anomaly_pos-24*1): (anomaly_pos + 24*1)]),linewidth=3, linestyle="--")
#plt.plot(np.asarray(df_results.iloc[i].y_predict[(anomaly_pos-24*1): (anomaly_pos + 24*1)]), linewidth=2)
#plt.plot(np.asarray(df_results.iloc[i].err_down[(anomaly_pos-24*1): (anomaly_pos + 24*2)]), linestyle=":")
#plt.plot(np.asarray(df_results.iloc[i].err_up[(anomaly_pos-24*1): (anomaly_pos + 24*2)]), linestyle=":")



In [31]:
#
# ANALYSE
#
from scipy.stats import pearsonr
import datetime
import forestci as fci
import hashlib

#target_name = "NA_FctHomeEnquiry_EnquiryKey_COUNT" 
#model = load_model(target_name)
# calculate inbag and unbiased variance

def load_metric(file_path, file_name):
    return(pd.read_csv(file_path + "/" + file_name))

def save_anomaly(anomaly_data, metric_data, target_name):
#c("anomaly_sha1","channel_name","schema_name","table_name",
#                        "col_name","anomaly_col_name", "anomaly_start", "anomaly_end",
#                        "anomaly_type", "anomaly_error", "anomaly_description")

    channel_name = "FctHomeEnquiry"
    target_name_split = target_name.split("_")
    
    short_anomaly_col_name = "_".join(target_name_split[0:4])
    anomaly_dtls = metric_data[metric_data["FEATURE_NAME"] == short_anomaly_col_name]
    
    schema_name =  anomaly_dtls["TABLE_SCHEM"].iat[0]
    table_name =  anomaly_dtls["TABLE_NAME"].iat[0]
    col_name =  anomaly_dtls["COLUMN_NAME"].iat[0]
    anomaly_col_name = target_name

    datetime_str = str(int(anomaly_data["DateKey"])+int(anomaly_data["TimeKey"]))
    

    anomaly_start = datetime.strptime(datetime_str, '%Y%m%d%H').strftime("%Y-%m-%dT%H:%M:%SZ")
    anomaly_end = datetime.strptime(datetime_str, '%Y%m%d%H').strftime("%Y-%m-%dT%H:%M:%SZ")
    anomaly_type = "[Regression]"
    anomaly_error = "1"
    anomaly_description = "Something bad"
    
    anomaly_sha1 = hashlib.sha1("".join([channel_name, target_name, anomaly_start, anomaly_type, anomaly_error]).encode('utf-8')).hexdigest()
    
    return([anomaly_sha1, channel_name,schema_name,table_name,col_name,anomaly_col_name,anomaly_start,anomaly_end,
                        anomaly_type, anomaly_error, anomaly_description])


def get_top_features(regr_rf, columns, data, target, n=100):
    importances = regr_rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    print("")
    for f in range(n):
        p = pearsonr(data[columns[indices[f]]], data[target])
        print("%d. %s (Imp: %f) (corr: %.3f - p: %.6f)" % (f + 1, columns[indices[f]], importances[indices[f]],round(p[0],3), round(p[1],6)))
        

def find_anomaly(results, i):
    delta = abs(results.iloc[i].y_real - results.iloc[i].y_predict)
    anomaly_position = (((results.iloc[i].y_real - np.asarray(results.iloc[i].err_up)) > 0) + \
      ((np.asarray(results.iloc[i].err_down) - results.iloc[i].y_real) > 0))    
    
    anomaly_strenth = delta/max(results.iloc[i].y_real) * anomaly_position
    
    return(anomaly_strenth)
    
#dataset = import_data(PROJECT_PATH + "data/FctHomeEnquiry/FctHomeEnquiry.csv")
#targets = [col for col in dataset.columns if col not in ('DateKey','TimeKey')]

#print("Build dataset for regression")
#dataset = build_dataset(dataset, lag=49, drop_first=0)

# Enable this if you want to save the anomalies
# metric_data = load_metric("../../config/FctHomeEnquiry", "FctHomeEnquiry_metric.csv")
# fd = open('../anomalies.csv','a')
#######

i = 0

#for i in range(len(df_results)):
err = find_anomaly(df_results, i)
target_name = df_results.iloc[i].target

#    for anomaly in dataset.iloc[np.where(err > 0.5)].iterrows():
#        print(str(i) + " " + target_name)
#        anomaly_to_save = save_anomaly(anomaly[1], metric_data, target_name)
#        fd.write(",".join(anomaly_to_save))
#        fd.write("\n")

#fd.close()    
    
    
model = load_model(target_name)
#selected_features = [feature for feature in  dataset.columns.tolist() if feature.startswith("NA_FctHomeEnquiry_EnquiryKey_COUNT")] 
print("Top features for %s "  % (target_name))
get_top_features(model['model'], model["features"], dataset, target_name, n = 10) 

anomaly_pos = 500
#plt.plot(np.asarray(dataset.iloc[(anomaly_pos-24*1): (anomaly_pos + 24*1)][target_name]))
timeline = dataset.DateKey[(anomaly_pos-24*1): (anomaly_pos + 24*1)] + dataset.TimeKey[(anomaly_pos-24*1): (anomaly_pos + 24*1)]
timeline = timeline.astype(str)

timeline = timeline.apply(lambda x: datetime.strptime(str(x), '%Y%m%d%H')).tolist()

plt.plot(timeline,np.asarray(df_results.iloc[i].y_real[(anomaly_pos-24*1): (anomaly_pos + 24*1)]),linewidth=3, linestyle="--")
plt.plot(timeline, np.asarray(df_results.iloc[i].y_predict[(anomaly_pos-24*1): (anomaly_pos + 24*1)]), linewidth=2)
#plt.plot(np.asarray(df_results.iloc[i].err_down[(anomaly_pos-24*1): (anomaly_pos + 24*2)]), linestyle=":")
#plt.plot(np.asarray(df_results.iloc[i].err_up[(anomaly_pos-24*1): (anomaly_pos + 24*2)]), linestyle=":")
#plt.plot(timeline, np.asarray(err[(anomaly_pos-24*1): (anomaly_pos + 24*10)]))
plt.show()



Top features for claimSavePerSession 

1. claimNCBaffectedPerSession (Imp: 0.925501) (corr: 0.671 - p: 0.000000)
2. claimAccidentFaultPerSession (Imp: 0.010182) (corr: 0.650 - p: 0.000000)
3. claimTypePerSession (Imp: 0.010114) (corr: 0.674 - p: 0.000000)
4. claimAddPerSession (Imp: 0.004828) (corr: 0.412 - p: 0.000000)
5. claimDateErrorPerSession (Imp: 0.002981) (corr: 0.036 - p: 0.000000)
6. claimsExistInteractionPerSession (Imp: 0.001944) (corr: 0.397 - p: 0.000000)


KeyError: 'dlYearsPerSession__lagged__23'

In [None]:
from scipy.stats import pearsonr
import forestci as fci
import hashlib

#target_name = "NA_FctHomeEnquiry_EnquiryKey_COUNT" 
#model = load_model(target_name)
# calculate inbag and unbiased variance
def load_metric(file_path, file_name):
    return(pd.read_csv(file_path + "/" + file_name))


metric_data = load_metric("../../config/FctHomeEnquiry", "FctHomeEnquiry_metric.csv")
#dataset.head()
#print(type(str(120)))
dataset.iloc[np.where(err > 0.5)][0:1]
fd = open('../anomalies.csv','a')

for anomaly in dataset.iloc[np.where(err > 0.5)].iterrows():
    anomaly_to_save = save_anomaly(anomaly[1], metric_data, target_name)
    fd.write(",".join(anomaly_to_save))
    fd.write("\n")
    
fd.close()    

In [None]:
print(target_name)
metric_data[[x.startswith("NA_FctHomeEnquiry_Nu") for x in metric_data["FEATURE_NAME"]]]

In [26]:
%pylab inline
pylab.rcParams['figure.figsize'] = (16, 10)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [None]:
list(dataset.columns)

In [None]:
def pred_ints(model, X, percentile=99.999):
    err_down = []
    err_up = []
    for x in range(len(X)):        
        preds = []
        for pred in model.estimators_:
            preds.append(pred.predict(X.iloc[x:(x+1)])[0])
        err_down.append(np.percentile(preds, (100 - percentile) / 2. ))
        err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
    return err_down, err_up


err_down, err_up = pred_ints(model['model'],dataset[model['features']].iloc[0:200], percentile=99)
#print(err_down)
#print(err_up)

correct  = 0 
for i, val in enumerate(np.asarray(dataset[target_name].iloc[0:200])):
    if err_down[i] <= val <= err_up[i]:
        correct += 1
        
print(correct)
print(correct/200.0)

plt.plot(err_up, linestyle=":")
plt.plot(err_down,linestyle = ":")
plt.plot(np.asarray(dataset[target_name].iloc[0:200]))

plt.show()

dataset.head()


In [None]:
print(dataset[target_name].iloc[0:1])
model['model'].estimators_[0].predict(dataset[model['features']].iloc[0:1])[0]

In [None]:
# Check Univariate correlation on feature importance and target



target_name = "NA_FctHomeEnquiry_EnquiryKey_COUNT"
model = load_model(target_name)

plt.plot(dataset["PropertyProfileKey_DimPropertyProfile_NumberOfOtherRoomsBand_CATEGORICAL_2"][1:100])
plt.plot(dataset["NA_FctHomeEnquiry_NumberOfOtherRooms_CATEGORICAL_2"][1:100])
plt.show()

#print(model['features'])
#print(dataset.columns.tolist())