# Fraud Detection  Using Isolation Forest

## Introduction

In this notebook we use the **Isolation Forest** algorithm to get anomaly score of each sample. This algorithm was built based on the fact that anomalies are the data points that are “few and different”. The core idea is that it should be very easy to “isolate” anomalies based on the characteristics that make them unique.

![](https://#STORAGE_ACCOUNT_NAME#.blob.core.windows.net/mlimages/business-analytics-statistics-businessman-with-report-using-magnifying-glass.jpg?sp=r&st=2021-10-22T23:07:34Z&se=2022-12-31T08:07:34Z&spr=https&sv=2020-08-04&sr=b&sig=WqhLvSM8frL7LuWIrLMaM7IA%2BtoU1l%2B95S8LY2qdDgg%3D)

#### &emsp; Loading the dataset from Azure Blob Storage

In [1]:
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from azure.storage.blob import ContainerClient, BlobClient
import pandas as pd
from io import BytesIO
from copy import deepcopy
import logging
from sklearn.ensemble import IsolationForest
import pickle
import GlobalVariables

CONNECTIONSTRING = GlobalVariables.CONNECTIONSTRING_first
CONTAINER_NAME = GlobalVariables.CONTAINER_NAME_first

BLOBNAME = GlobalVariables.BLOBNAME_first
blob = BlobClient.from_connection_string(conn_str=CONNECTIONSTRING, container_name=CONTAINER_NAME, blob_name=BLOBNAME)
blob_data = blob.download_blob()
BytesIO(blob_data.content_as_bytes())
data = pd.read_csv(BytesIO(blob_data.content_as_bytes()))

## VARIABLE CREATION

Area Variables

In [2]:
data_loaded = deepcopy(data)
data_loaded["LOTAREA"]=data_loaded.LTFRONT*data_loaded.LTDEPTH
data_loaded["BLDAREA"]=data_loaded.BLDFRONT*data_loaded.BLDDEPTH
data_loaded["BLDVOL"]=data_loaded.BLDAREA*data_loaded.STORIES

In [3]:
data_loaded["FULLVAL/LOTAREA"]=data_loaded.FULLVAL/data_loaded.LOTAREA
data_loaded["FULLVAL/BLDAREA"]=data_loaded.FULLVAL/data_loaded.BLDAREA
data_loaded["FULLVAL/BLDVOL"]=data_loaded.FULLVAL/data_loaded.BLDVOL
data_loaded["AVLAND/LOTAREA"]=data_loaded.AVLAND/data_loaded.LOTAREA
data_loaded["AVLAND/BLDAREA"]=data_loaded.AVLAND/data_loaded.BLDAREA
data_loaded["AVLAND/BLDVOL"]=data_loaded.AVLAND/data_loaded.BLDVOL
data_loaded["AVTOT/LOTAREA"]=data_loaded.AVTOT/data_loaded.LOTAREA
data_loaded["AVTOT/BLDAREA"]=data_loaded.AVTOT/data_loaded.BLDAREA
data_loaded["AVTOT/BLDVOL"]=data_loaded.AVTOT/data_loaded.BLDVOL

In [4]:
data_loaded.head(5)

Unnamed: 0,BBLE,B,BLOCK,LOT,EASEMENT,OWNER,BLDGCL,TAXCLASS,LTFRONT,LTDEPTH,...,BLDVOL,FULLVAL/LOTAREA,FULLVAL/BLDAREA,FULLVAL/BLDVOL,AVLAND/LOTAREA,AVLAND/BLDAREA,AVLAND/BLDVOL,AVTOT/LOTAREA,AVTOT/BLDAREA,AVTOT/BLDVOL
0,1000010010,1,1,10,,UNITED STATES OF AMER,Y4,4,198,355,...,759000.0,4325.537061,20029.11726,400.582345,1369.026889,6339.189723,126.783794,1946.491677,9013.102767,180.262055
1,1000010101,1,1,101,,U S GOVT LAND & BLDGS,P7,4,500,1046,...,129150.0,40.917782,8284.939992,165.6988,8.07935,1635.888502,32.71777,18.413002,3728.222997,74.56446
2,1000010201,1,1,201,,U S GOVT LAND & BLDGS,Z9,4,27,177,...,419950.0,40552.416824,23074.175497,461.48351,2994.350282,1703.774259,34.075485,18248.587571,10383.378974,207.667579
3,1000020001,1,2,1,,DEPT OF GENERAL SERVI,Y7,4,709,564,...,1199628.0,261.796157,261.796157,87.265386,97.551991,97.551991,32.51733,117.808271,117.808271,39.269424
4,1000020023,1,2,23,,DEPARTMENT OF BUSINES,T2,4,793,551,...,93670.0,89.714219,836.98089,418.490445,34.913021,325.717946,162.858973,40.371399,376.641401,188.3207


In [5]:
data_loaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663276 entries, 0 to 663275
Data columns (total 43 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BBLE             663276 non-null  object 
 1   B                663276 non-null  int64  
 2   BLOCK            663276 non-null  int64  
 3   LOT              663276 non-null  int64  
 4   EASEMENT         2010 non-null    object 
 5   OWNER            638014 non-null  object 
 6   BLDGCL           663276 non-null  object 
 7   TAXCLASS         663276 non-null  object 
 8   LTFRONT          663276 non-null  int64  
 9   LTDEPTH          663276 non-null  int64  
 10  EXT              149682 non-null  object 
 11  STORIES          663276 non-null  float64
 12  FULLVAL          663276 non-null  float64
 13  AVLAND           663276 non-null  float64
 14  AVTOT            663276 non-null  float64
 15  EXLAND           663276 non-null  float64
 16  EXTOT            663276 non-null  floa

In [6]:
n = len(pd.unique(data_loaded['BBLE']))
print(n)

663276


In [7]:
# Creating ZIP3 by taking first 3 digits of ZIP code
d1=data_loaded.ZIP.astype(str)
data_loaded["ZIP3"]=[x[0:3] for x in d1]

In [8]:
# lambda function to create variables by dividing the 9 normalised variables with the grouped averages 
f= lambda x:x/x.mean()

In [9]:
# Fullval & Lotarea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["FULLVAL/LOTAREA_ZIP",
"FULLVAL/LOTAREA_ZIP3",
"FULLVAL/LOTAREA_TAXCLASS",
"FULLVAL/LOTAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["FULLVAL/LOTAREA"].apply(f)

In [10]:

# Fullval & Bldarea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["FULLVAL/BLDAREA_ZIP",
"FULLVAL/BLDAREA_ZIP3",
"FULLVAL/BLDAREA_TAXCLASS",
"FULLVAL/BLDAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["FULLVAL/BLDAREA"].apply(f)


In [11]:

# Fullval & Bldvol
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["FULLVAL/BLDVOL_ZIP",
"FULLVAL/BLDVOL_ZIP3",
"FULLVAL/BLDVOL_TAXCLASS",
"FULLVAL/BLDVOL_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["FULLVAL/BLDVOL"].apply(f)

In [12]:
# Avland & Lotarea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVLAND/LOTAREA_ZIP",
"AVLAND/LOTAREA_ZIP3",
"AVLAND/LOTAREA_TAXCLASS",
"AVLAND/LOTAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVLAND/LOTAREA"].apply(f)

In [13]:
# Avland & Bldarea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVLAND/BLDAREA_ZIP",
"AVLAND/BLDAREA_ZIP3",
"AVLAND/BLDAREA_TAXCLASS",
"AVLAND/BLDAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVLAND/BLDAREA"].apply(f)

In [14]:
# Avland & Bldvol
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVLAND/BLDVOL_ZIP",
"AVLAND/BLDVOL_ZIP3",
"AVLAND/BLDVOL_TAXCLASS",
"AVLAND/BLDVOL_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVLAND/BLDVOL"].apply(f)

In [15]:
# Avtot & LotArea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVTOT/LOTAREA_ZIP",
"AVTOT/LOTAREA_ZIP3",
"AVTOT/LOTAREA_TAXCLASS",
"AVTOT/LOTAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVTOT/LOTAREA"].apply(f)

In [16]:
# Avtot & BldArea
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVTOT/BLDAREA_ZIP",
"AVTOT/BLDAREA_ZIP3",
"AVTOT/BLDAREA_TAXCLASS",
"AVTOT/BLDAREA_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVTOT/BLDAREA"].apply(f)

In [17]:

#  Avtot & BldVol
cols = ["ZIP","ZIP3","TAXCLASS","B"]
var = ["AVTOT/BLDVOL_ZIP",
"AVTOT/BLDVOL_ZIP3",
"AVTOT/BLDVOL_TAXCLASS",
"AVTOT/BLDVOL_B"]
for i,j in zip(cols,var):
    data_loaded[j]=data_loaded.groupby([i])["AVTOT/BLDVOL"].apply(f)


In [18]:
# 9 Variables by dividing with the mean of the whole column and not the group
a=data_loaded["FULLVAL/LOTAREA"].mean()
b=data_loaded["FULLVAL/BLDAREA"].mean()
c=data_loaded["FULLVAL/BLDVOL"].mean()
k=data_loaded["AVLAND/LOTAREA"].mean()
e=data_loaded["AVLAND/BLDAREA"].mean()
f=data_loaded["AVLAND/BLDVOL"].mean()
g=data_loaded["AVTOT/LOTAREA"].mean()
h=data_loaded["AVTOT/BLDAREA"].mean()
i=data_loaded["AVTOT/BLDVOL"].mean()
data_loaded["FULLVAL/LOTAREA_FULL"]=data_loaded["FULLVAL/LOTAREA"]/a
data_loaded["FULLVAL/BLDAREA_FULL"]=data_loaded["FULLVAL/BLDAREA"]/b
data_loaded["FULLVAL/BLDVOL_FULL"]=data_loaded["FULLVAL/BLDVOL"]/c
data_loaded["AVLAND/LOTAREA_FULL"]=data_loaded["AVLAND/LOTAREA"]/k
data_loaded["AVLAND/BLDAREA_FULL"]=data_loaded["AVLAND/BLDAREA"]/e
data_loaded["AVLAND/BLDVOL_FULL"]=data_loaded["AVLAND/BLDVOL"]/f
data_loaded["AVTOT/LOTAREA_FULL"]=data_loaded["AVTOT/LOTAREA"]/g
data_loaded["AVTOT/BLDAREA_FULL"]=data_loaded["AVTOT/BLDAREA"]/h
data_loaded["AVTOT/BLDVOL_FULL"]=data_loaded["AVTOT/BLDVOL"]/i

In [19]:
data_loaded.shape
data_loaded.head(5)

Unnamed: 0,BBLE,B,BLOCK,LOT,EASEMENT,OWNER,BLDGCL,TAXCLASS,LTFRONT,LTDEPTH,...,AVTOT/BLDVOL_B,FULLVAL/LOTAREA_FULL,FULLVAL/BLDAREA_FULL,FULLVAL/BLDVOL_FULL,AVLAND/LOTAREA_FULL,AVLAND/BLDAREA_FULL,AVLAND/BLDVOL_FULL,AVTOT/LOTAREA_FULL,AVTOT/BLDAREA_FULL,AVTOT/BLDVOL_FULL
0,1000010010,1,1,10,,UNITED STATES OF AMER,Y4,4,198,355,...,4.826073,17.322925,32.707568,1.501991,103.53734,123.55736,4.611258,56.113154,99.6815,4.72948
1,1000010101,1,1,101,,U S GOVT LAND & BLDGS,P7,4,500,1046,...,1.99628,0.163868,13.529315,0.621291,0.611028,31.885158,1.189979,0.530807,41.232733,1.956325
2,1000010201,1,1,201,,U S GOVT LAND & BLDGS,Z9,4,27,177,...,5.559789,162.404448,37.680152,1.730341,226.457979,33.208321,1.239361,526.067395,114.836235,5.448511
3,1000020001,1,2,1,,DEPT OF GENERAL SERVI,Y7,4,709,564,...,1.051342,1.048442,0.427513,0.327203,7.377703,1.901389,1.182689,3.396158,1.302915,1.0303
4,1000020023,1,2,23,,DEPARTMENT OF BUSINES,T2,4,793,551,...,5.041824,0.359288,1.366791,1.569138,2.640417,6.348579,5.92335,1.16382,4.165511,4.940913


In [20]:
df_45 = data_loaded.iloc[:,44:85].copy()


In [21]:
or_mydata = deepcopy(data)
or_mydata.shape

no_of_or_data = or_mydata.shape[0]
no_of_or_features = or_mydata.shape[1]
print('No of records is: ',str(no_of_or_data))
print('No of features is ',str(no_of_or_features))

No of records is:  663276
No of features is  31


In [22]:
mydata = deepcopy(df_45)
mydata.shape

no_of_data = mydata.shape[0]
no_of_features = mydata.shape[1]
print('No of data points is: ',str(no_of_data))
print('No of features is ',str(no_of_features))

No of data points is:  663276
No of features is  41


In [23]:
mydata.head(5)

Unnamed: 0,FULLVAL/LOTAREA_ZIP,FULLVAL/LOTAREA_ZIP3,FULLVAL/LOTAREA_TAXCLASS,FULLVAL/LOTAREA_B,FULLVAL/BLDAREA_ZIP,FULLVAL/BLDAREA_ZIP3,FULLVAL/BLDAREA_TAXCLASS,FULLVAL/BLDAREA_B,FULLVAL/BLDVOL_ZIP,FULLVAL/BLDVOL_ZIP3,...,AVTOT/BLDAREA_B,AVTOT/BLDVOL_ZIP,AVTOT/BLDVOL_ZIP3,AVTOT/BLDVOL_TAXCLASS,AVTOT/BLDVOL_B,FULLVAL/LOTAREA_FULL,FULLVAL/BLDAREA_FULL,FULLVAL/BLDVOL_FULL,AVLAND/LOTAREA_FULL,AVLAND/BLDAREA_FULL
0,13.11442,11.759842,16.736777,12.068192,35.442417,34.196628,20.924244,35.158922,2.931751,2.66335,...,58.855833,3.079311,4.592168,0.866274,4.826073,17.322925,32.707568,1.501991,103.53734,123.55736
1,0.124057,0.111243,0.158323,0.11416,14.660571,14.145257,8.655205,14.543305,1.212703,1.101681,...,24.345409,1.273741,1.899527,0.35833,1.99628,0.163868,13.529315,0.621291,0.611028,31.885158
2,122.949228,110.249899,156.909243,113.140713,40.830783,39.395595,24.10539,40.504188,3.377469,3.068263,...,67.803779,3.547463,5.290323,0.997975,5.559789,162.404448,37.680152,1.730341,226.457979,33.208321
3,0.793729,0.711745,1.012966,0.730408,0.46326,0.446977,0.273496,0.459554,0.638671,0.580201,...,0.769292,0.670816,1.000387,0.188715,1.051342,1.048442,0.427513,0.327203,7.377703,1.901389
4,0.272001,0.243906,0.347131,0.250301,1.481075,1.429016,0.874387,1.469228,3.062815,2.782415,...,2.45948,3.216972,4.797462,0.905,5.041824,0.359288,1.366791,1.569138,2.640417,6.348579


In [24]:
npX_train = np.array(df_45.values)
npX_train.shape

(663276, 41)

## Preparing for Dimensionality Reduction

In [25]:
from sklearn import preprocessing
X_norm1 = preprocessing.scale(npX_train)

In [26]:

# verify the features are normalized
npX_train_norm = np.array(X_norm1)
print('Normalized mean is: ',str(np.mean(npX_train_norm,axis=0)))
print('Normalized mean is: ',str(np.isclose(np.mean(npX_train_norm,axis=0),0)-1))
print('Normalized std is: ',str(np.std(npX_train_norm,axis=0)))

Normalized mean is:  [ 9.03074325e-17  1.74830047e-17 -7.04462250e-17 -1.40549646e-17
             nan             nan             nan             nan
             nan             nan             nan             nan
 -3.69290981e-17  1.86292557e-17  6.70181848e-17  7.60596407e-18
             nan             nan             nan             nan
             nan             nan             nan             nan
 -6.59254970e-17 -2.17680549e-17 -4.25505483e-17 -1.00698679e-17
             nan             nan             nan             nan
             nan             nan             nan             nan
 -2.45104870e-17             nan             nan -3.17522218e-17
             nan]
Normalized mean is:  [ 0  0  0  0 -1 -1 -1 -1 -1 -1 -1 -1  0  0  0  0 -1 -1 -1 -1 -1 -1 -1 -1
  0  0  0  0 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1  0 -1]
Normalized std is:  [ 1.  1.  1.  1. nan nan nan nan nan nan nan nan  1.  1.  1.  1. nan nan
 nan nan nan nan nan nan  1.  1.  1.  1. nan nan nan nan nan nan nan na

In [27]:
np.any(np.isfinite(npX_train))
npX_train

array([[1.31144204e+01, 1.17598423e+01, 1.67367767e+01, ...,
        1.50199091e+00, 1.03537340e+02, 1.23557360e+02],
       [1.24056964e-01, 1.11243218e-01, 1.58322949e-01, ...,
        6.21290714e-01, 6.11028464e-01, 3.18851578e+01],
       [1.22949228e+02, 1.10249899e+02, 1.56909243e+02, ...,
        1.73034095e+00, 2.26457979e+02, 3.32083214e+01],
       ...,
       [1.25056224e+00, 1.44034705e+00, 1.19592020e+00, ...,
        8.85773347e-01, 1.66137387e+00, 1.30904297e+00],
       [9.07092715e-01, 1.04475273e+00, 7.39546855e-01, ...,
        1.36598412e+00, 3.94370058e-01, 3.30321728e-01],
       [9.05045102e-01, 1.04239437e+00, 7.37877449e-01, ...,
                   nan, 4.31227431e-01,            nan]])

In [28]:
np.where(np.isnan(npX_train))

(array([663275, 663275, 663275, 663275, 663275, 663275, 663275, 663275,
        663275, 663275, 663275, 663275, 663275, 663275, 663275, 663275,
        663275, 663275, 663275, 663275, 663275, 663275, 663275, 663275,
        663275, 663275, 663275]),
 array([ 4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23, 28,
        29, 30, 31, 32, 33, 34, 35, 37, 38, 40]))

In [29]:
# output the normazized features to dataframe
X_1st_norm_df = pd.DataFrame(npX_train_norm,columns=df_45.columns)
# X_1st_norm_df.to_csv('Normalized_Data_45.csv')
X_1st_norm_df

Unnamed: 0,FULLVAL/LOTAREA_ZIP,FULLVAL/LOTAREA_ZIP3,FULLVAL/LOTAREA_TAXCLASS,FULLVAL/LOTAREA_B,FULLVAL/BLDAREA_ZIP,FULLVAL/BLDAREA_ZIP3,FULLVAL/BLDAREA_TAXCLASS,FULLVAL/BLDAREA_B,FULLVAL/BLDVOL_ZIP,FULLVAL/BLDVOL_ZIP3,...,AVTOT/BLDAREA_B,AVTOT/BLDVOL_ZIP,AVTOT/BLDVOL_ZIP3,AVTOT/BLDVOL_TAXCLASS,AVTOT/BLDVOL_B,FULLVAL/LOTAREA_FULL,FULLVAL/BLDAREA_FULL,FULLVAL/BLDVOL_FULL,AVLAND/LOTAREA_FULL,AVLAND/BLDAREA_FULL
0,3.837426,3.794774,3.987503,3.993607,3.147842,2.031817,0.724495,1.994240,0.141714,0.058001,...,0.894175,0.085987,0.035447,-0.001301,0.037325,5.595244,1.676771,0.015397,6.476209,1.367653
1,-0.277468,-0.313446,-0.213270,-0.319627,1.248499,0.804562,0.278362,0.790675,0.015604,0.003546,...,0.360809,0.011320,0.008876,-0.006241,0.009719,-0.286613,0.662580,-0.011616,-0.024567,0.344656
2,38.629261,38.530183,39.505457,40.462427,3.640308,2.350022,0.840170,2.306304,0.174412,0.072120,...,1.032468,0.105347,0.042336,-0.000020,0.044482,55.326930,1.939733,0.022401,14.239817,0.359422
3,-0.065339,-0.101661,0.003286,-0.097274,-0.049055,-0.033848,-0.026417,-0.031552,-0.026507,-0.014638,...,-0.003566,-0.013613,0.000004,-0.007891,0.000501,0.016605,-0.030274,-0.020636,0.402813,0.010059
4,-0.230605,-0.266659,-0.165429,-0.270505,0.043968,0.026258,-0.004568,0.027394,0.151329,0.062152,...,0.022557,0.091680,0.037472,-0.000924,0.039429,-0.219626,0.019397,0.017457,0.103608,0.059686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663271,-0.023815,0.022984,-0.062334,0.021355,-0.042003,-0.021308,-0.013782,-0.021971,0.003152,0.008018,...,-0.012824,-0.010229,-0.006424,0.006131,-0.005830,-0.029005,-0.011431,0.024531,-0.030610,-0.008047
663272,-0.193745,-0.194925,-0.173157,-0.200331,-0.038504,-0.018482,-0.012182,-0.019392,0.008572,0.011056,...,-0.012089,-0.001542,-0.005463,0.010557,-0.004734,-0.211018,-0.008495,0.028441,-0.044063,-0.006503
663273,0.079369,0.155301,0.049644,0.155965,0.005840,0.017337,-0.000466,0.013297,-0.035704,-0.013762,...,0.032932,0.101714,0.005962,-0.004757,0.008289,0.081515,0.028720,-0.003504,0.041772,0.003449
663274,-0.029430,0.015783,-0.065996,0.014030,-0.016420,-0.000644,-0.002086,-0.003112,-0.015289,-0.002319,...,-0.012382,-0.023181,-0.007857,-0.000468,-0.007463,-0.035020,0.010039,0.011226,-0.038251,-0.007473


In [30]:
np.where(np.isnan(X_1st_norm_df))
AA = X_1st_norm_df.drop(663275)
AA

Unnamed: 0,FULLVAL/LOTAREA_ZIP,FULLVAL/LOTAREA_ZIP3,FULLVAL/LOTAREA_TAXCLASS,FULLVAL/LOTAREA_B,FULLVAL/BLDAREA_ZIP,FULLVAL/BLDAREA_ZIP3,FULLVAL/BLDAREA_TAXCLASS,FULLVAL/BLDAREA_B,FULLVAL/BLDVOL_ZIP,FULLVAL/BLDVOL_ZIP3,...,AVTOT/BLDAREA_B,AVTOT/BLDVOL_ZIP,AVTOT/BLDVOL_ZIP3,AVTOT/BLDVOL_TAXCLASS,AVTOT/BLDVOL_B,FULLVAL/LOTAREA_FULL,FULLVAL/BLDAREA_FULL,FULLVAL/BLDVOL_FULL,AVLAND/LOTAREA_FULL,AVLAND/BLDAREA_FULL
0,3.837426,3.794774,3.987503,3.993607,3.147842,2.031817,0.724495,1.994240,0.141714,0.058001,...,0.894175,0.085987,0.035447,-0.001301,0.037325,5.595244,1.676771,0.015397,6.476209,1.367653
1,-0.277468,-0.313446,-0.213270,-0.319627,1.248499,0.804562,0.278362,0.790675,0.015604,0.003546,...,0.360809,0.011320,0.008876,-0.006241,0.009719,-0.286613,0.662580,-0.011616,-0.024567,0.344656
2,38.629261,38.530183,39.505457,40.462427,3.640308,2.350022,0.840170,2.306304,0.174412,0.072120,...,1.032468,0.105347,0.042336,-0.000020,0.044482,55.326930,1.939733,0.022401,14.239817,0.359422
3,-0.065339,-0.101661,0.003286,-0.097274,-0.049055,-0.033848,-0.026417,-0.031552,-0.026507,-0.014638,...,-0.003566,-0.013613,0.000004,-0.007891,0.000501,0.016605,-0.030274,-0.020636,0.402813,0.010059
4,-0.230605,-0.266659,-0.165429,-0.270505,0.043968,0.026258,-0.004568,0.027394,0.151329,0.062152,...,0.022557,0.091680,0.037472,-0.000924,0.039429,-0.219626,0.019397,0.017457,0.103608,0.059686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663270,-0.023815,0.022984,-0.062334,0.021355,-0.042003,-0.021308,-0.013782,-0.021971,0.003152,0.008018,...,-0.012824,-0.010229,-0.006424,0.006131,-0.005830,-0.029005,-0.011431,0.024531,-0.030610,-0.008047
663271,-0.023815,0.022984,-0.062334,0.021355,-0.042003,-0.021308,-0.013782,-0.021971,0.003152,0.008018,...,-0.012824,-0.010229,-0.006424,0.006131,-0.005830,-0.029005,-0.011431,0.024531,-0.030610,-0.008047
663272,-0.193745,-0.194925,-0.173157,-0.200331,-0.038504,-0.018482,-0.012182,-0.019392,0.008572,0.011056,...,-0.012089,-0.001542,-0.005463,0.010557,-0.004734,-0.211018,-0.008495,0.028441,-0.044063,-0.006503
663273,0.079369,0.155301,0.049644,0.155965,0.005840,0.017337,-0.000466,0.013297,-0.035704,-0.013762,...,0.032932,0.101714,0.005962,-0.004757,0.008289,0.081515,0.028720,-0.003504,0.041772,0.003449


In [31]:
npX_train_norm = np.array(AA)


In [32]:
# Instantiate the model
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, behaviour='deprecated', random_state=None, verbose=0)
# Fit the model
clf.fit(npX_train_norm)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination='auto',
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=None, verbose=0, warm_start=False)

In [33]:
filename = 'anomaly_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [34]:
anomaly_model = pickle.load(open(filename, 'rb'))

In [35]:
# Predict the anomaly usign the model
y_anomaly = anomaly_model.predict(npX_train_norm)
y_scores= anomaly_model.decision_function(npX_train_norm)

In [36]:
# Convert the numpy array into Pandas dataframe
y_anom = pd.DataFrame(data=y_anomaly,columns=['Anomaly'])
y_score = pd.DataFrame(data=y_scores,columns=['Scores'])

# Rename dt_time column name with TimeStamp column name
#pdtrain_data.columns = npX_train_norm.columns.str.replace('dt_truncated','TimeStamp')

pdtrain_data = df_45
# Combine the two dataframes
dataframes = [pdtrain_data,y_anom,y_score]
scored_output = pd.concat(dataframes,axis=1)

# Show the first few rows
#scored_output[["FULLVAL/LOTAREA_TAXCLASS","FULLVAL/LOTAREA_ZIP","AVTOT/BLDVOL_FULL","Anomaly","Scores"]].head(20)
scored_output.head(20)

Unnamed: 0,FULLVAL/LOTAREA_ZIP,FULLVAL/LOTAREA_ZIP3,FULLVAL/LOTAREA_TAXCLASS,FULLVAL/LOTAREA_B,FULLVAL/BLDAREA_ZIP,FULLVAL/BLDAREA_ZIP3,FULLVAL/BLDAREA_TAXCLASS,FULLVAL/BLDAREA_B,FULLVAL/BLDVOL_ZIP,FULLVAL/BLDVOL_ZIP3,...,AVTOT/BLDVOL_ZIP3,AVTOT/BLDVOL_TAXCLASS,AVTOT/BLDVOL_B,FULLVAL/LOTAREA_FULL,FULLVAL/BLDAREA_FULL,FULLVAL/BLDVOL_FULL,AVLAND/LOTAREA_FULL,AVLAND/BLDAREA_FULL,Anomaly,Scores
0,13.11442,11.759842,16.736777,12.068192,35.442417,34.196628,20.924244,35.158922,2.931751,2.66335,...,4.592168,0.866274,4.826073,17.322925,32.707568,1.501991,103.53734,123.55736,-1.0,-0.320728
1,0.124057,0.111243,0.158323,0.11416,14.660571,14.145257,8.655205,14.543305,1.212703,1.101681,...,1.899527,0.35833,1.99628,0.163868,13.529315,0.621291,0.611028,31.885158,-1.0,-0.226139
2,122.949228,110.249899,156.909243,113.140713,40.830783,39.395595,24.10539,40.504188,3.377469,3.068263,...,5.290323,0.997975,5.559789,162.404448,37.680152,1.730341,226.457979,33.208321,-1.0,-0.310794
3,0.793729,0.711745,1.012966,0.730408,0.46326,0.446977,0.273496,0.459554,0.638671,0.580201,...,1.000387,0.188715,1.051342,1.048442,0.427513,0.327203,7.377703,1.901389,1.0,0.07048
4,0.272001,0.243906,0.347131,0.250301,1.481075,1.429016,0.874387,1.469228,3.062815,2.782415,...,4.797462,0.905,5.041824,0.359288,1.366791,1.569138,2.640417,6.348579,-1.0,-0.047104
5,2.02854,1.819014,2.588847,1.86671,94.982654,91.644046,56.07519,94.222911,392.842077,356.877485,...,615.33099,116.076966,646.673259,2.679512,87.653493,201.260376,22.494521,465.087645,-1.0,-0.342673
6,0.85255,0.76449,1.088034,0.784536,61.701227,59.53245,36.426736,61.207694,5.103845,4.63659,...,7.994444,1.508084,8.401646,1.126138,56.940166,2.614796,9.569916,305.829706,-1.0,-0.290038
7,0.386494,0.346573,0.493248,0.35566,49.419173,47.682106,29.175743,49.023881,204.394485,185.682222,...,320.154789,60.394482,336.462073,0.510522,45.605834,104.715134,4.338413,244.952219,-1.0,-0.338124
8,1.329638,1.192301,1.6969,1.223564,5.920253,5.712158,3.495157,5.872898,8.161927,7.414705,...,12.784493,2.411686,13.435679,1.756328,5.463428,4.181508,13.066245,25.689456,-1.0,-0.258452
9,0.079834,0.071588,0.101885,0.073465,0.057463,0.055444,0.033925,0.057004,0.004753,0.004318,...,0.007445,0.001405,0.007825,0.105454,0.053029,0.002435,0.321118,0.102062,1.0,0.13825
