In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ANOMALY DETECTION PART 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [1]:
#=================================================-
#### Slide 26: Loading packages  ####

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest




In [None]:
#=================================================-
#### Slide 27: Directory settings  ####

# Set 'data_dir' to the location of the project's data folder.
# The data folder sits next to the directory the notebook runs from; this is
# the same location the dataset-loading cell reads 'paysim_transactions.csv' from.
from pathlib import Path
data_dir = Path.cwd().parent / "data"




In [28]:
#=================================================-
#### Slide 28: Load the dataset  ####

from pathlib import Path

# Data folder lives one level above the working directory.
data_dir = Path.cwd().parent.joinpath("data")

# Read the transactions CSV into a DataFrame and preview the first rows.
paysim_csv = data_dir / "paysim_transactions.csv"
paysim = pd.read_csv(paysim_csv)
paysim.head()




Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,308,CASH_OUT,94270.99,C473084216,0.0,0.0,C1130422007,392411.08,486682.07,0,0
1,215,TRANSFER,1068883.0,C116497934,227.3,0.0,C321541184,4096905.34,5165788.35,0,0
2,326,TRANSFER,2485281.21,C2089305953,54940.0,0.0,C1473680645,177829.59,2663110.8,0,0
3,371,PAYMENT,2243.36,C1200351076,0.0,0.0,M1583169191,0.0,0.0,0,0
4,283,PAYMENT,5845.82,C1501258365,0.0,0.0,M1431167213,0.0,0.0,0,0


In [29]:
#=================================================-
#### Slide 29: Prepare the dataset for modeling  ####

# Remove the columns that are not used as model inputs, then list what remains.
# NOTE: this cell overwrites 'paysim'; re-run the loading cell before re-running it.
unused_columns = ['step', 'type', 'nameOrig', 'nameDest', 'isFlaggedFraud']
paysim = paysim.drop(columns=unused_columns)
paysim.columns




Index(['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud'],
      dtype='object')

In [37]:
#================================================= -
#### Slide 30: LOF on fraud dataset  ####

# Hold out 30% of the rows for testing (fixed seed so the split is repeatable).
train, test = train_test_split(paysim, test_size=0.30, random_state=42)

# Separate the training rows by label.
fraud = train.loc[train['isFraud'] == 1]
non_fraud = train.loc[train['isFraud'] == 0].drop(columns='isFraud')

# Held-out rows followed by the fraud rows from the training split.
concat_df = pd.concat([test, fraud])

In [34]:
# Evaluation set = held-out test rows plus the fraud rows from the training split.
# 'concat_df' (built in the split cell above) already holds exactly that, so it
# is reused here instead of DataFrame.append, which was removed in pandas 2.0.
# Assigning from 'concat_df' also keeps this cell idempotent on re-run.
test = concat_df

# Ground-truth labels must be taken AFTER the fraud rows are added so that they
# line up one-to-one with the predictions made on 'test'.
actual_test = test['isFraud']


In [None]:
#=================================================-
#### Slide 31: Create and fit LOF model  ####

# Novelty-detection LOF: trained on non-fraud rows only, then used to score new data.
lof_params = dict(n_neighbors=5,
                  metric="manhattan",
                  contamination=0.1,
                  novelty=True)
lof = LocalOutlierFactor(**lof_params)

# Fit on the non-fraud training rows.
lof.fit(non_fraud)




In [None]:
#=================================================-
#### Slide 32: Test predictions  ####

# Score every column except the last one (the 'isFraud' label).
test_features = test.iloc[:, :-1]
raw_pred = lof.predict(test_features)

# The model returns 1 for inliers and -1 for outliers; recode to 0 = non-fraud, 1 = fraud.
fraud_pred = np.where(raw_pred == -1, 1, 0)




In [None]:
#================================================= -
#### Slide 33: Find TPR and TNR  ####

# Unpack the 2x2 confusion matrix (row-major: tn, fp, fn, tp).
cm = confusion_matrix(actual_test, fraud_pred)
tn, fp, fn, tp = cm.ravel()

# True negative rate: share of non-fraud rows predicted as non-fraud.
non_fraud_eval = tn / (tn + fp)
# True positive rate: share of fraud rows predicted as fraud.
fraud_eval = tp / (tp + fn)

print(non_fraud_eval)
print(fraud_eval)




In [None]:
#=================================================-
#### Slide 34: Exercise 1  ####






In [None]:
#================================================= -
#### Slide 41: Optimized LOF model  ####

# Same LOF configuration as before, but with 10 neighbors instead of 5.
lof = LocalOutlierFactor(
    n_neighbors=10,
    metric="manhattan",
    contamination=0.1,
    novelty=True,
)

# Fit on the non-fraud training rows.
lof.fit(non_fraud)




In [None]:
#=================================================-
#### Slide 42: Test predictions  ####

# Predict on all columns but the last ('isFraud'); recode -1 (outlier) -> 1, 1 (inlier) -> 0.
raw_pred = lof.predict(test.iloc[:, :-1])
fraud_pred = (raw_pred == -1).astype(int)

# Confusion matrix flattened row-major: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(actual_test, fraud_pred).ravel()

# TNR (non-fraud correctly identified) and TPR (fraud correctly identified).
non_fraud_eval = tn / (tn + fp)
fraud_eval = tp / (tp + fn)
print(non_fraud_eval)
print(fraud_eval)




In [None]:
#=================================================-
#### Slide 43: Load performance_df dataframe  ####

# Load the running table of model results.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
# The context manager closes the file handle (the bare open() call leaked it).
with open(str(data_dir) + "/performance_anomalies.sav", "rb") as handle:
    performance_df = pickle.load(handle)

# One row of results for the LOF model.
s = pd.Series(['LOF', fraud_eval, non_fraud_eval],
              index=['model_name', 'TPR', 'TNR'])

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
# infer_objects() restores numeric dtypes for TPR/TNR after the transpose.
lof_row = s.to_frame().T.infer_objects()
performance_df = pd.concat([performance_df, lof_row], ignore_index=True)
performance_df




In [None]:
#=================================================-
#### Slide 45: Exercise 2  ####






In [None]:
#=================================================-
#### Slide 47: Save results as a pickle  ####

# Objects to persist for the next module, keyed by file name inside data_dir.
artifacts = {
    'non_fraud.sav': non_fraud,
    'test.sav': test,
    'actual_test.sav': actual_test,
    'performance_anomalies.sav': performance_df,
}

# Write each object inside a context manager so the file is flushed and closed
# (the bare open() calls left four handles open).
for file_name, artifact in artifacts.items():
    with open(str(data_dir) + '/' + file_name, 'wb') as handle:
        pickle.dump(artifact, handle)


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################


In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ANOMALY DETECTION PART 4 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 2: Loading packages  ####

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest




In [None]:
#=================================================-
#### Slide 3: Directory settings  ####

# Set 'main_dir' to location of the project folder
# The data folder is expected one level above the working directory - the same
# place the previous module read its CSV from and wrote its pickles to.
from pathlib import Path
home_dir = Path(".").resolve()
main_dir = home_dir.parent
print(main_dir)
# Kept as a plain string because later cells build file paths by string concatenation.
data_dir = str(main_dir) + "/data"
print(data_dir)




In [None]:
#=================================================-
#### Slide 4: Load pickled data from previous module  ####

# Restore the objects saved by the previous module.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
# Each file is opened in a context manager so its handle is closed after reading;
# str(data_dir) lets this work whether data_dir is a string or a Path.
with open(str(data_dir) + "/non_fraud.sav", "rb") as handle:
    non_fraud = pickle.load(handle)

with open(str(data_dir) + "/test.sav", "rb") as handle:
    test = pickle.load(handle)

with open(str(data_dir) + "/actual_test.sav", "rb") as handle:
    actual_test = pickle.load(handle)

with open(str(data_dir) + "/performance_anomalies.sav", "rb") as handle:
    performance_df = pickle.load(handle)




In [None]:
#=================================================-
#### Slide 5: Data: load energy consumption  ####

# Read the hourly energy consumption CSV and preview the first rows.
energy_csv = str(data_dir) + "/PJME_hourly.csv"
pjm_energy = pd.read_csv(energy_csv)
pjm_energy.head()




In [None]:
#=================================================-
#### Slide 6: Data: preprocessing  ####

# Parse the timestamp column into datetimes and summarize the frame.
pjm_energy['Datetime'] = pd.to_datetime(pjm_energy['Datetime'])
pjm_energy.info()

# Keep only rows after the start of 2018.
# .copy() makes the result an independent frame, so the 'anomaly' column added
# by later cells does not trigger SettingWithCopyWarning.
pjm_energy = pjm_energy[pjm_energy['Datetime'] > '2018-01-01 00:00:00'].copy()
pjm_energy.shape




In [None]:
#=================================================-
#### Slide 9: Create and fit LOF model: energy consumption  ####

# Outlier-detection LOF (novelty=False): fit and label the same data in one step.
lof_energy_model = LocalOutlierFactor(
    n_neighbors=50,
    metric="manhattan",
    contamination=0.01,
    novelty=False,
)

# Single-column frame of the load values; fit_predict returns 1 (inlier) / -1 (outlier).
energy_load = pjm_energy[['PJME_MW']]
pjm_energy['anomaly'] = lof_energy_model.fit_predict(energy_load)




In [None]:
#=================================================-
#### Slide 11: LOF - visualize anomalies  ####

# First and third quartiles of the load are used as the low/high cut-offs.
energy_mw = pjm_energy['PJME_MW']
lower_threshold = energy_mw.quantile(0.25)
upper_threshold = energy_mw.quantile(0.75)

# Rows the LOF model labelled as outliers (-1).
lof_anomalies = pjm_energy.loc[pjm_energy['anomaly'] == -1]
lof_flagged_mw = lof_anomalies['PJME_MW']

# Upper range of anomalies
lof_upper_anomalies = lof_flagged_mw[lof_flagged_mw > upper_threshold]

# Lower range of anomalies
lof_lower_anomalies = lof_flagged_mw[lof_flagged_mw < lower_threshold]




In [None]:
#=================================================-
#### Slide 13: Exercise 3  ####






In [None]:
#=================================================-
#### Slide 24: Create and fit isolation forest model  ####

# Isolation forest is randomized; a fixed random_state makes the fitted model
# (and the TPR/TNR reported below) reproducible across runs. The seed matches
# the one used for the train/test split.
iforest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# model fitting (non-fraud rows only)
iforest.fit(non_fraud)




In [None]:
#=================================================-
#### Slide 25: Test predictions  ####

# Score every column except the last one (the 'isFraud' label).
test_features = test.iloc[:, :-1]
raw_pred = iforest.predict(test_features)

# The model returns 1 for inliers and -1 for outliers; recode to 0 = non-fraud, 1 = fraud.
fraud_pred = np.where(raw_pred == -1, 1, 0)




In [None]:
#================================================= -
#### Slide 26: Find TPR and TNR  ####

# Unpack the 2x2 confusion matrix (row-major: tn, fp, fn, tp).
cm = confusion_matrix(actual_test, fraud_pred)
tn, fp, fn, tp = cm.ravel()

# True negative rate: share of non-fraud rows predicted as non-fraud.
non_fraud_eval = tn / (tn + fp)
# True positive rate: share of fraud rows predicted as fraud.
fraud_eval = tp / (tp + fn)

print(non_fraud_eval)
print(fraud_eval)




In [None]:
#=================================================-
#### Slide 27: Load performance_df dataframe  ####

# One row of results for the isolation forest model.
s = pd.Series(['Isolation Forest', fraud_eval, non_fraud_eval],
              index=['model_name', 'TPR', 'TNR'])

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
# infer_objects() restores numeric dtypes for TPR/TNR after the transpose.
iforest_row = s.to_frame().T.infer_objects()
performance_df = pd.concat([performance_df, iforest_row], ignore_index=True)
performance_df




In [None]:
#=================================================-
#### Slide 30: Exercise 4  ####






In [None]:
#=================================================-
#### Slide 32: Isolation forest on time series data  ####

# Isolation forest is randomized; a fixed random_state keeps the flagged
# anomalies the same from run to run.
isolation_energy = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# model fitting on the single load column
energy_load = pjm_energy[['PJME_MW']]
isolation_energy.fit(energy_load)

# NOTE: this overwrites the 'anomaly' column written by the LOF cell; the LOF
# results remain available in the lof_* variables computed earlier.
pjm_energy['anomaly'] = isolation_energy.predict(energy_load)




In [None]:
#=================================================-
#### Slide 34: Isolation forest - visualize anomalies  ####

# First and third quartiles of the load are used as the low/high cut-offs.
energy_mw = pjm_energy['PJME_MW']
lower_threshold = energy_mw.quantile(0.25)
upper_threshold = energy_mw.quantile(0.75)

# Rows the isolation forest labelled as outliers (-1).
if_anomalies = pjm_energy.loc[pjm_energy['anomaly'] == -1]
if_flagged_mw = if_anomalies['PJME_MW']

# Split the flagged loads into those above the upper and below the lower cut-off.
if_upper_anomalies = if_flagged_mw[if_flagged_mw > upper_threshold]
if_lower_anomalies = if_flagged_mw[if_flagged_mw < lower_threshold]




In [None]:
#=================================================-
#### Slide 37: Exercise 5  ####




#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################
