-
Notifications
You must be signed in to change notification settings - Fork 818
/
oml4sql-anomaly-detection-em.sql
167 lines (148 loc) · 6.77 KB
/
oml4sql-anomaly-detection-em.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
-----------------------------------------------------------------------
-- Oracle Machine Learning for SQL (OML4SQL) 23c
--
-- Expectation Maximization - EM Algorithm for Anomaly Detection
--
-- Copyright (c) 2023 Oracle Corporation and/or its affilitiates.
--
-- The Universal Permissive License (UPL), Version 1.0
--
-- https://oss.oracle.com/licenses/upl
-----------------------------------------------------------------------
SET ECHO ON
SET FEEDBACK 1
SET NUMWIDTH 10
SET LINESIZE 80
SET TRIMSPOOL ON
SET TAB OFF
SET PAGESIZE 100
SET serveroutput ON
SET pages 10000
-----------------------------------------------------------------------
-- SAMPLE PROBLEM
-----------------------------------------------------------------------
-- Segment the demographic data into clusters and examine the anomalies.
-----------------------------------------------------------------------
-- EXAMPLE IN THIS SCRIPT
-----------------------------------------------------------------------
-- Create EM model with CREATE MODEL2
-- View model details
-- View and sort anomalous customers
-- View prediction details
-----------------------------------------------------------------------
-- In this script, we are using an EM classification model to expand on
-- the One-Class SVM model approach for anomaly detection. EM
-- can capture the underlying data distribution and thus flag records
-- that do not fit the learned data distribution well. An object is
-- identified as an outlier in an EM Anomaly Detection model if its anomaly
-- probability is greater than 0.5. A label of 1 denotes normal, while
-- a label of 0 denotes anomaly. The customer and demographics data is
-- used to predict anomalous customers using prob_anomalous.
-----------------------------------------------------------------------
-- SET UP AND ANALYZE THE DATA
-----------------------------------------------------------------------
------------------------------
-- CREATE VIEW DEMOGRAPHICS_V
--
CREATE OR REPLACE VIEW DEMOGRAPHICS_V AS
SELECT CUST_ID, YRS_RESIDENCE, EDUCATION, AFFINITY_CARD,
HOUSEHOLD_SIZE, OCCUPATION, BOOKKEEPING_APPLICATION,
BULK_PACK_DISKETTES, FLAT_PANEL_MONITOR, HOME_THEATER_PACKAGE,
OS_DOC_SET_KANJI, PRINTER_SUPPLIES, Y_BOX_GAMES
FROM SH.SUPPLEMENTARY_DEMOGRAPHICS;
-------------------------
-- CREATE VIEW JOINING CUSTOMERS AND DEMOGRAPHICS_V
--
CREATE OR REPLACE VIEW CUSTOMERS360_V AS
SELECT a.CUST_ID, a.CUST_GENDER, a.CUST_MARITAL_STATUS, a.CUST_YEAR_OF_BIRTH,
a.CUST_INCOME_LEVEL, a.CUST_CREDIT_LIMIT, b.EDUCATION, b.AFFINITY_CARD,
b.HOUSEHOLD_SIZE, b.OCCUPATION, b.YRS_RESIDENCE, b.Y_BOX_GAMES
FROM SH.CUSTOMERS a, DEMOGRAPHICS_V b
WHERE a.CUST_ID = b.CUST_ID;
-----------------------------------------------------------------------
-- BUILD THE MODEL
-----------------------------------------------------------------------
-------------------------
-- SET OUTLIER RATE IN SETTINGS TABLE - DEFAULT IS 0.05
--
BEGIN DBMS_DATA_MINING.DROP_MODEL('CUSTOMERS360MODEL_AD');
EXCEPTION WHEN OTHERS THEN NULL; END;
/
DECLARE
v_setlst DBMS_DATA_MINING.SETTING_LIST;
BEGIN
v_setlst('ALGO_NAME') := 'ALGO_EXPECTATION_MAXIMIZATION';
v_setlst('PREP_AUTO') := 'ON';
v_setlst('EMCS_OUTLIER_RATE') := '0.1';
DBMS_DATA_MINING.CREATE_MODEL2(
MODEL_NAME => 'CUSTOMERS360MODEL_AD',
MINING_FUNCTION => 'CLASSIFICATION',
DATA_QUERY => 'SELECT * FROM CUSTOMERS360_V',
CASE_ID_COLUMN_NAME => 'CUST_ID',
SET_LIST => v_setlst,
TARGET_COLUMN_NAME => NULL); -- NULL target indicates anomaly detection
END;
/
-----------------------------------------------------------------------
-- EXAMINE THE MODEL
-----------------------------------------------------------------------
-------------------------
-- DISPLAY MODEL DETAILS
--
SELECT *
FROM TABLE(dbms_data_mining.get_model_details_global('CUSTOMERS360MODEL_AD'))
ORDER BY global_detail_name;
---------------------------------------------
-- DISPLAY THE TOP 5 MOST ANOMALOUS CUSTOMERS
--
SELECT *
FROM (SELECT CUST_ID, round(prob_anomalous,2) prob_anomalous,
YRS_RESIDENCE, CUST_MARITAL_STATUS,
rank() over (ORDER BY prob_anomalous DESC) rnk
FROM (SELECT CUST_ID, HOUSEHOLD_SIZE, YRS_RESIDENCE, CUST_GENDER, CUST_MARITAL_STATUS,
prediction_probability(CUSTOMERS360MODEL, '0' USING *) prob_anomalous
FROM CUSTOMERS360_V))
WHERE rnk <= 5
ORDER BY prob_anomalous DESC;
---------------------------------------------------------------
-- CREATE VIEW OF CUSTOMERS IN DESCENDING ORDER OF ANOMALY PROBABILITY
--
CREATE OR REPLACE VIEW EM_ANOMALOUS_RESULTS AS
SELECT *
FROM (SELECT CUST_ID, anomalous, round(prob_anomalous,2) prob_anomalous,
YRS_RESIDENCE, HOUSEHOLD_SIZE, CUST_GENDER,
CUST_MARITAL_STATUS,
RANK() OVER (ORDER BY prob_anomalous DESC) rnk
FROM (SELECT CUST_ID, HOUSEHOLD_SIZE, YRS_RESIDENCE,
CUST_GENDER, CUST_MARITAL_STATUS,
prediction(CUSTOMERS360MODEL_AD using *) anomalous,
prediction_probability(CUSTOMERS360MODEL_AD, '0' USING *) prob_anomalous
FROM CUSTOMERS360_V))
ORDER BY prob_anomalous DESC;
SELECT *
FROM EM_ANOMALOUS_RESULTS
FETCH FIRST 10 ROWS ONLY;
--------------------------------------------------------------------
-- VIEW PREDICTION DETAILS OF TOP 3 ATTRIBUTES TO EXPLAIN PREDICTION
--
SELECT CUST_ID, PREDICTION,
RTRIM(TRIM(SUBSTR(OUTPRED."Attribute1",17,100)),'rank="1"/>') FIRST_ATTRIBUTE,
RTRIM(TRIM(SUBSTR(OUTPRED."Attribute2",17,100)),'rank="2"/>') SECOND_ATTRIBUTE,
RTRIM(TRIM(SUBSTR(OUTPRED."Attribute3",17,100)),'rank="3"/>') THIRD_ATTRIBUTE
FROM (SELECT CUST_ID,
PREDICTION(CUSTOMERS360MODEL USING *) PREDICTION,
PREDICTION_DETAILS(CUSTOMERS360MODEL, '0' USING *) PREDICTION_DETAILS
FROM CUSTOMERS360_V
WHERE PREDICTION_PROBABILITY(CUSTOMERS360MODEL, '0' USING *) > 0.50
AND OCCUPATION = 'TechSup'
ORDER BY CUST_ID) OUT,
XMLTABLE('/Details'
PASSING OUT.PREDICTION_DETAILS
COLUMNS
"Attribute1" XMLType PATH 'Attribute[1]',
"Attribute2" XMLType PATH 'Attribute[2]',
"Attribute3" XMLType PATH 'Attribute[3]') OUTPRED
FETCH FIRST 10 ROWS ONLY;
-----------------------------------------------------------------------
-- End of script
-----------------------------------------------------------------------