# Anomaly Detection Using Pycaret

# Install Pycaret Library

In [15]:
!pip install pycaret



# Loading Dataset

*   Mount Google Drive and load the dataset using the Pandas library.
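*   The dataset used for analysis is a wine quality dataset (`winequalityN.csv`, covering red and white wines). Note: the link originally given here, https://www.kaggle.com/datasets/guru001/movie-genre-prediction, points to an unrelated movie-genre dataset (and the original text called it a "Water Potability" dataset); it should be replaced with the actual source of the wine quality data.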

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the CSV stored
# there can be read with an ordinary filesystem path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import pandas as pd

# Location of the wine-quality CSV on the mounted Drive (a filesystem path,
# despite the variable name).
url = "/content/drive/MyDrive/DM-Assignment-Dataset/winequalityN.csv"

# Parse the file into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=url)

In [17]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


# Build Model Using Pycaret

In [18]:
# Import only the PyCaret anomaly-detection functions this notebook calls,
# instead of `import *`, so it is clear where each name comes from and the
# namespace is not flooded with unused names.
from pycaret.anomaly import (
    assign_model,
    create_model,
    evaluate_model,
    load_model,
    models,
    plot_model,
    save_model,
    setup,
)

# Initialise the experiment on the loaded DataFrame. session_id seeds the
# experiment (it shows up as random_state=123 on the fitted estimator), so
# repeated runs are reproducible.
s = setup(df, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(6497, 13)"
2,Transformed data shape,"(6497, 13)"
3,Ordinal features,1
4,Numeric features,12
5,Categorical features,1
6,Rows with missing values,0.5%
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [19]:
# List the anomaly-detection estimators available in this experiment; the ID
# column of the resulting table holds the string passed to create_model
# (e.g. 'cluster').
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pycaret.internal.patches.pyod.CBLOFForceToDouble
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [20]:
# Fit the 'cluster' detector (Clustering-Based Local Outlier) on the data
# prepared by setup(), then echo the estimator to show its hyperparameters.
cluster = create_model(model='cluster')
cluster

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

CBLOFForceToDouble(alpha=0.9, beta=5, check_estimator=False,
          clustering_estimator=None, contamination=0.05, n_clusters=8,
          n_jobs=None, random_state=123, use_weights=False)

In [21]:
# Attach the detector's per-row verdict to the data: the returned frame gains
# an `Anomaly` flag column and an `Anomaly_Score` column.
cluster_anomalies = assign_model(model=cluster)
cluster_anomalies

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Anomaly,Anomaly_Score
0,white,7.0,0.270,0.36,20.700001,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0,21.324889
1,white,6.3,0.300,0.34,1.600000,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0,19.428704
2,white,8.1,0.280,0.40,6.900000,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0,6.782942
3,white,7.2,0.230,0.32,8.500000,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1.913135
4,white,7.2,0.230,0.32,8.500000,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1.913135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.000000,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0,20.293282
6493,red,5.9,0.550,0.10,2.200000,0.062,39.0,51.0,0.99512,3.52,,11.2,6,0,21.476429
6494,red,6.3,0.510,0.13,2.300000,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,0,22.114488
6495,red,5.9,0.645,0.12,2.000000,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0,20.324368


# Analyse The Model

In [22]:
# Open PyCaret's interactive plot selector for the fitted detector.
# NOTE(review): the stored output is only an ipywidget repr, so the plots are
# presumably visible only when the cell is run in a live kernel.
evaluate_model(cluster)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [23]:
# Draw the t-SNE plot for the fitted detector to visualise the anomalies.
plot_model(model=cluster, plot='tsne')

# Saving The Model

In [13]:
# save pipeline
# Persist the fitted detector together with its transformation pipeline under
# the name 'cluster_pipeline'.
# NOTE(review): this cell's execution count (13) is lower than the setup
# cell's (18), and the stored output lists columns such as 'Row ID',
# 'Postal Code' and 'Sales' that are not in the wine data. The output appears
# to come from an earlier run on a different dataset; re-run to confirm.
save_model(cluster, 'cluster_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Row ID', 'Postal Code', 'Sales',
                                              'Quantity', 'Discount', 'Profit'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Order ID', 'Order Date',
                                              'Ship Date', 'Ship Mode',
                                              'Customer ID', 'Customer Name',
                                              'Segment', 'Country', 'City',
                                              'Sta...
                                                                     'Customer '
                                                                     'Name',
                                                                     'Segment',
                                                  

# Loading The Model

In [14]:
# load pipeline
# Read back the pipeline stored under 'cluster_pipeline' and display it.
# NOTE(review): execution count 14 is lower than the setup cell's (18), so
# this cell looks stale relative to the current dataset; re-run it after
# re-executing the save cell.
loaded_cluster_pipeline = load_model('cluster_pipeline')
loaded_cluster_pipeline

Transformation Pipeline and Model Successfully Loaded
