# Praca domowa 7

## Zbiór danych

Poniższe zbiory zostały wyodrębnione ze zbioru [Wine UCI](https://archive.ics.uci.edu/ml/datasets/wine):


- train - bez próbek odstających, do trenowania modelu
- test - do oceny skuteczności modelu, dodana informacja o klasie 0=inliers, 1=outliers
- val - analogiczny do test, ale bez klasy

## Treść zadania

Celem zadania jest wykorzystanie algorytmu GMM do wykrywania próbek odstających.

Do oceny modelu wykorzystać metryki F1 score, Precision i Recall.


In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.decomposition import PCA
from math import floor, ceil

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def print_result(truth, pred):
  """Print precision, F1 and recall of `pred` measured against `truth`.

  Args:
    truth: ground-truth labels, passed unchanged to the scikit-learn scorers.
    pred: predicted labels, same length as `truth`.

  Returns:
    None; the three scores are written to standard output, one per line.
  """
  # Each score is computed right before it is printed, so the metrics are
  # evaluated in the order precision -> F1 -> recall.
  precision = precision_score(truth, pred)
  print(f"Precision score: {precision}")
  f1 = f1_score(truth, pred)
  print(f"F1 score:        {f1}")
  recall = recall_score(truth, pred)
  print(f"Recall score:    {recall}")

In [None]:
# Test set: feature columns plus a 'class' column, split into X_test / y_test.
test = pd.read_csv("https://raw.githubusercontent.com/mini-pw/2021L-WUM/main/Prace_domowe/Praca_domowa7/test.csv")
X_test = test.drop(columns=['class'])
y_test = test['class']

# Training set: one all-zero label per row is generated for it.
X_train = pd.read_csv("https://raw.githubusercontent.com/mini-pw/2021L-WUM/main/Prace_domowe/Praca_domowa7/train.csv")
y_train = [0] * len(X_train)

# Validation set: loaded as features only (no labels are derived from it).
X_val = pd.read_csv("https://raw.githubusercontent.com/mini-pw/2021L-WUM/main/Prace_domowe/Praca_domowa7/val.csv")

In [None]:
# Report the (rows, columns) shape of the test, train and validation frames,
# in that order.
for frame in (test, X_train, X_val):
  print(frame.shape)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the test features (the 'class' column is already dropped).
X_test.head()

In [None]:
# Covariance structures compared throughout the notebook.
covariances = ['full', 'tied', 'diag', 'spherical']

# One panel per covariance type on a 2x2 grid. `ax.flat` walks the grid
# row by row, so the panels appear in the same order as `covariances`.
fig, ax = plt.subplots(2, 2, figsize=(16, 16))
for panel, cov_type in zip(ax.flat, covariances):
  GMM = GaussianMixture(n_components=2, covariance_type=cov_type, tol=0.001)
  GMM.fit(X_train)
  # Per-sample scores of the training data under the fitted mixture.
  score_train = GMM.score_samples(X_train)
  sns.boxplot(x=score_train, ax=panel)
  panel.title.set_text(cov_type)
plt.suptitle('Examples of different covariances types in GMM')
plt.show()

In [None]:
def test_covariances(n_components, random_state=None):
  """Evaluate min/max-threshold outlier detection for every covariance type.

  For each entry of the notebook-level `covariances` list, a GaussianMixture
  is fitted on `X_train`. The inlier band is the range of training scores,
  widened outwards to whole numbers: [floor(min), ceil(max)]. A test sample
  whose score falls outside that band is predicted as an outlier (1),
  otherwise as an inlier (0). Predictions, ground truth and the metrics from
  `print_result` are printed for each covariance type.

  Args:
    n_components: number of mixture components passed to GaussianMixture.
    random_state: optional seed forwarded to GaussianMixture so that the
      comparison can be reproduced between runs. The default (None) keeps
      the original, unseeded behaviour.

  Returns:
    None; all results are printed.
  """
  for cov in covariances:
    print("covariance type: " + cov)
    model = GaussianMixture(
      n_components=n_components,
      covariance_type=cov,
      random_state=random_state,
    )
    model.fit(X_train)
    train_scores = model.score_samples(X_train)
    test_scores = model.score_samples(X_test)

    # Inlier band derived from the training scores only.
    lower, upper = floor(train_scores.min()), ceil(train_scores.max())

    # Vectorized thresholding; converted back to a list of plain ints so the
    # printed prediction looks the same as a hand-built list.
    outside_band = (test_scores < lower) | (test_scores > upper)
    y_test_pred = outside_band.astype(int).tolist()

    print('Prediction:     ', y_test_pred)
    print('Ground Truth:   ', list(y_test))
    print_result(y_test, y_test_pred)
    print()

In [None]:
# Compare all covariance types using 2 mixture components.
test_covariances(2)

In [None]:
# Compare all covariance types using 4 mixture components.
test_covariances(4)

In [None]:
# The two configurations examined in the rest of the notebook:
# GMM1 = 2 components with tied covariance, GMM2 = 4 components with diagonal covariance.
GMM1 = GaussianMixture(n_components=2, covariance_type='tied')
GMM2 = GaussianMixture(n_components=4, covariance_type='diag')

# Fit both on the training features (GMM1 first, then GMM2).
for model in (GMM1, GMM2):
  model.fit(X_train)

In [None]:
# Per-sample scores of the training data under each fitted model.
score1_train, score2_train = (m.score_samples(X_train) for m in (GMM1, GMM2))

# Side-by-side boxplots: GMM1 on the left, GMM2 on the right.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

sns.boxplot(x=score1_train, ax=ax1)
ax1.set_title('Score of train samples using GMM1')

sns.boxplot(x=score2_train, ax=ax2)
ax2.set_title('Score of train samples using GMM2')

plt.show()

Since the training set is the data the models were fitted on, we assume that all of its samples are inliers when setting the initial threshold.

In [None]:
# Inlier band per model: (lower, upper) = training score range widened
# outwards to whole numbers.
simple_threshold1 = (floor(min(score1_train)), ceil(max(score1_train)))
print(f"Threshold1 is equal to: {simple_threshold1}")

simple_threshold2 = (floor(min(score2_train)), ceil(max(score2_train)))
print(f"Threshold2 is equal to: {simple_threshold2}")

In [None]:
# Score the validation samples with both models.
score1_val = GMM1.score_samples(X_val)
score2_val = GMM2.score_samples(X_val)

# 1 = outlier (score outside the model's threshold band), 0 = inlier.
# `.tolist()` yields a plain list of Python ints.
y_val1 = np.where(
  (score1_val < simple_threshold1[0]) | (score1_val > simple_threshold1[1]), 1, 0
).tolist()
y_val2 = np.where(
  (score2_val < simple_threshold2[0]) | (score2_val > simple_threshold2[1]), 1, 0
).tolist()
print('Prediction1:  ', y_val1)
print('Prediction2:  ', y_val2)

# Distribution of validation scores, one panel per model.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

sns.boxplot(x=score1_val, ax=ax1)
ax1.set_title('Score of val samples using GMM1')

sns.boxplot(x=score2_val, ax=ax2)
ax2.set_title('Score of val samples using GMM2')

plt.show()

In [None]:
# Keep only the validation scores that fall outside each model's threshold band.
val_outliers1 = score1_val[
  (score1_val < simple_threshold1[0]) | (score1_val > simple_threshold1[1])
]
val_outliers2 = score2_val[
  (score2_val < simple_threshold2[0]) | (score2_val > simple_threshold2[1])
]

# Report how many validation samples each model flags.
print(val_outliers1.size, 'outliers, out of', len(X_val), 'elements in validation set on gmm1')
print(val_outliers2.size, 'outliers, out of', len(X_val), 'elements in validation set on gmm2')

In [None]:
# Score the test samples with both models.
score1_test = GMM1.score_samples(X_test)
score2_test = GMM2.score_samples(X_test)

# 1 = outlier (score outside the model's threshold band), 0 = inlier.
# `.tolist()` yields a plain list of Python ints.
y_test_pred1 = np.where(
  (score1_test < simple_threshold1[0]) | (score1_test > simple_threshold1[1]), 1, 0
).tolist()
y_test_pred2 = np.where(
  (score2_test < simple_threshold2[0]) | (score2_test > simple_threshold2[1]), 1, 0
).tolist()

# Distribution of test scores, one panel per model.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

sns.boxplot(x=score1_test, ax=ax1)
ax1.set_title('Score of test samples using GMM1')

sns.boxplot(x=score2_test, ax=ax2)
ax2.set_title('Score of test samples using GMM2')

plt.show()

In [None]:
# PCA is re-imported in this cell (it is also imported in the first code cell).
from sklearn.decomposition import PCA

# Two-component PCA fitted on the training features.
pca = PCA(n_components=2)
pca = pca.fit(X_train)  # fit() hands back the fitted estimator

In [None]:
# Legend text for predicted classes and for ground-truth classes.
labels_dict = {
  0: 'inlier',
  1: 'outlier',
}
labels_dict_true = {
  0: 'true inlier',
  1: 'true outlier',
}

# Predicted labels on the test set, per model.
labels_test1 = pd.Series(y_test_pred1).map(labels_dict)
labels_test2 = pd.Series(y_test_pred2).map(labels_dict)

# Predicted labels on the validation set, per model.
labels_val1 = pd.Series(y_val1).map(labels_dict)
labels_val2 = pd.Series(y_val2).map(labels_dict)

# Ground-truth labels of the test set.
labels_test_true = pd.Series(y_test).map(labels_dict_true)

In [None]:
# Project the test features onto the two principal components.
test_pca = pca.transform(X_test)
pc1, pc2 = test_pca[:, 0], test_pca[:, 1]

# Same projection three times, coloured by GMM1 predictions, GMM2
# predictions and the ground truth respectively.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(21, 10))
panels = (
  (ax1, labels_test1, 'GMM1 on test set'),
  (ax2, labels_test2, 'GMM2 on test set'),
  (ax3, labels_test_true, 'Ground Truth test set'),
)
for axis, hue_labels, title in panels:
  sns.scatterplot(x=pc1, y=pc2, hue=hue_labels, ax=axis)
  axis.set_title(title)
plt.show()

In [None]:
# Project the validation features onto the two principal components.
val_pca = pca.transform(X_val)
pc1, pc2 = val_pca[:, 0], val_pca[:, 1]

# Same projection twice, coloured by each model's predictions.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))
panels = (
  (ax1, labels_val1, 'GMM1 on validation set'),
  (ax2, labels_val2, 'GMM2 on validation set'),
)
for axis, hue_labels, title in panels:
  sns.scatterplot(x=pc1, y=pc2, hue=hue_labels, ax=axis)
  axis.set_title(title)
plt.show()

## Podsumowanie

Modele GMM mogą być używane do wykrywania outlierów. Na przykładowym zbiorze danych otrzymaliśmy doskonałe wyniki wykrywania zarówno outlierów, jak i inlierów. Jednak dla pewności należałoby sprawdzić wyniki dla większych zbiorów danych. Wyniki otrzymane w tym przykładzie wydają się być zbyt doskonałe, by były prawdziwe. Dodatkowo istotną kwestią jest dobór progu (threshold), po przekroczeniu którego próbkę uznajemy za outlier. Utrudnia to zadanie wykrywania outlierów tą metodą.