# Analysis of Original Features


## Import Necessary Libraries

In [None]:
import seaborn as sns
from pathlib import PosixPath
from CU_Dataset_Factory import CU_Dataset_Factory
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, f1_score, recall_score

from sklearn import tree
from sklearn.linear_model import SGDClassifier

from matplotlib import pyplot as plt
from sklearn.decomposition import SparsePCA, KernelPCA

## Data Preparation


In [None]:
# Configure the dataset factory: data lives under ./dataset, 'label' is the
# target, and only the listed features are enabled.
builder = CU_Dataset_Factory(
    out_dir=PosixPath('./dataset'),
    target_feature='label',
    batch_size=25,
    features_enable=['category', 'type', 'label'],
    encoding=True,
)

# NOTE(review): the boolean flag presumably selects the training (True) vs.
# test (False) split -- confirm against CU_Dataset_Factory.produce.
dataset_tr = builder.produce(True)
dataset_te = builder.produce(False)

## Split the Dataset

In [None]:
# Pull the 'label' column out of each split as a NumPy array of targets.
y_train, y_test = (
    split['label'].to_numpy() for split in (dataset_tr, dataset_te)
)

In [None]:
# The feature matrices are the splits with the target column removed.
X_train = dataset_tr.drop(columns='label')
X_test = dataset_te.drop(columns='label')

In [None]:
# Display the test feature matrix for a quick visual check.
X_test

In [None]:
# Display the test targets for a quick visual check.
y_test

# Models

We define a series of basic models to analyze the existing features in the dataset and to assess how discriminative they are.

## Decision Tree

In [None]:
# Decision tree with scikit-learn's default hyper-parameters.
dtc = tree.DecisionTreeClassifier()

In [None]:
# Train the decision tree on the training split; scikit-learn's fit returns
# the fitted estimator itself.
estimator = dtc.fit(X=X_train, y=y_train)

In [None]:
# Show the depth of the fitted tree.
estimator.get_depth()

### Evaluate the Model

In [None]:
# Predict labels for the test split with the fitted estimator.
y_pred = estimator.predict(X=X_test)

In [None]:
# Score the test-set predictions; the confusion matrix is kept in `cnf`.
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# F1 and recall are micro-averaged.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')

# NOTE(review): a one-vs-rest ROC AUC used to be computed here but is
# disabled; roc_auc_score with multi_class='ovr' presumably needs per-class
# scores rather than the hard labels in y_pred -- confirm before re-enabling.

# Report the scores.
print(f'test accuracy: {acc}')
print(f'test F1 score: {f1}')
print(f'test Recall: {recall}')

In [None]:
# Plot the confusion matrix as a heatmap. scikit-learn's signature is
# confusion_matrix(y_true, y_pred), so rows are the true labels and columns
# the predicted ones.
sns.heatmap(confusion_matrix(y_test, y_pred))

## SVM

In [None]:
# Linear classifier trained with stochastic gradient descent; with its default
# hinge loss, SGDClassifier fits a linear SVM.
svm = SGDClassifier()

In [None]:
# Train the SGD-based linear classifier on the training split; scikit-learn's
# fit returns the fitted estimator itself.
estimator = svm.fit(X=X_train, y=y_train)

### Evaluate the Model

In [None]:
# Predict labels for the test split with the fitted estimator.
y_pred = estimator.predict(X=X_test)

In [None]:
# Score the test-set predictions; the confusion matrix is kept in `cnf`.
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# F1 and recall are micro-averaged.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')

# NOTE(review): a one-vs-rest ROC AUC used to be computed here but is
# disabled; roc_auc_score with multi_class='ovr' presumably needs per-class
# scores rather than the hard labels in y_pred -- confirm before re-enabling.

# Report the scores.
print(f'test accuracy: {acc}')
print(f'test F1 score: {f1}')
print(f'test Recall: {recall}')

In [None]:
# Plot the confusion matrix as a heatmap. scikit-learn's signature is
# confusion_matrix(y_true, y_pred), so rows are the true labels and columns
# the predicted ones.
sns.heatmap(confusion_matrix(y_test, y_pred))

## Results

- The feature **'subcategory'** causes the models to reach a certain level of overfitting
- The feature **'category'** is decisive for the classification

## PCA

Reduction of the high dimensionality induced by one-hot encoding

In [None]:
# Sparse PCA projecting onto two components, using all available cores.
pca = SparsePCA(
    n_components=2,
    n_jobs=-1,
    verbose=False,
)

In [None]:
# Learn the projection on the training split only (PCA is unsupervised, so the
# labels are not needed), then apply that same projection to the test split.
# Refitting on the test data would place train and test in different component
# spaces and leak test information into the preprocessing.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
# Display the projected training data.
X_train

In [None]:
# Scatter plot of the training data on the two PCA components.
plt.scatter(x=X_train[:, 0], y=X_train[:, 1])

In [None]:
# Scatter plot of the test data on the two PCA components.
plt.scatter(x=X_test[:, 0], y=X_test[:, 1])

## PCA + Decision Tree

In [None]:
# Refit the decision tree on the current training features; scikit-learn's
# fit returns the fitted estimator itself.
estimator = dtc.fit(X=X_train, y=y_train)

In [None]:
# Predict with the decision tree fitted in the previous cell. This cell used to
# refit the SVM instead, which replaced the tree estimator and left y_pred
# holding predictions from an earlier section.
y_pred = estimator.predict(X_test)

In [None]:
# Score the test-set predictions; the confusion matrix is kept in `cnf`.
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# F1 and recall are micro-averaged.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')

# NOTE(review): a one-vs-rest ROC AUC used to be computed here but is
# disabled; roc_auc_score with multi_class='ovr' presumably needs per-class
# scores rather than the hard labels in y_pred -- confirm before re-enabling.

# Report the scores.
print(f'test accuracy: {acc}')
print(f'test F1 score: {f1}')
print(f'test Recall: {recall}')

In [None]:
# Plot the confusion matrix as a heatmap. scikit-learn's signature is
# confusion_matrix(y_true, y_pred), so rows are the true labels and columns
# the predicted ones.
sns.heatmap(confusion_matrix(y_test, y_pred))

In [None]:
# Report the score of whichever model `estimator` currently refers to on the
# test split (for scikit-learn classifiers, `score` is the mean accuracy).
print(f'PCA + Decision Tree score: {estimator.score(X_test, y_test)}')

## PCA + SVM

In [None]:
# Refit the SGD-based linear classifier on the current training features;
# scikit-learn's fit returns the fitted estimator itself.
estimator = svm.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test split with the fitted estimator.
y_pred = estimator.predict(X=X_test)

In [None]:
# Score the test-set predictions; the confusion matrix is kept in `cnf`.
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# F1 and recall are micro-averaged.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')

# NOTE(review): a one-vs-rest ROC AUC used to be computed here but is
# disabled; roc_auc_score with multi_class='ovr' presumably needs per-class
# scores rather than the hard labels in y_pred -- confirm before re-enabling.

# Report the scores.
print(f'test accuracy: {acc}')
print(f'test F1 score: {f1}')
print(f'test Recall: {recall}')

In [None]:
# Plot the confusion matrix as a heatmap. scikit-learn's signature is
# confusion_matrix(y_true, y_pred), so rows are the true labels and columns
# the predicted ones.
sns.heatmap(confusion_matrix(y_test, y_pred))

## Final Results

Acceptable accuracy values after PCA: it allows us to overcome the problem of sparsity of data (yielded by one-hot encoding) without a significant information loss.

A possible improvement could be the employment of neural networks to be able to learn non-linear relations between initial input and projected value.