In [1]:
import sys; sys.path.append('../')

from src.data_loader import load_data

import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit



# Preliminary ML Analysis

In [2]:
clusters, _ = load_data('../data/huge_sample_input_classified.txt')

In [3]:
train_percentage = 0.8

data = clusters.replace('None', np.nan).dropna(axis=1)\
    .drop([
        'file_origin', 'file_destination', 'confidence',
        'texture_code_origin', 'texture_code_destination'
    ], axis=1)

X = data.drop('classification', axis=1)
X = (X - X.mean()) / X.std()
y = data['classification']

train_indexes, test_indexes = next(StratifiedShuffleSplit(n_splits=1, train_size=0.8).split(X, y))
X_train, y_train = X.iloc[train_indexes], y.iloc[train_indexes]
X_test, y_test = X.iloc[test_indexes], y.iloc[test_indexes]

## SVM

In [4]:
model = SVC().fit(X_train, y_train)
classes = y_test.unique()
pd.DataFrame(confusion_matrix(y_test, model.predict(X_test), labels=classes), index=classes, columns=classes)



Unnamed: 0,Vegetation,Limit_effect,Spread,Unknow,Candidate
Vegetation,1022,34,0,0,0
Limit_effect,44,85,0,0,0
Spread,0,1,0,0,0
Unknow,2,0,0,2,0
Candidate,0,2,0,0,0


## Naive Bayes

In [5]:
model = BernoulliNB().fit(X_train, y_train)
classes = y_test.unique()
pd.DataFrame(confusion_matrix(y_test, model.predict(X_test), labels=classes), index=classes, columns=classes)

Unnamed: 0,Vegetation,Limit_effect,Spread,Unknow,Candidate
Vegetation,809,208,0,3,36
Limit_effect,15,111,0,0,3
Spread,0,1,0,0,0
Unknow,0,1,0,2,1
Candidate,0,2,0,0,0


## MLP

In [6]:
model = MLPClassifier().fit(X_train, y_train)
classes = y_test.unique()
pd.DataFrame(confusion_matrix(y_test, model.predict(X_test), labels=classes), index=classes, columns=classes)



Unnamed: 0,Vegetation,Limit_effect,Spread,Unknow,Candidate
Vegetation,1015,41,0,0,0
Limit_effect,50,79,0,0,0
Spread,0,1,0,0,0
Unknow,2,1,0,1,0
Candidate,0,2,0,0,0
