In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import tree
from scipy.stats import iqr
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn.model_selection import GridSearchCV

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier

In [12]:
from ipynb.fs.defs.BOS_Wrapper import BOS_Classifier
from ipynb.fs.defs.NLP_Classifier import NLP_Classifier

In [13]:
from heapq import nlargest

In [14]:
class EnsembleFramework:
    """Cascade of classifiers that progressively narrows the candidate classes.

    Every configured layer is fitted on the training data. With the
    ``'topk'`` criterion each fitted layer, in order, keeps only the ``k``
    most probable candidate classes for every test sample, where ``k`` is the
    entry of ``params['k']`` at the same position as the layer.
    """

    def __init__(self, criterion = 'topk', layers = None, params = None):
        """Store the ensemble configuration.

        Parameters
        ----------
        criterion : str
            Selection strategy; only ``'topk'`` triggers any filtering in
            :meth:`run`.
        layers : list of dict, optional
            One dict per layer. ``{'type': 'NLP'}`` builds an
            ``NLP_Classifier``; ``{'type': 'BOS', 'name': <expr>}`` builds a
            ``BOS_Classifier`` around the estimator produced by evaluating
            ``<expr>``. Defaults to an NLP layer followed by a BOS layer
            wrapping ``DecisionTreeClassifier()``.
        params : dict, optional
            Criterion parameters; ``params['k']`` lists the ``k`` used by each
            layer. Defaults to ``{'k': [4, 1]}``.
        """
        # Build the defaults per instance: mutable default arguments are
        # created once and would otherwise be shared by every instance.
        if layers is None:
            layers = [{'type' : 'NLP'}, {'type' : 'BOS', 'name' : 'DecisionTreeClassifier()'}]
        if params is None:
            params = {'k' : [4, 1]}

        self.criterion = criterion
        self.layers = layers
        self.params = params

    def run(self, X_train, y_train, X_test):
        """Fit every layer and return the surviving candidate classes.

        Parameters
        ----------
        X_train, y_train
            Training samples and labels, passed unchanged to each layer's
            ``fit``.
        X_test
            Test samples, passed unchanged to each layer's ``predict_proba``.

        Returns
        -------
        list of list
            One list of candidate class labels per test sample. When the
            criterion is not ``'topk'`` every sample keeps all the distinct
            labels found in ``y_train``.

        Raises
        ------
        ValueError
            If a layer has a ``type`` other than ``'BOS'`` or ``'NLP'``.
        """
        labels = list(set(y_train))
        # One independent list per sample, so callers can mutate a single
        # sample's candidates without affecting the others.
        classes = [list(labels) for _ in range(len(X_test))]

        self.classifiers = []
        for classifier in self.layers:
            if (classifier['type'] == 'BOS'):
                # SECURITY: eval() executes arbitrary code. The layer name
                # must come from trusted configuration only, never from
                # external or user-supplied input.
                clf = BOS_Classifier(eval(classifier['name']))
            elif (classifier['type'] == 'NLP'):
                clf = NLP_Classifier()
            else:
                # Without this branch an unknown type would either leave
                # `clf` unbound or silently re-fit the previous layer.
                raise ValueError(
                    "Unknown layer type %r: expected 'BOS' or 'NLP'" % (classifier['type'],)
                )
            clf.fit(X_train, y_train)
            self.classifiers.append(clf)

        if (self.criterion == 'topk'):
            # zip stops at the shorter sequence, so layers without a matching
            # k are fitted but never used for filtering.
            for clf, k in zip(self.classifiers, self.params['k']):
                probs = clf.predict_proba(X_test)
                for i, series_probs in enumerate(probs):
                    # NOTE(review): the class label itself is used as the index
                    # into the probability row, so this presumably expects
                    # integer labels aligned with the predict_proba columns --
                    # confirm against BOS_Classifier / NLP_Classifier.
                    classes[i] = nlargest(k, classes[i], key = lambda label : series_probs[label])

        return classes

    def accuracy(self, y_pred, y_test):
        """Return the accuracy, as a percentage, of single-candidate predictions.

        Parameters
        ----------
        y_pred : iterable of iterable
            Candidate classes per sample, as returned by :meth:`run`. Each
            entry must hold exactly one label.
        y_test
            True labels, in the same order as ``y_pred``.

        Returns
        -------
        float
            ``accuracy_score`` of the flattened predictions against
            ``y_test``, multiplied by 100.

        Raises
        ------
        TypeError
            If any entry of ``y_pred`` does not contain exactly one label.
        """
        classes = []
        for pred in y_pred:
            # Unpacking enforces exactly one candidate per sample.
            classes.append(*pred)

        return accuracy_score(classes, y_test)*100

In [15]:
y = []
X = {'metadata': [], 'statistics' : []}
X_Super = []

path = ""

# Compute the features of every time series.
# Each CSV row is split on commas: column 1 is read as the stream name,
# column 8 as the integer class label and columns 9 onward as the samples.

with open(path + 'ThingspeakEU.meta.csv', 'r', encoding='utf-8') as dati:
    for row in dati:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise fail when the class column is read.
        if not row.strip():
            continue

        riga = row.strip().split(',')

        classe = int(riga[8])
        y.append(classe)

        # metadata
        stream_name = riga[1]
        X['metadata'].append(stream_name)

        # statistics
        # The builtin float replaces the np.float alias, which was removed in
        # NumPy 1.24; the resulting dtype (float64) is unchanged.
        valori = np.array(riga[9:]).astype(float)
        media = np.mean(valori)
        mediana = np.median(valori)
        maxim = np.max(valori)
        minim = np.min(valori)
        std_dev = np.std(valori)
        rms = np.sqrt(np.mean(np.square(valori)))
        quantile = np.quantile(valori, 0.4)
        i_q_r = iqr(valori)
        simmetria = skew(valori)
        curtosi = kurtosis(valori)
        rang = maxim - minim

        # Build the statistics once; X_Super rows are the stream name
        # followed by the same statistics, in the same order.
        features = [rang, maxim, std_dev, rms, media, minim, quantile, mediana, curtosi, simmetria, i_q_r]
        X_Super.append([stream_name] + features)
        X['statistics'].append(features)

X = pd.DataFrame(X)
# 70/30 split with a fixed seed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_Super, y, test_size = 0.3, random_state = 100)

In [13]:
# Build the ensemble with its default configuration, fit every layer on the
# training split and collect the candidate classes left for each test sample.
clf = EnsembleFramework()
y_pred = clf.run(X_train, y_train, X_test)

In [14]:
# Accuracy, as a percentage, of the single remaining candidate per test sample
# against the true labels.
clf.accuracy(y_pred, y_test)

79.43485086342228