In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import tree
from scipy.stats import iqr
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
from ipynb.fs.defs.BOS_Wrapper import BOS_Classifier
from ipynb.fs.defs.NLP_Classifier import NLP_Classifier
from ipynb.fs.defs.TSC_1NN import TSC_1NN, DTWDistance
from ipynb.fs.defs.TSC_Wrapper import TSC_Classifier

In [20]:
from heapq import nlargest

In [21]:
class EnsembleFramework:
    """Cascade of classifiers that progressively narrows each sample's candidate labels.

    Every layer is fitted on the same training data. At prediction time the
    layers are applied in order, and each one removes labels from the candidate
    list of every test sample according to ``criterion``.

    Parameters
    ----------
    criterion : str
        ``'topk'`` keeps, after layer ``i``, the ``params['k'][i]`` most probable
        candidates. ``'sof'`` keeps the candidates selected by :meth:`survival`
        and reduces the list to a single label after the last layer.
    layers : list of dict
        One dict per layer. ``'type'`` is ``'BOS'``, ``'NLP'`` or ``'TSC'``;
        ``'BOS'`` and ``'TSC'`` layers also need ``'name'``, a Python expression
        (string) that builds the wrapped estimator.
    params : dict
        Criterion parameters; ``'topk'`` reads the list ``params['k']``.
    """

    def __init__(self, criterion = 'topk', layers = [{'type' : 'NLP'}, {'type' : 'BOS', 'name' : 'DecisionTreeClassifier()'}], params = {'k' : [4, 1]}):
        from copy import deepcopy

        self.criterion = criterion
        # The defaults are mutable objects created once at definition time.
        # Storing copies keeps instances from sharing (and corrupting) them.
        self.layers = deepcopy(layers)
        self.params = deepcopy(params)

    def survival(self, probs, classes):
        """Select the candidates whose probability stands out from the others.

        Parameters
        ----------
        probs : indexable
            Scores looked up as ``probs[c]`` for every candidate ``c``.
        classes : iterable
            Candidate labels still in play for one sample.

        Returns
        -------
        list
            Candidates with ``probs[c] >= mean + std`` (both computed over the
            candidates only). If none qualifies, the single most probable
            candidate. Empty only when ``classes`` is empty.
        """
        candidates = list(classes)
        if not candidates:
            # Nothing to choose from; avoids NumPy's mean-of-empty warning.
            return []

        scores = [probs[c] for c in candidates]
        threshold = np.mean(scores) + np.std(scores)

        survived = [c for c, score in zip(candidates, scores) if score >= threshold]
        if not survived:
            # mean + std can exceed the maximum (e.g. scores [0, 1, 1]).
            survived = nlargest(1, candidates, key = lambda c : probs[c])

        return survived

    def run(self, X_train, y_train, X_test):
        """Fit every layer and return the surviving labels for each test sample.

        Parameters
        ----------
        X_train, y_train
            Training data handed unchanged to each layer's ``fit``.
        X_test
            Test data handed unchanged to each layer's ``predict_proba``.

        Returns
        -------
        list of list
            One list of candidate labels per test sample. With ``'sof'`` (and at
            least one layer) each list holds exactly one label.

        Raises
        ------
        ValueError
            If the criterion or a layer type is unknown, or if ``'topk'`` is
            given fewer ``k`` values than layers.
        """
        if self.criterion not in ('topk', 'sof'):
            raise ValueError("unknown criterion %r (expected 'topk' or 'sof')" % (self.criterion,))

        for layer in self.layers:
            if layer['type'] not in ('BOS', 'NLP', 'TSC'):
                raise ValueError("unknown layer type %r (expected 'BOS', 'NLP' or 'TSC')" % (layer['type'],))

        if self.criterion == 'topk':
            ks = list(self.params['k'])
            if len(ks) < len(self.layers):
                # zip() below would otherwise silently skip the extra layers.
                raise ValueError("params['k'] has %d values for %d layers" % (len(ks), len(self.layers)))

        # Sorted labels make tie-breaking in nlargest reproducible; set order is arbitrary.
        try:
            labels = sorted(set(y_train))
        except TypeError:
            # Labels that cannot be ordered keep the set's own order.
            labels = list(set(y_train))
        # One independent list per sample (``[x] * n`` would alias a single list).
        classes = [list(labels) for _ in range(len(X_test))]

        self.classifiers = []
        for classifier in self.layers:
            # SECURITY: eval() executes arbitrary code; 'name' must come from trusted input only.
            if (classifier['type'] == 'BOS'):
                clf = BOS_Classifier(eval(classifier['name']))
            elif (classifier['type'] == 'NLP'):
                clf = NLP_Classifier()
            else:
                clf = TSC_Classifier(eval(classifier['name']))
            clf.fit(X_train, y_train)
            self.classifiers.append(clf)

        if (self.criterion == 'topk'):
            for clf, k in zip(self.classifiers, ks):
                probs = clf.predict_proba(X_test)
                for i, series_probs in enumerate(probs):
                    classes[i] = nlargest(k, classes[i], key = lambda x : series_probs[x])

        else:
            last = len(self.classifiers) - 1
            for j, clf in enumerate(self.classifiers):
                probs = clf.predict_proba(X_test)
                for i, series_probs in enumerate(probs):
                    classes[i] = self.survival(series_probs, classes[i])
                    if (len(classes[i]) == 0):
                        # Diagnostic: only possible when there were no candidates at all.
                        print('Lunghezza 0')
                    if (j == last):
                        # The final layer must commit to one label.
                        classes[i] = nlargest(1, classes[i], key = lambda x : series_probs[x])

        return classes

    def accuracy(self, y_pred, y_test):
        """Return the accuracy (0-100) of the predictions produced by :meth:`run`.

        Parameters
        ----------
        y_pred : list of list
            Candidate labels per sample. When a sample has several candidates the
            first one is scored; ``nlargest`` orders them most probable first.
        y_test : sequence
            True labels.

        Raises
        ------
        ValueError
            If a sample has no candidate label.
        """
        top_labels = []
        for pred in y_pred:
            candidates = list(pred)
            if not candidates:
                raise ValueError('every prediction must contain at least one candidate label')
            top_labels.append(candidates[0])

        return accuracy_score(y_test, top_labels)*100

In [37]:
y = []
X = {'metadata': [], 'statistics' : [], 'timeseries' : []}

path = ""

# Build the three views of every time series (raw values, stream name, summary
# statistics). Each CSV row is split on ',': column 1 is the stream name,
# column 8 the integer class label, columns 9 onward the samples.

with open(path + 'ThingspeakEU.meta.csv', 'r', encoding='utf-8') as dati:
    for row in dati:
        if not row.strip():
            # A blank (e.g. trailing) line has no label column to parse.
            continue
        riga = row.strip().split(',')

        classe = int(riga[8])
        y.append(classe)

        # np.float was removed in NumPy 1.24; the builtin float gives the same float64 dtype.
        valori = np.array(riga[9:]).astype(float)
        X['timeseries'].append(valori)

        # metadata
        stream_name = riga[1]
        X['metadata'].append(stream_name)

        # statistics (none of these calls modifies valori, so the array is parsed only once)
        media = np.mean(valori)
        mediana = np.median(valori)
        maxim = np.max(valori)
        minim = np.min(valori)
        std_dev = np.std(valori)
        rms = np.sqrt(np.mean(np.square(valori)))
        quantile = np.quantile(valori, 0.4)
        i_q_r = iqr(valori)
        simmetria = skew(valori)
        curtosi = kurtosis(valori)
        rang = maxim - minim

        features = [rang, maxim, std_dev, rms, media, minim, quantile, mediana, curtosi, simmetria, i_q_r]
        X['statistics'].append(features)

# One row per series; each cell of 'statistics' / 'timeseries' holds a whole sequence.
X = pd.DataFrame(X)
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [38]:
# Two-layer 'sof' cascade: an NLP layer followed by a BOS layer wrapping
# RandomForestClassifier(100).
clf = EnsembleFramework(
    criterion = 'sof',
    layers = [
        {'type' : 'NLP'},
        {'type' : 'BOS', 'name' : 'RandomForestClassifier(100)'},
    ],
)
# Fits both layers on the training split and returns one list of surviving
# labels per test series.
y_pred = clf.run(X_train, y_train, X_test)

In [39]:
# Accuracy of the ensemble on the test split, as a percentage (accuracy_score * 100).
clf.accuracy(y_pred, y_test)

85.24332810047096

In [41]:
# f1_score takes (y_true, y_pred). `run` returns one single-element list per
# series, so unwrap each prediction into a flat label list first.
f1_score(y_test, [pred[0] for pred in y_pred], average = 'micro')

0.8524332810047096

In [30]:
# Build the wrapped TSC_1NN directly; eval() on a constant string only hid the
# call from linters and tracebacks.
nn = TSC_Classifier(TSC_1NN(15, metric = 'euclidean'))

In [18]:
# Fit the time-series classifier `nn` and show the class probabilities it assigns
# to the test series at position 85.
# NOTE(review): this cell needs `nn` from the preceding cell. The recorded
# NameError is consistent with it having been executed first (execution count
# 18 vs. 30); re-run the notebook top to bottom.
nn.fit(X_train, y_train)
nn.predict_proba(X_test)[85]

NameError: name 'nn' is not defined

In [7]:
# Stand-alone random forest (100 trees) trained on the summary statistics only.
kk = RandomForestClassifier(n_estimators = 100)
kk.fit(X_train['statistics'].tolist(), y_train)
# One row of class probabilities per test series.
probs = kk.predict_proba(X_test['statistics'].tolist())

In [29]:
# `survival` exists only as a method of EnsembleFramework; bind it explicitly so
# this cell also runs on a fresh kernel (it does not read any instance state).
survival = EnsembleFramework().survival

# Start every test series with its own copy of the full label list.
classes = [list(set(y_train)) for _ in range(len(X_test))]
# Hand-picked candidate subset for the first series (experiment).
classes[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20]
for i, series_probs in enumerate(probs):
    classes[i] = survival(series_probs, classes[i])

In [30]:
# Labels of the first test series that passed the survival filter.
classes[0]

[7, 20]

In [16]:
# Class probabilities predicted by the random forest `kk` for the first test series.
probs[0]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.24, 0.  , 0.  , 0.02,
       0.  , 0.  , 0.  , 0.  , 0.04, 0.  , 0.  , 0.  , 0.  , 0.68])