In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import tree
from scipy.stats import iqr
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from ipynb.fs.defs.BOS_Wrapper import BOS_Classifier
from ipynb.fs.defs.NLP_Classifier import NLP_Classifier
from ipynb.fs.defs.TSC_1NN import TSC_1NN, DTWDistance
from ipynb.fs.defs.TSC_Wrapper import TSC_Classifier

In [4]:
from heapq import nlargest

In [5]:
class EnsembleFramework:
    
    def __init__(self, criterion = 'topk', tuning = False, layers = [{'type' : 'NLP'}, {'type' : 'BOS', 'name' : 'DecisionTreeClassifier()'}], params = {'k' : [4, 1]}):
        self.criterion = criterion
        self.layers = layers
        self.params = params
        self.tuning = tuning
        
    def survival(self, probs, classes):
        survived = []
        survived_probs = []

        for c in classes:
            survived_probs.append(probs[c])

        media = np.mean(survived_probs)
        std_dev = np.std(survived_probs)

        for c in classes:
            if (probs[c] >= (media + std_dev)):
                survived.append(c)
        
        if (len(survived) == 0):
            survived = nlargest(1, classes, key = lambda x : probs[x])
            
        return survived
        
    def run(self, X_train, y_train, X_test):
        classes = [list(set(y_train))] * len(X_test)
        self.classifiers = []
        for classifier in self.layers:
            if (classifier['type'] == 'BOS'):
                clf = BOS_Classifier(eval(classifier['name']))
            elif (classifier['type'] == 'NLP'):
                clf = NLP_Classifier()
            elif (classifier['type'] == 'TSC'):
                clf = TSC_Classifier(eval(classifier['name']))
            clf.fit(X_train, y_train)
            self.classifiers.append(clf)
            
        if (self.criterion == 'topk'):
            for clf, k in zip(self.classifiers, self.params['k']):
                probs = clf.predict_proba(X_test)
                if (self.tuning):
                    X_test = self.ftuning(X_test, probs)
                for i, series_probs in enumerate(probs):
                    classes[i] = nlargest(k, classes[i], key = lambda x : series_probs[x])
                    
        elif (self.criterion == 'sof'):
            for j, clf in enumerate(self.classifiers):
                probs = clf.predict_proba(X_test)
                if (self.tuning):
                    X_test = self.ftuning(X_test, probs)
                for i, series_probs in enumerate(probs):
                    classes[i] = self.survival(series_probs, classes[i])
                    if (len(classes[i]) == 0):
                        print('Lunghezza 0')
                    if (j == (len(self.classifiers)-1)):
                        classes[i] = nlargest(1, classes[i], key = lambda x : series_probs[x])
                        
        elif (self.criterion == 'tuning'):
            for j, clf in enumerate(self.classifiers):
                probs = clf.predict_proba(X_test)
                X_test = self.ftuning(X_test, probs)
                if (j == (len(self.classifiers)-1)):
                    for i, series_probs in enumerate(probs):
                        classes[i] = nlargest(1, classes[i], key = lambda x : series_probs[x])
                
        return classes  
    
    def ftuning(self, X_test, probs):
        for prob_list, (_, row) in zip(probs, X_test.iterrows()):
            row['statistics'] = row['statistics'][:-len(prob_list)]
            row['statistics'] = np.append(row['statistics'], prob_list) 
            
        return X_test
    
    def accuracy(self, y_pred, y_test):
        classes = []
        for pred in y_pred:
            classes.append(*pred)
            
        return accuracy_score(classes, y_test)*100

In [25]:
y = []
X = {'metadata': [], 'statistics' : [], 'timeseries' : []}

path = ""

# calcoliamo le features di ogni timeseries

with open(path + 'ThingspeakEU.meta.csv', 'r', encoding='utf-8') as dati:
    for row in dati:
        riga = row.strip().split(',')
        
        classe = int(riga[8])
        y.append(classe)
        
        valori = np.array(riga[9:]).astype(np.float)
        X['timeseries'].append(valori)
        
        # metadati
        stream_name = riga[1]
        X['metadata'].append(stream_name)
        
        # statistiche
        valori = np.array(riga[9:]).astype(np.float)
        media = np.mean(valori)
        mediana = np.median(valori)
        maxim = np.max(valori)
        minim = np.min(valori)
        std_dev = np.std(valori)
        rms = np.sqrt(np.mean(np.square(valori)))
        quantile = np.quantile(valori, 0.4)
        i_q_r = iqr(valori)
        simmetria = skew(valori)
        curtosi = kurtosis(valori)
        rang = maxim - minim
        
        features = [rang, maxim, std_dev, rms, media, minim, quantile, mediana, curtosi, simmetria, i_q_r] 
        X['statistics'].append(features)
    
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

#for (i, r), c in zip(X_train.iterrows(), y_train):
    #class_prob = np.zeros(len(set(y)))
    #class_prob[c] = 1
    #r['statistics'] = np.concatenate((r['statistics'], class_prob))
    
#class_prob = np.full(len(set(y)), 0.5)

#for i, r in X_test.iterrows():
    #r['statistics'] = np.concatenate((r['statistics'], class_prob))

In [149]:
clf = EnsembleFramework(criterion = 'tuning', tuning = True, layers = [{'type' : 'NLP'},  {'type' : 'BOS', 'name' : 'KNeighborsClassifier(100)'}])
y_pred = clf.run(X_train, y_train, X_test)

In [150]:
clf.accuracy(y_pred, y_test)

64.67817896389325

In [151]:
f1_score(y_pred, y_test, average = 'micro')

0.6467817896389325

In [None]:
# test feature tuning

In [26]:
knn = RandomForestClassifier(100)
knn.fit(list(X_train['statistics']), y_train)
y_pred = knn.predict(list(X_test['statistics']))
accuracy_score(y_pred, y_test)

0.7551020408163265

In [27]:
list(X_test['statistics'])

[[1796.83,
  3643.33,
  519.8203889345241,
  2491.951641174042,
  2437.1314583333333,
  1846.5,
  2069.33,
  2160.5,
  -1.4131927030607871,
  0.305314100430673,
  991.165],
 [403.22,
  410.72,
  99.40388436742752,
  216.18578296754777,
  191.97697916666667,
  7.5,
  160.41,
  202.5,
  -0.7566829224597829,
  0.007820716133343232,
  166.875],
 [6.42999999999995,
  1005.0,
  2.182940804409399,
  1001.6265454156304,
  1001.6241666666668,
  998.57,
  1001.0,
  1002.0,
  -1.477524154447799,
  -0.02272882000039023,
  4.0],
 [1205.5900000000001,
  1186.43,
  323.62138614880394,
  400.39484890958994,
  235.76520833333333,
  -19.16,
  3.77,
  40.015,
  0.34535122484087655,
  1.201150313049384,
  478.74],
 [3.3299999999999272,
  1013.3,
  0.8933962003687524,
  1011.7416444470066,
  1011.74125,
  1009.97,
  1011.73,
  1011.87,
  -0.5599094567392955,
  -0.25177914910297017,
  1.2100000000000364],
 [13.62,
  23.18,
  4.031658393396155,
  14.844832995580202,
  14.286875,
  9.56,
  12.54,
  13.15,
  -

In [28]:
for (i, r), c in zip(X_train.iterrows(), y_train):
    class_prob = np.zeros(len(set(y)))
    class_prob[c] = 1
    r['statistics'] = np.concatenate((r['statistics'], class_prob))

class_prob = np.full(len(set(y)), 0.5)

for i, r in X_test.iterrows():
    r['statistics'] = np.concatenate((r['statistics'], class_prob))

In [29]:
list(X_test['statistics'])

[array([ 1.79683000e+03,  3.64333000e+03,  5.19820389e+02,  2.49195164e+03,
         2.43713146e+03,  1.84650000e+03,  2.06933000e+03,  2.16050000e+03,
        -1.41319270e+00,  3.05314100e-01,  9.91165000e+02,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01]),
 array([ 4.03220000e+02,  4.10720000e+02,  9.94038844e+01,  2.16185783e+02,
         1.91976979e+02,  7.50000000e+00,  1.60410000e+02,  2.02500000e+02,
        -7.56682922e-01,  7.82071613e-03,  1.66875000e+02,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5.00000000e-01,  5.00000000e-01,  5.00000000e-01,  5.00000000e-01,
         5

In [31]:
knn = RandomForestClassifier(100)
knn.fit(list(X_train['statistics']), y_train)
y_pred = knn.predict(list(X_test['statistics']))
accuracy_score(y_pred, y_test)

0.33437990580847726

In [32]:
nlp = NLP_Classifier()
nlp.fit(X_train, y_train)
probs = nlp.predict_proba(X_test)

for prob_list, (_, row) in zip(probs, X_test.iterrows()):
    row['statistics'] = row['statistics'][:-len(prob_list)]
    row['statistics'] = np.append(row['statistics'], prob_list) 

In [33]:
list(X_test['statistics'])

[array([ 1.79683000e+03,  3.64333000e+03,  5.19820389e+02,  2.49195164e+03,
         2.43713146e+03,  1.84650000e+03,  2.06933000e+03,  2.16050000e+03,
        -1.41319270e+00,  3.05314100e-01,  9.91165000e+02,  1.25000000e-01,
         1.25000000e-01,  1.25000000e-01,  1.25000000e-01,  1.25000000e-01,
         1.42857143e-01,  1.11111111e-01,  1.25000000e-01,  1.11111111e-01,
         1.11111111e-01,  1.25000000e-01,  1.25000000e-01,  1.11111111e-01,
         1.25000000e-01,  1.11111111e-01,  1.25000000e-01,  1.25000000e-01,
         1.11111111e-01,  1.25000000e-01,  1.25000000e-01,  2.00000000e-01]),
 array([ 4.03220000e+02,  4.10720000e+02,  9.94038844e+01,  2.16185783e+02,
         1.91976979e+02,  7.50000000e+00,  1.60410000e+02,  2.02500000e+02,
        -7.56682922e-01,  7.82071613e-03,  1.66875000e+02,  9.09090909e-02,
         9.09090909e-02,  9.09090909e-02,  1.00000000e-01,  8.33333333e-02,
         1.11111111e-01,  8.33333333e-02,  8.33333333e-02,  1.00000000e+00,
         1

In [34]:
knn = RandomForestClassifier(100)
knn.fit(list(X_train['statistics']), y_train)
y_pred = knn.predict(list(X_test['statistics']))
accuracy_score(y_pred, y_test)

0.5965463108320251

In [None]:
# test vari

In [103]:
nn = TSC_Classifier(eval('TSC_1NN(1, metric = \'euclidean\')'))

In [104]:
nn.fit(X_train, y_train)
nn.predict_proba(X_test)[85]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [7]:
kk = RandomForestClassifier(100)
kk.fit(list(X_train['statistics']), y_train)
probs = kk.predict_proba(list(X_test['statistics']))

In [29]:
classes = [list(set(y_train))] * len(X_test)
classes[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20]
for i, series_probs in enumerate(probs):
    classes[i] = survival(series_probs, classes[i])

In [30]:
classes[0]

[7, 20]

In [16]:
probs[0]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.24, 0.  , 0.  , 0.02,
       0.  , 0.  , 0.  , 0.  , 0.04, 0.  , 0.  , 0.  , 0.  , 0.68])