In [1]:
from saxpy.sax import sax_via_window

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
import scipy.stats as stats

In [3]:
from heapq import nlargest
from operator import itemgetter
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split

import time
import math

In [4]:
# il metodo sax_via_window della libreria saxpy restituisce un dict con key = parola e value = posizione/i della parola
# questo metodo estrae dal dict le parole e le inserisce in un array n volte, con n la lunghezza del value
def extract_sax_words(dictionary, filter_words = None):
    
    words = []
    for key, value in dictionary.items():
        number = len(value)
        if (filter_words):
            if (key in dict(filter_words)):
                for i in range(0, number):
                    words.append(key)
        else:
            for i in range(0, number):
                words.append(key)
    
    return words

In [5]:
# trasforma le ts in sax representation
def sax(valori_ts, subsequence_length, word_length):
    
    # ts_sax contiene la rappresentazione di ogni time series
    ts_sax = []
    # words contiene tutte le parole
    words = []
    valori_ts = np.array(valori_ts)
    
    for ts in valori_ts:
        sax_words = sax_via_window(ts, subsequence_length, word_length, 4)
        ts_sax.append(extract_sax_words(sax_words))
        words.extend(list(sax_words.keys()))
        
    return ts_sax, words

In [6]:
# trasforma le features del dataset in parole e lo popola con il conto delle parole
def preprocess_data(data, words, ts_sax):
    
    for word in set(words):
        data[word] = 0
    data = data.drop(np.arange(9, 454), axis = 1)
    for ts, row in zip(ts_sax, data.iterrows()):
        counter = Counter(ts)
        for word in counter.keys():
            data.at[row[0], word] = counter[word]
    
    return data

In [7]:
# calcola l'anova f-score per ogni feature (parola)
def compute_anova(data, words):
    anova_values = []
    class_features = []

    for word in set(words):
        class_features = []
        for i in range(0, 11):
            class_features.append(data[word][data[8] == i])
        anova_values.append((word, stats.f_oneway(*class_features)[0]))
    
    return anova_values

In [8]:
# distanza di una misurazione dal centroide, word_set è l'insieme delle features da tenere in considerazione
def centroid_distance(centroids, data, word_set):
    distance = 0.0
    
    for centroid, word in zip(centroids, word_set):
        distance += (data[word] - centroid)**2
    
    return distance

In [9]:
# calcolo del term frequency–inverse document frequency
def tfidf(word, classe, data):
    return tf(word, classe, data)*idf(word, data)

In [10]:
# calcolo del term-frequency
def tf(word, classe, data):
    ct = sum(data[word][data['classe'] == classe])
    if (ct == 0):
        return 0
    else:
        return 1+math.log(ct)

In [11]:
# calcolo dell'inverse document frequency
def idf(word, data):
    c = len(set(data['classe']))
    wc = 0
    for i in range(0, c):
        if (any(data[word][data['classe'] == i] > 0)):
            wc += 1
    
    return wc

In [12]:
# misura di distance di una misurazione dal tf-idf, word_set è l'insieme delle features da tenere in considerazione
def squared_cosine_similarity(tfidfs, data, word_set):
    num = 0.0
    den1 = 0.0
    den2 = 0.0
    
    for tfidf, word in zip(tfidfs, word_set):
        num += (data[word]*tfidf)**2
        den1 += data[word]
        den2 += tfidf
    
    distance = num/(den1*den2)
    return distance

In [13]:
# cross-validation per il calcolo dei centroidi per ogni classe
def cv_centroids(data, anova, max_features, w, l):
    
    # trasformo i dati in array, dato che LeaveOneOut non accetta come argomento un DataFrame di Pandas
    X = np.array(data.drop(8, axis = 1))
    y = np.array(data[8])

    loo = LeaveOneOut()
    best_accuracy = 0.00
    
    # eseguo la cross-validation aumentando ad ogni ciclo il numero di feature da prendere in considerazione
    for k in range(1, max_features):
        
        matches = 0
        
        # cross-validation
        for train_index, test_index in loo.split(X):
            X_train_CV, X_test_CV = X[train_index], X[test_index]
            y_train_CV, y_test_CV = y[train_index], y[test_index]
            
            # ritrasfromo in DataFrame, per facilità di manipolazione
            X_train_CV = pd.DataFrame(X_train_CV, columns= [column for column in data.drop(8, axis = 1)])
            X_test_CV = pd.DataFrame(X_test_CV, columns= [column for column in data.drop(8, axis = 1)])
            X_train_CV['classe'] = y_train_CV
            
            # prendo le top k features per anova value
            topk = nlargest(k, anova, key=itemgetter(1))
            words = [word[0] for word in topk]

            class_centroids = []
            c_distance = float('Inf')
            classe = 0
            
            # calcolo i centroidi per ogni classe
            for i in range(0, 11):
                centroids = []
                for word in set(words):
                    ct = sum(X_train_CV[word][X_train_CV['classe'] == i])
                    stc = (X_train_CV['classe'] == i).value_counts()[1]
                    centroids.append(ct/stc)
                class_centroids.append(centroids)
                # distanza di X_test dai centroidi calcolati
                distance = centroid_distance(centroids, X_test_CV, words)[0]
                # se la distanza è minore della distanza più piccola incontrata in precedenza, a X_test viene assegnata la classe i
                if (distance < c_distance):
                    c_distance = distance
                    classe = i
            if (classe == y_test_CV):
                matches += 1
        
        # calcolo accuracy 
        accuracy = matches/(loo.get_n_splits(X))*100
        if (accuracy > best_accuracy):
            best_k = k
            best_accuracy = accuracy
            best_features = words
            best_centroids = class_centroids
        
        tipo = 'C'
    
    return best_k, best_accuracy, best_features, best_centroids, w, l, tipo

In [14]:
# cross-validation per il calcolo dei tf-idf per ogni classe
def cv_tfidf(data, anova, max_features, w, l):
    X = np.array(data.drop(8, axis = 1))
    y = np.array(data[8])

    loo = LeaveOneOut()
    best_accuracy = 0.00

    for k in range(1, max_features):
        
        matches = 0
        
        for train_index, test_index in loo.split(X):
            X_train_CV, X_test_CV = X[train_index], X[test_index]
            y_train_CV, y_test_CV = y[train_index], y[test_index]
            X_train_CV = pd.DataFrame(X_train_CV, columns= [column for column in data.drop(8, axis = 1)])
            X_test_CV = pd.DataFrame(X_test_CV, columns= [column for column in data.drop(8, axis = 1)])
            X_train_CV['classe'] = y_train_CV

            topk = nlargest(k, anova, key=itemgetter(1))
            words = [word[0] for word in topk]

            class_tfidf = []
            c_distance = float('Inf')
            classe = 0

            for i in range(0, 11):
                tfidfs = []
                for word in set(words):
                    tfidfs.append(tfidf(word, i, X_train_CV))
                class_tfidf.append(tfidfs)
                distance = squared_cosine_similarity(tfidfs, X_test_CV, words)[0]
                if (distance < c_distance):
                    c_distance = distance
                    classe = i
            if (classe == y_test_CV):
                matches += 1

        accuracy = matches/(loo.get_n_splits(X))*100
        if (accuracy > best_accuracy):
            best_k = k
            best_accuracy = accuracy
            best_features = words
            best_tfidf = class_tfidf
            
    tipo = 'T'
    
    return best_k, best_accuracy, best_features, best_tfidf, w, l, tipo

In [15]:
# estraiamo dai selettori restituitici dalle cross-validation i più performanti
def selection(co1, co2):
    co = []

    top2_c = nlargest(2, co1, key=itemgetter(1))
    top2_t = nlargest(2, co2, key=itemgetter(1))
    sum1 = 0
    sum2 = 0

    for co1, co2 in zip(top2_c, top2_t):
        sum1 += co1[1]
        sum2 += co2[1]

    aaco1 = sum1/2
    aaco2 = sum2/2
    if (aaco1 > 0.7*max(aaco1, aaco2)):
        co.extend(top2_c)
    
    if (aaco2 > 0.7*max(aaco1, aaco2)):
        co.extend(top2_t)
        
    return co

In [16]:
# estraiamo da un selettore le features, i centroidi/tf-idf, la word-length, la window-length, il tipo (centroide o tf-idf)
def extract_co(co):
    F = co[2]
    t = co[3]
    w = co[4]
    l = co[5]
    tipo = co[6]
    
    return F, t, w, l, tipo

In [17]:
# funzione che unisce i due step di rappresentazione sax e trasformazione del dataset
def BOP(ts, w, l):
    ts_sax, words = sax(ts, l, w)
    data = preprocess_data(ts, words, ts_sax)
    
    return data, words

In [18]:
# classifica un dataset
def classify(ts_bop, features, T, tipo):
    Hco = []
    
    for ts in np.array(ts_bop):
        ts = pd.DataFrame(ts.reshape(1,-1), columns= [column for column in ts_bop])
        c_distance = float('Inf')
        
        # calcolo delle distanze dai centroidi/tf-idf di ogni classe
        for i in range(0, 11):
            if (tipo == 'T'):
                distance = squared_cosine_similarity(T[i], ts, features)[0]
            else:
                distance = centroid_distance(T[i], ts, features)[0]
                
            if (distance < c_distance):
                c_distance = distance
                classe = i
                
        Hco.append(classe)
    return Hco

In [19]:
# applicazione del voto di maggioranza alla classificazione di più selettori
def majority_vote(H_co):

    predictions = list(zip(*H_co))
    majority_vote = []
    
    for i in range(0, len(H_co[0])):
        counter = Counter(predictions[i])
        majority_vote.append(counter.most_common(1)[0][0])
    
    return majority_vote

In [20]:
def BOPF_fit(X_train, y_train):
    # array per storare le combinazioni restituite dalle cross-validation
    c_combinations = []
    t_combinations = []

    # array di word-length e window-length da testare
    word_length = [4, 5, 6]
    window_length = np.arange(0.04*X_train.shape[0], 0.2*X_train.shape[0], 0.04*X_train.shape[0]).astype(int)

    for w in word_length:
        for l in window_length:
            print('Word Length: ', w)
            print('Window Length: ', l)
            print('Sax Approximation...')
            data, words = BOP(X_train, w, l)
            print('Computing ANOVA values...')
            anova_values = compute_anova(data, words)
            print('Cross-validating centroids...')
            c_combinations.append(cv_centroids(data, anova_values, 5, w, l))
            print('Cross-validating tfidf...')
            t_combinations.append(cv_tfidf(data, anova_values, 5, w, l))
    
    print('Best predictors selection...')
    co = selection(c_combinations, t_combinations)
    return co

In [21]:
def BOPF_predict(co, X_test):
    H_co = []
    # per ogni combinazione/selettore, classifico il dataset di test
    for c in co:
        F, T, w, l, tipo = extract_co(c)
        T_BOP, words = BOP(X_test, w, l)
        H_co.append(classify(T_BOP, F, T, tipo))
    
    y_pred = majority_vote(H_co)
    return y_pred

In [20]:
# leggo il dataset

data = pd.read_csv('Swissex.meta.csv', header = None, sep = ',')
data = data.drop(np.arange(0,8), axis=1)

# split del dataset
X_train, X_test, y_train, y_test = train_test_split(data.drop(8, axis = 1), data[8], test_size = 0.3, random_state = 100)
X_train[8] = y_train

# array per storare le combinazioni restituite dalle cross-validation
c_combinations = []
t_combinations = []

# array di word-length e window-length da testare
word_length = [4, 5, 6]
window_length = np.arange(0.025*len(data.shape[0]), 0.18*len(data.shape[0]), 0.04*len(data.shape[0])).astype(int)

for w in word_length:
    for l in window_length:
        print('Word Length: ', w)
        print('Window Length: ', l)
        print('Sax Approximation...')
        data, words = BOP(X_train, w, l)
        print('Computing ANOVA values...')
        anova_values = compute_anova(data, words)
        print('Cross-validating centroids...')
        c_combinations.append(cv_centroids(data, anova_values, 5, w, l))
        print('Cross-validating tfidf...')
        t_combinations.append(cv_tfidf(data, anova_values, 5, w, l))

Word Length:  4
Window Length:  11
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  4
Window Length:  28
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  4
Window Length:  46
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  4
Window Length:  64
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  5
Window Length:  11
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  5
Window Length:  28
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  5
Window Length:  46
Sax Approximation...
Computing ANOVA values...
Cross-validating centroids...
Cross-validating tfidf...
Word Length:  5
Window Length:  64

In [98]:
print('Best predictors selection...')
co = selection(c_combinations, t_combinations)
H_co = []
# per ogni combinazione/selettore, classifico il dataset di test
for c in co:
    F, T, w, l, tipo = extract_co(c)
    T_BOP, words = BOP(X_test, w, l)
    H_co.append(classify(T_BOP, F, T, tipo))

y_pred = majority_vote(H_co)

Best predictors selection...


In [143]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)*100

30.76923076923077

In [149]:
from sklearn.metrics import f1_score

f1_score(y_pred, y_test, average = 'macro')*100

25.068295163988946