In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive/.
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

!pip install tslearn
from tslearn.clustering import TimeSeriesKMeans
from tslearn.metrics import cdist_dtw, dtw
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation

from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from get_time_series import get_fingerprints, get_interactions, load_data, load_data_posts, load_data_users

In [None]:
year_data = 'Twitter16'
dest = 'tw16'

t = os.path.join('gdrive/My Drive/Master/Data', year_data, 'tree.tar.gz')
p = os.path.join('gdrive/My Drive/Master/Data', year_data, 'post.tar.gz')
u = os.path.join('gdrive/My Drive/Master/Data', year_data, 'users.tar.gz')
l = os.path.join('gdrive/My Drive/Master/Data', year_data, 'label.txt')

!mkdir "$dest"
!tar -xzvf "$t" -C "$dest"
!tar -xzvf "$p" -C "$dest"
!tar -xzvf "$u" -C "$dest"
!cp "$l" "$dest"

# Representation



In [None]:
class data_process:
    """Load the three dataset tables found under ``path``.

    Attributes
    ----------
    news  : result of ``load_data(path=path)``
    posts : result of ``load_data_posts(path=path)``
    users : result of ``load_data_users(path=path)``
    """
    def __init__(self, path = './'):
        # The loaders are imported by name (``from get_time_series import ...``),
        # so the module object ``get_time_series`` is never bound in this
        # notebook; qualifying the calls with it raises NameError on a fresh
        # kernel. Call the imported functions directly instead.
        self.news = load_data(path = path)
        self.posts = load_data_posts(path = path)
        self.users = load_data_users(path = path)

In [None]:
%%time
# `dest` repeats the extraction folder name used by the setup cell.
dest = 'tw16'
# Builds the news / posts / users tables via data_process.__init__.
data = data_process(dest)

CPU times: user 51.5 s, sys: 8.23 s, total: 59.7 s
Wall time: 1min


In [None]:
%%time
window = 10
sliding_window = True 
time = 720
labels, followers = get_fingerprints(data.news, data.users, data.posts, 'followers', window, sliding_window, time)
labels, following = get_fingerprints(data.news, data.users, data.posts, 'followings', window, sliding_window, time)
labels, new_users = get_fingerprints(data.news, data.users, data.posts, 'new_users', window, sliding_window, time)
print(labels.shape)
print(followers.shape)
print(following.shape)
print(new_users.shape)

(818,)
(818, 720)
CPU times: user 27min 15s, sys: 422 ms, total: 27min 15s
Wall time: 27min 17s


In [None]:
%%time
labels, retweets, posts = get_interactions(data.news, 10, True, 720)
interactions = retweets + posts
# grad_interactions_1 = np.gradient(interactions, axis=1)
# grad_interactions_2 = np.gradient(grad_interactions_1, axis=1)
print(labels.shape)
print(retweets.shape)
print(posts.shape)

(818,)
(818, 720)
(818, 720)
CPU times: user 27min 32s, sys: 408 ms, total: 27min 33s
Wall time: 27min 36s


In [None]:
def get_multivariate_series(time_series, labels, scale = 1):
    """Stack several univariate series collections into one multivariate array.

    Parameters
    ----------
    time_series : sequence of 2-D arrays
        One array per feature. All arrays must share the same shape because
        they are combined with ``np.stack`` (presumably
        ``(n_samples, n_timesteps)`` - confirm against the extraction cells).
    labels : any
        Unused; kept so existing calls keep working.
    scale : {0, 1, 2}, default 1
        0 - rescale each array with ``MinMaxScaler``.
        1 - rescale with ``MinMaxScaler``, smooth with a PAA round trip
            (140 segments) and keep the samples at indices 0, 10, ..., 710
            (72 points per series).
        2 - use the arrays unchanged.

    Returns
    -------
    numpy.ndarray
        The (optionally transformed) arrays stacked along a new last axis.

    Raises
    ------
    ValueError
        If ``scale`` is not 0, 1 or 2. Previously this surfaced as an
        unbound-local error at the ``np.stack`` call.
    """
    if scale not in (0, 1, 2):
        raise ValueError("scale must be 0, 1 or 2, got {!r}".format(scale))

    if scale == 2:
        time_series_scaled = list(time_series)
    else:
        if scale == 1:
            # Only built when needed, so the other modes do not depend on it.
            paa = PiecewiseAggregateApproximation(n_segments=140)
            digits = np.arange(72) * 10  # indices 0, 10, ..., 710
        time_series_scaled = []
        for ts in time_series:
            # NOTE(review): MinMaxScaler rescales column by column, i.e. per
            # time step across samples rather than per series - confirm that
            # this is the intended normalisation.
            ts = MinMaxScaler().fit_transform(ts)
            if scale == 1:
                # PAA works on (n_samples, n_timesteps, 1) arrays.
                ts = ts.reshape(ts.shape[0], ts.shape[1], 1)
                ts = paa.inverse_transform(paa.fit_transform(ts))
                ts = ts[:, digits, :]
                ts = ts.reshape(ts.shape[0], ts.shape[1])
            time_series_scaled.append(ts)

    return np.stack(time_series_scaled, axis=-1)

In [None]:
time_series = get_multivariate_series(time_series=[retweets, posts, interactions, followers, following, new_users],
                                      labels=labels, scale=1)
# A fixed random_state makes the stratified split reproducible across
# "Restart & Run All"; without it every run clusters a different training set.
x_train, x_test, y_train, y_test = train_test_split(time_series, labels, test_size=0.2,
                                                    stratify=labels, random_state=42)
print(time_series.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(818, 72, 5)
(654, 72, 5)
(654,)
(164, 72, 5)
(164,)


# Clustering

In [None]:
def confusion_matrix(y_train, y_clus, n_prototypes):
    """Count how many instances of each class fall into each cluster.

    Note: this definition replaces the ``confusion_matrix`` imported from
    ``sklearn.metrics`` at the top of the notebook.

    Parameters
    ----------
    y_train : array of int
        Class label per instance; used directly as a row index, so labels
        must be integers in ``0 .. n_classes - 1``.
    y_clus : numpy.ndarray of int
        Cluster id per instance; used directly as a column index.
    n_prototypes : int
        Number of clusters (columns).

    Returns
    -------
    numpy.ndarray of int, shape (n_classes, n_prototypes)
    """
    class_count = np.unique(y_train).shape[0]
    counts = np.zeros((class_count, n_prototypes), dtype=int)
    for position in range(y_clus.shape[0]):
        counts[y_train[position], y_clus[position]] += 1
    return counts

def gini_coefficient(conf):
    """Gini impurity of every cluster (column) of a class-by-cluster table.

    For a column with counts ``c_i`` and total ``T`` the value is
    ``1 - sum((c_i / T) ** 2)``, rounded to three decimals. Columns whose
    total is not positive (empty clusters) get ``NaN``.

    Parameters
    ----------
    conf : numpy.ndarray, shape (n_classes, n_prototypes)

    Returns
    -------
    numpy.ndarray of float, shape (n_prototypes,)
    """
    cluster_count = conf.shape[1]
    impurity = np.zeros(cluster_count, dtype=float)
    for cluster in range(cluster_count):
        members = conf[:, cluster]
        total = float(members.sum())
        if not total > 0:
            impurity[cluster] = np.nan
            continue
        for count in members:
            # share of the cluster that belongs to this class
            share = float(count) / total
            impurity[cluster] += share ** 2
        impurity[cluster] = round(1 - impurity[cluster], 3)
    return impurity

def k_clustering(x_train, fe, n_prototypes=20, random_state=None):
    """Cluster one univariate fingerprint with DTW k-means.

    Parameters
    ----------
    x_train : numpy.ndarray
        Multivariate series indexed as ``x_train[:, :, fe]``.
    fe : int
        Index (last axis) of the fingerprint to cluster.
    n_prototypes : int, default 20
        Number of clusters.
    random_state : int or None, default None
        Forwarded to ``TimeSeriesKMeans``; pass an int for repeatable
        clusterings. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (TimeSeriesKMeans, numpy.ndarray)
        The fitted model and the cluster id assigned to each training series.
    """
    print('start')
    km_dba = TimeSeriesKMeans(n_clusters=n_prototypes, metric="dtw", init="k-means++",
                              random_state=random_state)
    # univariate time series: the fe-th feature only
    y_clus = km_dba.fit_predict(x_train[:, :, fe])
    return km_dba, y_clus

In [None]:
%%time 
#Obtener clustering para cada huella temporal con k=20
km_dba_0, y_clus_0 = k_clustering(x_train,0)
km_dba_1, y_clus_1 = k_clustering(x_train,1) 
km_dba_2, y_clus_2 = k_clustering(x_train,2) 
km_dba_3, y_clus_3 = k_clustering(x_train,3) 
km_dba_4, y_clus_4 = k_clustering(x_train,4) 
# km_dba_5, y_clus_5 = k_clustering(x_train,5) 
# km_dba_6, y_clus_6 = k_clustering(x_train,6) 

start
start
start
start
start
CPU times: user 5min 42s, sys: 269 ms, total: 5min 42s
Wall time: 5min 42s


In [None]:
# Class-by-cluster confusion matrix for the clustering of each fingerprint.
conf_0, conf_1, conf_2, conf_3, conf_4 = (
    confusion_matrix(y_train, assignment, 20)
    for assignment in (y_clus_0, y_clus_1, y_clus_2, y_clus_3, y_clus_4)
)

In [None]:
def matprint(mat, fmt="g"):
    """Print a 2-D array as right-aligned columns (used for confusion matrices).

    Each column is as wide as its longest formatted entry; every cell is
    followed by two spaces and every row ends with a newline.

    Parameters
    ----------
    mat : numpy.ndarray, 2-D
    fmt : str, default "g"
        Format type applied to every entry.
    """
    widths = [max(len(format(value, fmt)) for value in column) for column in mat.T]
    for row in mat:
        cells = [format(value, str(width) + fmt) for width, value in zip(widths, row)]
        print("".join(cell + "  " for cell in cells))

In [None]:
# Gini impurity per cluster for the clustering of each fingerprint,
# followed by the average over the 20 clusters of each clustering.
gini_0, gini_1, gini_2, gini_3, gini_4 = map(
    gini_coefficient, (conf_0, conf_1, conf_2, conf_3, conf_4)
)
for gini in (gini_0, gini_1, gini_2, gini_3, gini_4):
    print(sum(gini)/20)

0.44610000000000005
0.41695000000000004
0.22965
0.20665000000000006
0.4760000000000001


In [None]:
def to_hmetis(partitions, ginis, path="tw16_k20_5st.hgr"):
    """Write a set of clusterings as a weighted hypergraph file for hMETIS.

    Every cluster with more than two members becomes one hyperedge. Its
    vertices are the 1-based positions of the member instances and its weight
    is ``int(100 * (1 - gini))`` (truncated to an integer). Clusters whose
    Gini value is NaN are skipped.

    File layout: a header line ``"<n_hyperedges> <n_instances> 1"`` followed
    by one line per hyperedge, ``"<weight> <v1> <v2> ... "``.

    Parameters
    ----------
    partitions : sequence of numpy.ndarray of int
        One cluster-id array per clustering, all over the same instances.
    ginis : sequence of array-like of float
        ``ginis[k][c]`` is the Gini value of cluster ``c`` in ``partitions[k]``.
    path : str, default "tw16_k20_5st.hgr"
        Output file; the default is the previously hard-coded name.

    Returns
    -------
    (int, int)
        Number of hyperedges written and number of instances.

    Raises
    ------
    ValueError
        If fewer Gini arrays than partitions are supplied.
    """
    if len(ginis) < len(partitions):
        raise ValueError("to_hmetis needs one gini array per partition")

    n_instances = partitions[0].shape[0]
    hyperedges = []
    for partition, gini in zip(partitions, ginis):
        # Visit the ids that actually occur. Counting the distinct ids and
        # looping over range(count) silently dropped the highest-numbered
        # clusters whenever a lower-numbered cluster was empty.
        for cluster_id in np.unique(partition):
            gin = gini[cluster_id]  # Gini value of the current cluster
            if np.isnan(gin):
                continue
            members = np.where(partition == cluster_id)[0]  # instance positions
            if len(members) <= 2:
                continue
            weight = int(100 * (1 - gin))
            vertices = ''.join(str(member + 1) + ' ' for member in members)
            hyperedges.append(str(weight) + ' ' + vertices + '\n')

    # The header needs the final hyperedge count, so the lines are collected
    # first and the file is written once, instead of being reopened afterwards
    # to insert the header.
    with open(path, 'w') as fh:
        fh.write(str(len(hyperedges)) + ' ' + str(n_instances) + ' 1\n')
        fh.writelines(hyperedges)
    return len(hyperedges), n_instances

In [None]:
# The true labels are added as one more partition with Gini 0 for every class
# (weight 100). Size the array from the labels instead of hard-coding 4 so it
# stays correct if the number of classes changes.
gini_ = np.zeros(np.unique(y_train).shape[0], dtype=float)
to_hmetis([y_clus_0,y_clus_1,y_clus_2,y_clus_3,y_clus_4,y_train], [gini_0,gini_1,gini_2,gini_3,gini_4,gini_])

(57, 654)

In [None]:
# Download the hMETIS 1.5 Linux archive and unpack it in two steps
# (gunzip leaves hmetis-1.5-linux.tar, which tar then extracts).
!wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/hmetis/hmetis-1.5-linux.tar.gz  
!gunzip hmetis-1.5-linux.tar.gz
!tar -xvf hmetis-1.5-linux.tar

In [None]:
# Partition the hypergraph with khmetis into 10, 20, ..., 60 parts.
# NOTE: `t` overwrites the tree-archive path defined in the setup cell.
for i in range(1,7):
    t=i*10
    # Positional arguments after the file name - presumably, following the
    # hMETIS manual: Nparts UBfactor Nruns CType OType Vcycle dbglvl (confirm).
    !./hmetis-1.5-linux/khmetis ./tw16_k20_5st.hgr "$t" 5 1000 3 1 3 0

ERROR: ld.so: object '/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4' from LD_PRELOAD cannot be preloaded (wrong ELF class: ELFCLASS64): ignored.
*******************************************************************************
 HMETIS 1.5.3  Copyright 1998, Regents of the University of Minnesota

HyperGraph Information -----------------------------------------------------
 Name: ./tw16_k20_5st.hgr, #Vtxs:   654, #Hedges:    57, #Parts: 10, UBfactor: 1.05
 Options: GFC, Cut-minimization, Always V-cycle

K-way Partitioning... ------------------------------------------------------

 --------------------------------------------------------------------------
  Summary for the 10-way partition:
                Hyperedge Cut:      1472		(minimize)
      Sum of External Degrees:     10102		(minimize)
                  Scaled Cost:  2.64e-02		(minimize)
                   Absorption:     52.27		(maximize)

      Partition Sizes & External Degrees:
	    68[ 787]     68[ 675]     62[1146]     68[1145]