In [1]:
import csv
import glob,os
from pathlib import Path

import hdbscan
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler,StandardScaler
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import layers, losses
from tensorflow.keras.layers import Input,LSTM,RepeatVector,Dense,Lambda,TimeDistributed,Masking
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.utils import normalize

In [2]:
# with open("out.csv",'r') as dest_f:
#     data_iter = csv.reader(dest_f,delimiter = ",")
#     data = [np.array(data,dtype=int) for data in data_iter]
#     # data = [x.reshape(x.shape[0],1,1) for x in data]
# data_array = np.asarray(data, dtype = object)
# data_array
# padded_array2 = tf.keras.preprocessing.sequence.pad_sequences(data_array2,padding="post")

In [3]:
# Load every per-author time series found under time_series/.
# The file list is sorted so that row i of the arrays built below refers to
# the same author on every run, independent of the OS directory order.
all_files = sorted(glob.glob(os.path.join("time_series/", "*.csv")))

data_array = []  # one 1-D integer array per file
author_id = []   # file stem of each series, kept in the same order as data_array
for f in all_files:
    # Flatten through the ndarray instead of DataFrame.squeeze(): squeeze()
    # collapses a single-value file to a bare scalar, which has no .to_numpy().
    # For single-row and single-column files the flattened values are the same
    # as what transpose().squeeze() produced.
    f_array = pd.read_csv(f, dtype=int, header=None).to_numpy().ravel()
    data_array.append(f_array)
    author_id.append(Path(f).stem)

# Object dtype lets series of different lengths live in one array.
data_array = np.array(data_array, dtype=object)

In [4]:
# Inspect the loaded series: an object array holding one integer array per CSV file.
data_array

array([array([  -5729,   10043,     -30,    2700,    -235,   13248,    -898,
                 2922,     -34,   14660,  -63088,   29976,    -293,   30010,
                  -89,   30083,    -215,   63106,    -223,   10846,    -501,
                70183,   -3053,   47669, -101166,  133232,  -73486,  156221,
               -18582,  263618,  -85544,  261401,     -72,  295232, -137120,
               228721,  -30292,  465998,   -2878,  499945,   -3457,  478284,
              -167677,  680937,    -942,  690091,    -653,  688904,   -1081,
               648146,  -77597,  773549,     -13,  773550,    -111,  764613,
                 -128,  763320,    -749,  698794,   -9976,  776413,     -64,
               763443,  -54528,  786600,  -27242,  822113,  -86134,  939816,
                -1142,  872128,    -282,  855002,  -80088, 1019700,   -6988,
              1028810,    -129,  997937,   -1729,  993226,    -596, 1026528,
                 -162, 1026062,    -405, 1003858,  -83755, 1118102,    -250,

In [5]:
# Pad at the end ("post") so the collection of series becomes one
# rectangular integer matrix with a row per series.
padded_array = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=data_array,
    padding="post",
)
padded_array

array([[ -5729,  10043,    -30, ...,      0,      0,      0],
       [-66825,  40625,   -799, ...,      0,      0,      0],
       [-61303,  52578,   -472, ...,      0,      0,      0],
       ...,
       [-37978,  29976,  -1810, ...,      0,      0,      0],
       [ -6164,   2544,   -129, ...,      0,      0,      0],
       [ -6854,  21276, -53478, ...,      0,      0,      0]])

In [6]:
# LSTM layers consume 3-D input: [samples, timesteps, features].
# Every timestep carries a single value, hence the trailing axis of size 1.
n_samples, n_timesteps = padded_array.shape[:2]
padded_array = padded_array.reshape(n_samples, n_timesteps, 1)
padded_array.shape

(2577, 2754, 1)

In [7]:
# Show the padded array after it has been reshaped to three dimensions.
padded_array

array([[[ -5729],
        [ 10043],
        [   -30],
        ...,
        [     0],
        [     0],
        [     0]],

       [[-66825],
        [ 40625],
        [  -799],
        ...,
        [     0],
        [     0],
        [     0]],

       [[-61303],
        [ 52578],
        [  -472],
        ...,
        [     0],
        [     0],
        [     0]],

       ...,

       [[-37978],
        [ 29976],
        [ -1810],
        ...,
        [     0],
        [     0],
        [     0]],

       [[ -6164],
        [  2544],
        [  -129],
        ...,
        [     0],
        [     0],
        [     0]],

       [[ -6854],
        [ 21276],
        [-53478],
        ...,
        [     0],
        [     0],
        [     0]]])

In [8]:
# Min-max scale all values with one shared scaler. The scaler is fed 2-D
# input, so collapse to (n_values, features), scale, then restore the shape.
# NOTE(review): the zero padding is part of the fitted data - confirm that is intended.
scaler = MinMaxScaler()
original_shape = padded_array.shape
flattened = padded_array.reshape(-1, original_shape[-1])
padded_array = scaler.fit_transform(flattened).reshape(original_shape)

In [9]:
# The padding zeros were scaled together with the real data, so the value the
# Masking layer has to look for is the scaled image of 0, not 0 itself.
zero_scaled = scaler.transform([[0]])
masked_value = zero_scaled[0, 0]
masked_value

0.03405458526639616

In [10]:
# padded_array = np.where(padded_array==scaler.transform([[0]]), 0, padded_array)

In [11]:
# padded_array

In [12]:
# Seed TensorFlow's global RNG so weight initialisation is repeatable
# across "Restart & Run All".
tf.random.set_seed(42)

timesteps = padded_array.shape[1]  # padded sequence length
features = padded_array.shape[2]   # values per timestep
inter_dim = 64    # width of the outer LSTM layers
latent_dim = 16   # size of the fixed-length embedding

# Sequence-to-sequence LSTM autoencoder:
#   encoder  Masking -> LSTM(inter_dim) -> LSTM(latent_dim)  (one vector per series)
#   decoder  RepeatVector -> LSTM(latent_dim) -> LSTM(inter_dim) -> per-step Dense
# The layer order is relied on elsewhere: model.layers[2] is the latent LSTM.
# NOTE(review): the mask likely stops at the encoder's last LSTM
# (return_sequences=False), so the decoder and the loss probably still include
# the padded timesteps - confirm and consider per-timestep sample weights.
# NOTE(review): the targets are min-max scaled to [0, 1] while the output
# activation is tanh - confirm this pairing is intended.
model = Sequential([
    Masking(mask_value=masked_value, input_shape=(timesteps, features)),
    LSTM(inter_dim, activation='tanh', return_sequences=True),
    LSTM(latent_dim, activation='tanh', return_sequences=False),
    RepeatVector(timesteps),
    LSTM(latent_dim, activation='tanh', return_sequences=True),
    LSTM(inter_dim, activation='tanh', return_sequences=True),
    # Reconstruct as many values per timestep as the input carries.
    TimeDistributed(Dense(features, activation="tanh")),
])

# No 'accuracy' metric: it is a classification metric and carries no meaning
# for continuous reconstruction targets; the MSE loss is the quantity to watch.
model.compile(optimizer='adam', loss='mse')

In [13]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 2754, 1)           0         
                                                                 
 lstm (LSTM)                 (None, 2754, 64)          16896     
                                                                 
 lstm_1 (LSTM)               (None, 16)                5184      
                                                                 
 repeat_vector (RepeatVector  (None, 2754, 16)         0         
 )                                                               
                                                                 
 lstm_2 (LSTM)               (None, 2754, 16)          2112      
                                                                 
 lstm_3 (LSTM)               (None, 2754, 64)          20736     
                                                        

In [14]:
# Autoencoder training: the network's target is its own input.
model.fit(
    x=padded_array,
    y=padded_array,
    epochs=50,
    batch_size=256,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x156556d6490>

In [15]:
# Persist the trained autoencoder to disk.
model.save(filepath="trained_model.h5")

In [16]:
# Take the first sample and add a leading batch axis of size 1, giving
# a [samples, timesteps, features] batch with a single sample.
first_sample = padded_array[0]
test = first_sample.reshape((1,) + first_sample.shape)
test.shape

(1, 2754, 1)

In [17]:
# Show the scaled single-sample batch.
test

array([[[0.03395077],
        [0.03423658],
        [0.03405404],
        ...,
        [0.03405459],
        [0.03405459],
        [0.03405459]]])

In [18]:
# Reconstruct the single test sample with the full autoencoder and show
# the shape followed by the values.
yhat = model.predict(test)
print(yhat.shape, yhat, sep="\n")

(1, 2754, 1)
[[[0.01562083]
  [0.02286443]
  [0.02804401]
  ...
  [0.03507142]
  [0.03507142]
  [0.03507142]]]


In [19]:
# Encoder-only view of the trained network: reuse the autoencoder's input and
# stop at layers[2], the second LSTM (return_sequences=False), whose output is
# the fixed-size latent vector.
encoder_output = model.layers[2].output
latent_model = Model(inputs=model.inputs, outputs=encoder_output)
latent_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_input (InputLayer)  [(None, 2754, 1)]         0         
                                                                 
 masking (Masking)           (None, 2754, 1)           0         
                                                                 
 lstm (LSTM)                 (None, 2754, 64)          16896     
                                                                 
 lstm_1 (LSTM)               (None, 16)                5184      
                                                                 
Total params: 22,080
Trainable params: 22,080
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Latent vector of the single test sample.
# NOTE: `yhat` is re-bound here; it no longer holds the reconstruction.
yhat = latent_model.predict(test)
for shown in (yhat.shape, yhat):
    print(shown)

(1, 16)
[[ 2.9350293e-03 -8.5568298e-03 -7.4010214e-04 -9.6708806e-03
   5.1512681e-03  8.4525906e-03 -4.1969627e-04 -1.0130926e-02
   1.4658028e-03  5.5410525e-05  5.1214015e-03  9.7210715e-03
  -4.4111586e-03 -3.2358358e-03  3.7592356e-03 -1.5061032e-02]]


In [21]:
# Encode every series: row i is the latent vector of the i-th input series.
# `yhat` is re-bound once more; this is the value passed to the clusterer.
yhat = latent_model.predict(x=padded_array)
print(yhat.shape, yhat, sep="\n")

(2577, 16)
[[ 0.00293503 -0.00855683 -0.0007401  ... -0.00323584  0.00375924
  -0.01506103]
 [ 0.00286568 -0.00853643 -0.00074083 ... -0.00325116  0.00379929
  -0.01500872]
 [ 0.0029714  -0.00857151 -0.00080289 ... -0.00317949  0.00373571
  -0.01519917]
 ...
 [ 0.0030034  -0.00857481 -0.00077084 ... -0.0032154   0.00372324
  -0.01513969]
 [ 0.00291408 -0.00855841 -0.00080769 ... -0.00316496  0.00376167
  -0.01520649]
 [ 0.00298484 -0.00856682 -0.00074593 ... -0.00324813  0.00373867
  -0.01506498]]


In [22]:
# Spot-check the latent vector of a single row (index 98).
yhat[98]

array([ 3.0430050e-03, -8.5793594e-03, -7.5265515e-04, -9.6721388e-03,
        5.0866036e-03,  8.5179945e-03, -4.1302847e-04, -1.0149174e-02,
        1.4771015e-03,  1.2915596e-05,  5.1298062e-03,  9.7880745e-03,
       -4.4426839e-03, -3.2387360e-03,  3.7060587e-03, -1.5119390e-02],
      dtype=float32)

In [23]:
# Attach author metadata to the ids collected while loading the series.
# Row order (and therefore the positional index) follows `author_id`, which
# is what lets row positions be used to look authors up later.
author_df = pd.DataFrame(author_id, columns=["author_id"])

# Read ids as strings so they compare equal to the ids in author_df above.
retweet_df = pd.read_csv("authors.csv", index_col=0, dtype={'author_id': str})

# Join explicitly on author_id, keeping at most one metadata row per author:
# a repeated id in authors.csv would otherwise add rows to a left merge and
# shift every following row position.
author_df = author_df.merge(
    retweet_df.drop_duplicates(subset="author_id"),
    on="author_id",
    how="left",
)
assert len(author_df) == len(author_id), "merge changed the number of author rows"
author_df

Unnamed: 0,author_id,author_name,author_username
0,1000130222338002944,Graça Porte,GraaPortela1
1,10002992,Zissazin,taissazin
2,1000378112612470785,Robferreira🇧🇷,RobsonF91678435
3,1000457428075466754,Linda,elinalv12774504
4,1000809759187095558,Bruno Lima 97,BrunoLima972
...,...,...,...
2573,991789461485875200,M Lu! 🦄🏳️‍🌈🌈,MLPN1976
2574,993156040253468674,@NeemiasBertyno 🇧🇷🇧🇷🇧🇷🇧🇷,BertynosTata
2575,993999368511475718,joão Antonio,Joao_Aoficial
2576,994628533787025408,Jane Peixoto,janepeixoto787


In [24]:
# Density-based clusterer; a group needs at least 10 members to count as a cluster.
# (`hdbscan` is imported with the other dependencies in the notebook's import cell.)
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)

In [25]:
# Fit the clusterer on the vectors in `yhat` and keep one label per row.
cluster_labels = clusterer.fit_predict(X=yhat)

In [26]:
# Show the label assigned to each row.
cluster_labels

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [27]:
# One-column frame of labels with the default positional index, grouped by
# label so that each group lists the row positions sharing that label.
df = pd.DataFrame(data={'cluster_labels': cluster_labels})

label_column = df['cluster_labels']
clusters = df.groupby(by=label_column)
clusters.groups

{-1: [9, 15, 17, 26, 30, 36, 52, 59, 80, 96, 99, 137, 147, 158, 159, 198, 203, 208, 217, 229, 258, 259, 260, 271, 283, 289, 292, 301, 335, 349, 353, 380, 389, 400, 402, 416, 427, 429, 445, 447, 449, 457, 461, 467, 468, 471, 485, 495, 499, 505, 511, 516, 526, 530, 537, 539, 541, 561, 565, 569, 570, 575, 591, 593, 596, 617, 626, 637, 638, 640, 682, 692, 696, 709, 710, 715, 722, 736, 744, 751, 798, 801, 808, 811, 821, 824, 836, 841, 848, 850, 857, 858, 874, 876, 900, 903, 906, 912, 957, 977, ...], 0: [225, 352, 408, 420, 550, 562, 669, 727, 769, 1339, 1545, 1593, 1674, 1806, 1971, 2065, 2105, 2376, 2390], 1: [47, 186, 382, 574, 905, 1080, 1320, 1486, 1608, 2041, 2486], 2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92

In [28]:
# cluster_list = [48, 187, 232, 334, 431, 595, 621, 764, 829, 859, 956, 1613, 1676, 2260, 2418, 2472]

In [29]:
# Row positions carrying label -1 (HDBSCAN's label for noise points).
# Looking the label up with a default yields an empty list when no row has
# that label, instead of the KeyError that get_group(-1) raises.
cluster_list = list(clusters.groups.get(-1, []))

In [30]:
# Show the author rows whose index appears in cluster_list.
# NOTE(review): assumes author_df's index equals the row positions used for
# clustering - confirm after any change to the merge.
in_selection = author_df.index.isin(cluster_list)
author_df[in_selection]

Unnamed: 0,author_id,author_name,author_username
9,100615509,@SOS_RIO,SOS_RIO
15,1011817655957893120,Rstats,rstatstweet
17,1012469182359842816,LJ maior escândalo judic da história da humani...,Mariademirai
26,1017960921107398661,Milionário,ganbiarrero
30,1020263844390875136,Dinda Maria 🇧🇷🚩🚩🚩,sampa_vip
...,...,...,...
2531,962789073638383616,JôJôAngel🇧🇷🇧🇷🇧🇷,bonita67391798
2537,966661637859172354,ger-🇧🇷🇮🇱 🇺🇸Republicano,gvenicio
2541,97250580,Rádio Proletário,RadioProletrio
2545,974392735238119424,Fabio Soares 🇧🇷🚩,FabioSoares0
