In [2]:
import tensorflow as tf

from tensorflow.keras import layers, losses
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input,LSTM,RepeatVector,Dense,Lambda,TimeDistributed,Masking
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.utils import normalize

import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler,StandardScaler
import glob,os
from pathlib import Path
from scipy import stats

In [3]:
# with open("out.csv",'r') as dest_f:
#     data_iter = csv.reader(dest_f,delimiter = ",")
#     data = [np.array(data,dtype=int) for data in data_iter]
#     # data = [x.reshape(x.shape[0],1,1) for x in data]
# data_array = np.asarray(data, dtype = object)
# data_array
# padded_array2 = tf.keras.preprocessing.sequence.pad_sequences(data_array2,padding="post")

In [4]:
def load_time_series(directory, pattern="*.csv"):
    """Read every CSV in ``directory`` into a flat integer array.

    Parameters
    ----------
    directory : str or path-like
        Folder holding the headerless CSV files.
    pattern : str, default "*.csv"
        Glob pattern used to select files inside ``directory``.

    Returns
    -------
    paths : list of str
        Matched file paths, sorted.
    series : list of numpy.ndarray
        One 1-D integer array per file, aligned with ``paths``.
    ids : list of str
        File name without extension for each entry of ``paths``.
    """
    # glob.glob returns matches in arbitrary, OS-dependent order; sorting keeps
    # row i tied to the same file on every run and platform.
    paths = sorted(glob.glob(os.path.join(directory, pattern)))
    series = []
    ids = []
    for path in paths:
        frame = pd.read_csv(path, dtype=int, header=None)
        # ravel() flattens row- and column-oriented files alike and, unlike
        # squeeze(), still yields an array when a file holds a single value.
        series.append(frame.transpose().to_numpy().ravel())
        ids.append(Path(path).stem)
    return paths, series, ids


all_files, data_array, author_id = load_time_series("time_series/")

# Object dtype because the individual series may have different lengths.
data_array = np.array(data_array, dtype=object)

In [5]:
# Pad at the end ("post") so every series has the same length.
padded_array = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=data_array,
    padding="post",
)
# padded_array = np.transpose(padded_array)

In [6]:
# Add a trailing feature axis of size 1: [samples, timesteps, features].
padded_array = padded_array.reshape(padded_array.shape[:2] + (1,))
padded_array.shape

(2577, 2754, 1)

In [7]:
# Inspect the padded tensor; the trailing zeros in each series are the post-padding.
padded_array

array([[[ -5729],
        [ 10043],
        [   -30],
        ...,
        [     0],
        [     0],
        [     0]],

       [[-66825],
        [ 40625],
        [  -799],
        ...,
        [     0],
        [     0],
        [     0]],

       [[-61303],
        [ 52578],
        [  -472],
        ...,
        [     0],
        [     0],
        [     0]],

       ...,

       [[-37978],
        [ 29976],
        [ -1810],
        ...,
        [     0],
        [     0],
        [     0]],

       [[ -6164],
        [  2544],
        [  -129],
        ...,
        [     0],
        [     0],
        [     0]],

       [[ -6854],
        [ 21276],
        [-53478],
        ...,
        [     0],
        [     0],
        [     0]]])

In [8]:
# Scale all values with one MaxAbsScaler: flatten to a (n_values, features)
# matrix for scikit-learn, then restore the 3-D shape afterwards.
original_shape = padded_array.shape
scaler = MaxAbsScaler()
padded_array = scaler.fit_transform(
    padded_array.reshape(-1, original_shape[-1])
).reshape(original_shape)

In [9]:
# Check what the padding value 0 becomes after scaling.
scaler.transform([[0]])

array([[0.]])

In [10]:
# Reset every entry equal to the scaled padding value to exactly 0 --
# presumably so the Masking layer (mask_value=0) still recognises padding.
scaled_pad_value = scaler.transform([[0]])
padded_array = np.where(padded_array == scaled_pad_value, 0, padded_array)

In [11]:
# Inspect the scaled tensor.
padded_array

array([[[-1.07479718e-04],
        [ 1.88413127e-04],
        [-5.62819259e-07],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.25367990e-03],
        [ 7.62151080e-04],
        [-1.49897529e-05],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.15008363e-03],
        [ 9.86397034e-04],
        [-8.85502301e-06],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       ...,

       [[-7.12491661e-04],
        [ 5.62369004e-04],
        [-3.39567620e-05],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.15640597e-04],
        [ 4.77270732e-05],
        [-2.42012281e-06],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.28585440e-04],
        [ 3.99151419e-04],
        [-1.00328161e-03],
        ...,
        

In [42]:
timesteps = padded_array.shape[1]  # padded sequence length
features = padded_array.shape[2]   # values per timestep
latent_dim = 16                    # size of the bottleneck vector
inter_dim = 64                     # width of the outer LSTM layers

# LSTM autoencoder: squeeze each sequence into `latent_dim` numbers, then
# rebuild the full sequence from that vector.
model = Sequential()
# Timesteps whose features all equal 0 are masked for the encoder LSTMs.
model.add(Masking(mask_value=0, input_shape=(timesteps, features)))
model.add(LSTM(inter_dim, activation='tanh', return_sequences=True))
# return_sequences=False keeps only the last output: the latent vector.
model.add(LSTM(latent_dim, activation='tanh', return_sequences=False))
# Present the latent vector to the decoder once per output timestep.
model.add(RepeatVector(timesteps))
model.add(LSTM(latent_dim, activation='tanh', return_sequences=True))
model.add(LSTM(inter_dim, activation='tanh', return_sequences=True))
# One output per input feature, so the reconstruction has the input's shape
# (the original hard-coded 1, which only works while features == 1).
model.add(TimeDistributed(Dense(features, activation="tanh")))

# NOTE(review): the predictions at padded positions are non-zero, which suggests
# the mask does not reach the decoder/loss and padded timesteps are included in
# the MSE -- confirm, and consider weighting them out.

# Reconstruction is a regression problem: 'accuracy' is a classification
# metric and carries no meaning for continuous targets, so report MAE instead.
model.compile(optimizer='nadam', loss='mse', metrics=['mae'])

In [43]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_3 (Masking)         (None, 2754, 1)           0         
                                                                 
 lstm_12 (LSTM)              (None, 2754, 64)          16896     
                                                                 
 lstm_13 (LSTM)              (None, 16)                5184      
                                                                 
 repeat_vector_3 (RepeatVect  (None, 2754, 16)         0         
 or)                                                             
                                                                 
 lstm_14 (LSTM)              (None, 2754, 16)          2112      
                                                                 
 lstm_15 (LSTM)              (None, 2754, 64)          20736     
                                                      

In [44]:
# Autoencoder training: the input array is also the target.
model.fit(
    x=padded_array,
    y=padded_array,
    batch_size=256,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2db5b4659a0>

In [45]:
# Save the trained autoencoder to disk.
model.save(filepath="trained_model.h5")

In [46]:
# Take the first sequence and put a batch axis of size 1 in front of it.
test = padded_array[0][np.newaxis, ...]
test.shape

(1, 2754, 1)

In [47]:
# Inspect the single-sequence batch.
test

array([[[-1.07479718e-04],
        [ 1.88413127e-04],
        [-5.62819259e-07],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]]])

In [48]:
# Reconstruct the test sequence with the full autoencoder and show the result.
yhat = model.predict(test)
print(yhat.shape, yhat, sep="\n")

(1, 2754, 1)
[[[-0.00052311]
  [-0.00079816]
  [-0.00099661]
  ...
  [-0.00156379]
  [-0.00156379]
  [-0.00156379]]]


In [49]:
# Encoder-only model: same input as the autoencoder, output taken from
# layers[2], the LSTM that emits the latent vector.
bottleneck = model.layers[2].output
latent_model = Model(inputs=model.inputs, outputs=bottleneck)
latent_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_3_input (InputLayer  [(None, 2754, 1)]        0         
 )                                                               
                                                                 
 masking_3 (Masking)         (None, 2754, 1)           0         
                                                                 
 lstm_12 (LSTM)              (None, 2754, 64)          16896     
                                                                 
 lstm_13 (LSTM)              (None, 16)                5184      
                                                                 
Total params: 22,080
Trainable params: 22,080
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Encode the single test sequence (note: `yhat` is reused for latent vectors here).
yhat = latent_model.predict(test)
print(yhat.shape, yhat, sep="\n")

(1, 16)
[[-1.10528585e-04  1.99928484e-03 -4.99854621e-04 -4.29386244e-04
   7.52623950e-04  9.39589343e-04 -3.05009802e-04  5.46530937e-04
   2.57382693e-04 -1.33266463e-03  2.70463090e-04  6.47877168e-05
   8.72704957e-04 -1.50392740e-03 -7.91431114e-04  3.06083879e-04]]


In [51]:
# Encode every sequence; row i of yhat is the latent vector of padded_array[i].
yhat = latent_model.predict(padded_array)
for shown in (yhat.shape, yhat):
    print(shown)

(2577, 16)
[[-1.10528694e-04  1.99928484e-03 -4.99854563e-04 ... -1.50392740e-03
  -7.91430939e-04  3.06083879e-04]
 [-1.38585980e-04  1.98296341e-03 -5.37110609e-04 ... -1.52525119e-03
  -8.35380575e-04  2.78159539e-04]
 [-1.05941799e-04  2.01873691e-03 -4.30637767e-04 ... -1.59339060e-03
  -7.58526905e-04  2.56383588e-04]
 ...
 [-9.60048637e-05  1.99999567e-03 -4.19092947e-04 ... -1.52455771e-03
  -7.63957214e-04  3.07246839e-04]
 [-1.17536620e-04  2.04112031e-03 -4.98300244e-04 ... -1.62801030e-03
  -7.58779061e-04  2.16820816e-04]
 [-1.03645223e-04  1.97955454e-03 -4.36787988e-04 ... -1.48611027e-03
  -7.90578546e-04  3.27935209e-04]]


In [52]:
# Spot-check one row of the prediction array.
yhat[98]

array([-8.2630126e-05,  1.9979086e-03, -4.1578792e-04, -3.8343939e-04,
        7.7886792e-04,  1.0148585e-03, -4.0159855e-04,  6.1453332e-04,
        3.9084116e-04, -1.3173305e-03,  3.7004720e-04,  6.4609099e-05,
        7.9759670e-04, -1.4740201e-03, -7.4849121e-04,  3.3833893e-04],
      dtype=float32)

In [74]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import hdbscan
# min_cluster_size=2 lets a cluster be as small as a pair of points.
clusterer = hdbscan.HDBSCAN(min_cluster_size=2)

In [75]:
# Cluster the rows of yhat -- presumably the latent vectors of all sequences;
# this relies on the preceding cell having been run last. One label per row.
cluster_labels = clusterer.fit_predict(yhat)

In [76]:
# Inspect the label assigned to each row.
cluster_labels

array([ 94,  -1, 209, ..., 171,  -1, 130], dtype=int64)

In [87]:
# One row per sample; the row position is the sample's index in the model input.
df = pd.DataFrame({'cluster_labels': cluster_labels})

# Group row positions by label and list the members of every group.
clusters = df.groupby(by=df['cluster_labels'])
clusters.groups

{-1: [1, 3, 7, 8, 9, 10, 11, 15, 19, 20, 23, 25, 26, 27, 28, 29, 31, 32, 33, 34, 36, 37, 38, 39, 41, 42, 44, 46, 47, 55, 56, 62, 63, 67, 68, 72, 76, 79, 80, 86, 88, 89, 90, 92, 93, 94, 95, 98, 102, 104, 106, 108, 112, 115, 118, 122, 123, 127, 132, 133, 136, 137, 138, 141, 142, 145, 146, 148, 149, 151, 153, 155, 156, 158, 159, 160, 166, 167, 170, 171, 183, 195, 196, 198, 204, 206, 208, 214, 215, 217, 220, 221, 226, 229, 231, 233, 234, 239, 243, 247, ...], 0: [1581, 2529, 2545], 1: [259, 561, 1425, 1656], 2: [99, 751, 1906], 3: [516, 591, 596], 4: [526, 1129], 5: [283, 1148, 1478, 1867], 6: [17, 427, 617, 986, 1727, 2526], 7: [499, 575, 2400], 8: [203, 2075], 9: [710, 1415, 1533, 2220], 10: [271, 570, 744, 912, 1493, 1519, 1914], 11: [416, 539, 841], 12: [258, 301, 696, 824, 1254, 1405, 1501, 1765], 13: [408, 420, 1677, 2376], 14: [550, 769, 2105], 15: [1339, 1545, 1674], 16: [225, 1294, 1891, 2228], 17: [1142, 1466], 18: [30, 147, 400, 402, 530, 640, 1052, 1158, 1214, 1362, 1593, 1681, 

In [94]:
# Row positions carrying label -1, the label HDBSCAN gives to points it treats
# as noise rather than as members of a cluster.
noise_rows = clusters.get_group(-1)
cluster_list = list(noise_rows.index)

In [64]:
# One row per loaded file, in the same order as the rows of the model input.
author_df = pd.DataFrame(author_id, columns=["author_id"])
# Read ids as strings so they compare equal to the file-name stems.
retweet_df = pd.read_csv("authors.csv", index_col=0, dtype={'author_id': str})
# Name the join key explicitly instead of relying on whichever columns the two
# frames happen to share.
author_df = author_df.merge(retweet_df, how='left', on="author_id")
# Rows are selected later by positional index, so the merge must keep exactly
# one row per author; a duplicated id in authors.csv would add rows and shift
# every following position.
assert len(author_df) == len(author_id), (
    "merge changed the row count: authors.csv contains duplicate author_id values"
)
author_df

Unnamed: 0,author_id,author_name,author_username
0,1000130222338002944,Graça Porte,GraaPortela1
1,10002992,Zissazin,taissazin
2,1000378112612470785,Robferreira🇧🇷,RobsonF91678435
3,1000457428075466754,Linda,elinalv12774504
4,1000809759187095558,Bruno Lima 97,BrunoLima972
...,...,...,...
2573,991789461485875200,M Lu! 🦄🏳️‍🌈🌈,MLPN1976
2574,993156040253468674,@NeemiasBertyno 🇧🇷🇧🇷🇧🇷🇧🇷,BertynosTata
2575,993999368511475718,joão Antonio,Joao_Aoficial
2576,994628533787025408,Jane Peixoto,janepeixoto787


In [93]:
# cluster_list = [48, 187, 232, 334, 431, 595, 621, 764, 829, 859, 956, 1613, 1676, 2260, 2418, 2472]

In [95]:
# Show the authors whose row position is in the selected list.
is_selected = author_df.index.isin(cluster_list)
author_df[is_selected]

Unnamed: 0,author_id,author_name,author_username
1,10002992,Zissazin,taissazin
3,1000457428075466754,Linda,elinalv12774504
7,1002737807331381248,Cecílio,Ceclio16
8,1003801362826776576,PL 490 NÃO,Nitrogl41152579
9,100615509,@SOS_RIO,SOS_RIO
...,...,...,...
2566,984106671558537217,Sônia Virgínia🇧🇷⚖,SoniaVirginia14
2568,988262919950536704,Zé Dias Estrela Da Noite⭐ 🌍🌃⭐⭐🌃🍂,EstrelaDANoite8
2569,989911675959742464,Luiza🇸🇾🇧🇷,Luizapimenta12
2571,991369128081534976,mauriciotio389@gmail.com,ma212424
