In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model,Model
import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import MaxAbsScaler
import hdbscan

In [2]:
model = load_model("saved-model-best.hdf5")

In [3]:
latent_model = Model(inputs=model.inputs, outputs=model.layers[2].output)
latent_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_input (InputLayer)  [(None, 28092, 1)]        0         
                                                                 
 masking (Masking)           (None, 28092, 1)          0         
                                                                 
 lstm (LSTM)                 (None, 28092, 64)         16896     
                                                                 
 lstm_1 (LSTM)               (None, 16)                5184      
                                                                 
Total params: 22,080
Trainable params: 22,080
Non-trainable params: 0
_________________________________________________________________


In [4]:
with open("out.csv",'r') as dest_f:
    data_iter = csv.reader(dest_f,delimiter = ",")
    data = [np.array(data,dtype=float) for data in data_iter]
    # data = [x.reshape(x.shape[0],1,1) for x in data]
data_array = np.asarray(data, dtype = object)

In [5]:
with open("out_authors.csv",'r') as dest_f:
    data_iter = csv.reader(dest_f,delimiter = ",")
    data = [np.array(data,dtype=str) for data in data_iter]
    # data = [x.reshape(x.shape[0],1,1) for x in data]
author_id = np.asarray(data, dtype = object)
author_id

array([['1000010400946118656'],
       ['1000042745380134912'],
       ['1000130222338002944'],
       ...,
       ['999978890058706944'],
       ['999983906026901504'],
       ['999993840982482944']], dtype=object)

In [6]:
padded_array = tf.keras.preprocessing.sequence.pad_sequences(data_array,padding="post")
padded_array = padded_array.reshape((padded_array.shape[0],padded_array.shape[1],1))
scaler = MaxAbsScaler()
padded_array = scaler.fit_transform(padded_array.reshape(-1, padded_array.shape[-1])).reshape(padded_array.shape)
padded_array.shape

(11117, 28092, 1)

In [7]:
yhat = latent_model.predict(padded_array)
print(yhat.shape)
print(yhat)

(11117, 16)
[[ 0.0277121   0.01236769 -0.00647678 ... -0.03303443  0.0185124
  -0.01665536]
 [ 0.02785833  0.01247913 -0.00645102 ... -0.03319067  0.0184727
  -0.01670795]
 [ 0.02802533  0.01285665 -0.00677935 ... -0.03348193  0.01823122
  -0.01696554]
 ...
 [ 0.02739336  0.01164308 -0.00586851 ... -0.03235722  0.01900453
  -0.01620082]
 [ 0.02794638  0.01273355 -0.00671075 ... -0.03334717  0.01830301
  -0.01689139]
 [ 0.02760782  0.0121729  -0.00633164 ... -0.03285101  0.01863324
  -0.01652762]]


In [8]:
author_df = pd.DataFrame(author_id,columns=["author_id"])
retweet_df = pd.read_csv("authors_jan_fev.csv",index_col=0, dtype={'author_id':str})
author_df = author_df.merge(retweet_df,how='left')
author_df

  mask |= (ar1 == a)


Unnamed: 0,author_id,author_name,author_username
0,1000010400946118656,daniela 💆🏻‍♀️,bazaaadaqui
1,1000042745380134912,pārtÿ änîmål🇸🇻,fortebeatrizz
2,1000130222338002944,Graça Porte,GraaPortela1
3,1000168525145821186,Juraci Moreira,Juracimoreira2
4,1000288372676550656,inês,inesCsilvaa
...,...,...,...
11136,999975222974525440,Carlos Furlanetto,CarlosFurlanet2
11137,999976112,Marcelo,mc_carneiro
11138,999978890058706944,margarida.,sprousebich
11139,999983906026901504,Coronel Ricardo EB,Josericardopon7


In [9]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)

In [10]:
cluster_labels = clusterer.fit_predict(yhat)
cluster_labels

array([-1, -1, 36, ..., -1, -1, -1], dtype=int64)

In [11]:
df = pd.DataFrame({'cluster_labels':cluster_labels})

clusters = df.groupby(by=df.cluster_labels)
clusters.groups

{-1: [0, 1, 3, 4, 5, 7, 8, 12, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 36, 38, 40, 41, 42, 44, 49, 50, 52, 53, 54, 56, 58, 59, 61, 64, 65, 66, 67, 68, 69, 70, 71, 72, 76, 77, 80, 81, 83, 84, 85, 86, 87, 88, 91, 92, 94, 95, 96, 97, 98, 100, 101, 102, 107, 109, 110, 112, 115, 117, 120, 121, 123, 124, 126, 127, 128, 129, 130, 132, 133, 134, 135, 136, 138, 140, 141, 142, 143, 147, 148, 150, 151, 154, ...], 0: [173, 753, 1148, 1957, 2277, 2734, 2977, 2979, 3747, 3986, 3988, 4166, 4311, 4318, 5519, 5541, 9636, 9653, 9903, 10196, 10930], 1: [177, 1217, 1350, 1648, 2312, 4088, 4148, 4160, 4295, 4323, 5858, 6396, 6842, 7429, 7558, 8998], 2: [90, 378, 471, 791, 1048, 1247, 1312, 1405, 2017, 2143, 2207, 2394, 2399, 2486, 2601, 2799, 2806, 2962, 3044, 3079, 3240, 3285, 3293, 3307, 3368, 3499, 3554, 3610, 3628, 3682, 3693, 3735, 3804, 3851, 3928, 3972, 4016, 4050, 4202, 4223, 4236, 4256, 4305, 4340, 4358, 4642, 5028, 5053, 5476, 5478, 5560, 5578, 5630, 5726, 6647, 68

In [12]:
cluster_list = list(clusters.get_group(-1).index)
author_df[author_df.index.isin(cluster_list)]

Unnamed: 0,author_id,author_name,author_username
0,1000010400946118656,daniela 💆🏻‍♀️,bazaaadaqui
1,1000042745380134912,pārtÿ änîmål🇸🇻,fortebeatrizz
3,1000168525145821186,Juraci Moreira,Juracimoreira2
4,1000288372676550656,inês,inesCsilvaa
5,10002992,Zissazin,taissazin
...,...,...,...
11112,994360452506374146,Mario Rodrigues de Oliveira,marioro23926206
11113,994628533787025408,Jane Peixoto,janepeixoto787
11114,994890695004639233,Trabalhista e Progressista 🌹💚 ⚖️🏡🆘,GuilhermePenta
11115,995066859824271363,Alessandro Sgarbossa,AlessandroSga11
