# SETUP

In [None]:
# mount drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/TESI/

In [None]:
%%capture
# Install the notebook's extra dependencies; %%capture keeps pip's output out of the notebook.
# NOTE(review): neither install is version-pinned, so a re-run may pull different releases.
!pip install -U plotly
# GSDMM is installed straight from its GitHub repository.
!pip install git+https://github.com/RaffaeleMorganti/gsdmm.git

In [None]:
import pandas as pd, numpy as np, numba as nb, pickle, copy
import plotly.express as px, plotly.graph_objects as go
from gsdmm import GSDMM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster import hierarchy

In [None]:
# Load the pre-processed corpora. Files are addressed relative to the project
# folder instead of changing directory back and forth, so a failed read cannot
# leave the notebook stranded in the wrong working directory.
PARQUET_DIR = "testi/parquet"
esp = pd.read_parquet(f"{PARQUET_DIR}/ESPERTI_PS.pqt")
ist = pd.read_parquet(f"{PARQUET_DIR}/ISTITUZIONI_PS.pqt")
reg = pd.read_parquet(f"{PARQUET_DIR}/REGIONI_PS.pqt")
new = pd.read_parquet(f"{PARQUET_DIR}/NEWS_PS.pqt")
# The tweet data is split over two files that are joined column-wise.
# `axis` must be passed by keyword: recent pandas rejects it as a positional argument.
twi = pd.concat(
  (pd.read_parquet(f"{PARQUET_DIR}/TWEET_P.pqt"), pd.read_parquet(f"{PARQUET_DIR}/TWEET_S.pqt")),
  axis=1,
)

In [None]:
# Tag every corpus with the label of the source it comes from.
for frame, label in (
  (esp, "Esperti"),
  (ist, "Istituzioni"),
  (reg, "Regioni"),
  (twi, "Tweet"),
  (new, "Notizie"),
):
  frame["dataset"] = label

# The news corpus stores its timestamp under "date"; rename it to "datetime".
new.rename(columns={"date":"datetime"},inplace=True)

In [None]:
def _daily_sample_index(frame, frac):
  """Return the index labels of a random sample drawn separately from each day of `frame`.

  Rows are grouped by calendar day of the "datetime" column and a fraction
  `frac` of every group is drawn with a fixed random state (1).
  """
  per_day = frame.groupby([pd.Grouper(key="datetime",freq="D")])
  return per_day.sample(frac=frac,random_state=1).index

# Half of the news and regional texts and a tenth of the tweets, per day.
newSub = _daily_sample_index(new, .5)
regSub = _daily_sample_index(reg, .5)
twiSub = _daily_sample_index(twi, .1)

# GSDMM clustering

In [None]:
def sub(x, r=None, c=["preprocess"]):
  """Return columns `c` of `x`, restricted to the rows labelled `r` when `r` is given."""
  rows = slice(None) if r is None else r
  return x.loc[rows, c]
# `sampled`: the sub-sampled tweets/regions/news plus all expert and institution texts.
sampled_parts = [sub(twi,twiSub), sub(reg,regSub), sub(new,newSub), sub(esp), sub(ist)]
sampled = pd.concat(sampled_parts, ignore_index=True)

# `full`: every text of every corpus, stacked in the same order.
full_parts = [sub(twi), sub(reg), sub(new), sub(esp), sub(ist)]
full = pd.concat(full_parts, ignore_index=True)

In [None]:
# Fit GSDMM on the sub-sampled corpus and save the fitted model to disk.
gsdmm_params = dict(
  clust=200,
  n_iters=30,
  alpha=0.1,
  beta=0.1,
  seed=1,
  verbose=True,
  min_df=1e-3,
  token_pattern=r"(?u)\b(?<!\.|\/|\?|#)\w{3,}(?!:|\.)\b",
)
gsdmm = GSDMM(**gsdmm_params)
cat = gsdmm.fit(sampled.preprocess)

with open("MODELS/GSDMM.pkl","wb") as model_file:
  pickle.dump(gsdmm, model_file)

In [None]:
# Reload the fitted model from MODELS/GSDMM.pkl.
# NOTE: unpickling runs arbitrary code, so only load files produced by this notebook.
with open("MODELS/GSDMM.pkl","rb") as model_file:
  gsdmm = pickle.load(model_file)

In [None]:
# Save the word clouds of importance rows 0..199 as 8 images of 25 clouds each.
imp = gsdmm.get_avg_importances()
for i in range(8):
  first, last = i*25, (i+1)*25
  fig = gsdmm.get_wordclouds(
    imp[first:last],
    plot={'figsize':(18,10)},
    ncol=5,
    names=np.arange(first, last),  # label each cloud with its row number
  )
  fig.savefig("FILES/clust%d.jpg"%i)

In [None]:
def clusters(d, s, l="average"):
    """Agglomerative clustering of m items from a distance matrix and item weights.

    At every step the two closest active items are merged and the distances
    from the merged item to all others are recomputed with the chosen linkage.

    Parameters
    ----------
    d : (m, m) array-like
        Symmetric pairwise distances. The diagonal is ignored. The caller's
        array is not modified.
    s : (m,) array-like
        Weight (size) of each item, used by the "average" and "ward" updates.
        The caller's array is not modified.
    l : {"single", "complete", "average", "weighted", "ward"}
        Linkage rule. The "ward" update is applied to the distances exactly
        as given (they are not squared first).

    Returns
    -------
    (m - 1, 4) float array
        One row per merge: ids of the two merged items, their distance, and
        the number of original items in the merged group. Original items have
        ids 0..m-1 and the group created by merge k has id m + k. This is the
        layout consumed by scipy.cluster.hierarchy.

    Raises
    ------
    ValueError
        If `l` is not one of the supported linkage names.
    """
    if l not in ("single", "complete", "average", "weighted", "ward"):
        raise ValueError("unknown linkage %r" % (l,))

    # Work on private copies: the loop below overwrites both arrays.
    d = np.array(d, dtype=float)
    s = np.array(s)

    m = s.shape[0]
    name = np.arange(m)          # current id of the group stored in each slot
    count = np.ones(m)           # number of original items in each slot
    clust = np.zeros((m - 1, 4))
    np.fill_diagonal(d, np.inf)  # an item is never merged with itself
    for k in range(m - 1):
        # Closest pair; `d` is symmetric, so which index is kept as the
        # surviving slot (`col`) is only a matter of tie-breaking.
        col, row = np.unravel_index(np.argmin(d), d.shape)
        count[col] += count[row]
        clust[k, :] = (name[col], name[row], d[col, row], count[col])
        n = s[row] + s[col]
        # When both groups have zero weight the survivor keeps its distances.
        if n != 0:
            if l == "single":
                temp = d[[col, row], :].min(0)
            elif l == "complete":
                temp = d[[col, row], :].max(0)
            elif l == "average":
                temp = (d[col, :] * s[col] + d[row, :] * s[row]) / n
            elif l == "weighted":
                temp = (d[col, :] + d[row, :]) / 2
            else:  # "ward"
                temp = ((s + s[col]) * d[col, :] + (s + s[row]) * d[row, :] - s * d[col, row]) / (n + s)
            # NaN (e.g. 0 * inf against an already merged slot) means "not
            # mergeable". posinf is passed explicitly because nan_to_num would
            # otherwise turn inf into the largest finite float.
            temp = np.nan_to_num(temp, nan=np.inf, posinf=np.inf)
            d[col, :], d[:, col] = temp, temp
            d[col, col] = np.inf
        # Retire the absorbed slot.
        d[row, :], d[:, row] = np.inf, np.inf
        name[col], s[col] = k + m, n

    return clust

def clustGSDMM(original, cluster):
  """Return a copy of a fitted GSDMM model whose clusters are merged groups of the original ones.

  Parameters
  ----------
  original : fitted GSDMM model
      Deep-copied; the object passed in is left untouched.
  cluster : array-like of int
      One label per original cluster, with labels in 1..n (the format
      returned by scipy.cluster.hierarchy.fcluster). Original clusters that
      share label j are summed into merged cluster j - 1.

  Returns
  -------
  A deep copy of `original` whose private `K`, per-cluster document counts,
  per-cluster word counts and training matrix are replaced by the sums over
  each merged group.
  """
  cluster = np.asarray(cluster)
  n = int(cluster.max())
  cdc = original._GSDMM__clust_doc_count
  cwc = original._GSDMM__clust_word_count
  tm = original._GSDMM__train_matrix
  model = copy.deepcopy(original)
  model._GSDMM__K = n
  model._GSDMM__clust_doc_count = np.zeros(n, int)
  model._GSDMM__clust_word_count = np.zeros(n, int)
  model._GSDMM__train_matrix = np.zeros((n, tm.shape[1]))
  for i in range(n):
    sel = cluster == (i + 1)
    model._GSDMM__clust_doc_count[i] = cdc[sel].sum()
    model._GSDMM__clust_word_count[i] = cwc[sel].sum()
    # Write row i only; a slice `[i:]` would also overwrite every later row.
    model._GSDMM__train_matrix[i] = tm[sel].sum(0)
  return model


In [None]:
# Scale every importance row to sum to 1; rows that sum to 0 produce NaN, which is mapped to 0.
imp = gsdmm.get_avg_importances(True)[0]
row_totals = imp.sum(1).reshape((imp.shape[0],1))
mat = np.nan_to_num(imp / row_totals)

# Hierarchy over the rows: L1 distances, custom "ward" linkage.
# The weights are the first column of get_clust_info() — presumably the
# per-cluster document counts; confirm against the gsdmm package.
x = clusters(pairwise_distances(mat,metric='l1'), gsdmm.get_clust_info()[:,0], "ward")

In [None]:
# Draw the dendrogram of the hierarchy in `x` on a wide figure;
# color_threshold=2.2 is the height under which sub-trees get their own colour.
# NOTE(review): `plt` is not imported in the visible import cell — this cell needs
# `import matplotlib.pyplot as plt` to run on a fresh kernel.
plt.figure(figsize=(20,5))
q=hierarchy.dendrogram(x,color_threshold=2.2,)

In [None]:
new = clustGSDMM(gsdmm,hierarchy.fcluster(x,20,"maxclust"))

In [None]:
new.get_wordclouds(new.get_avg_importances(),4,plot={'figsize':(14,10)}).savefig("FILES/clustSMALL.jpg")

In [None]:
with open("MODELS/GSDMM20.pkl","wb") as f:
  pickle.dump(new, f)

# Plot

In [None]:
def sub(x):
  """Return the "pos" column of `x` as a one-column DataFrame."""
  return x.loc[:,["pos"]]

# Stack the "pos" column of the five corpora; `new` must be the news DataFrame here.
merged = pd.concat([sub(frame) for frame in (twi,reg,new,esp,ist)],ignore_index=True)

In [None]:
def stackDF(pos):
  """Turn per-document tag -> token-list mappings into a table of space-joined strings.

  Parameters
  ----------
  pos : iterable of dict
      One mapping per document, from a tag name to a list of strings
      (presumably part-of-speech tags to tokens; confirm against the
      pre-processing step).

  Returns
  -------
  pandas.DataFrame
      One row per document and one column per tag, each cell holding the
      tokens joined by single spaces. The PUNCT, SPACE, NUM, SYM, INTJ, PRON
      and X columns are removed. A document without a given tag keeps a
      missing value in that column.
  """
  df = pd.DataFrame(list(pos))
  # errors="ignore": a tag that never occurs in `pos` has no column to drop.
  df = df.drop(columns=["PUNCT", "SPACE", "NUM", "SYM", "INTJ", "PRON", "X"], errors="ignore")
  for col in df:
    df[col] = df[col].str.join(" ")
  return df

In [None]:
# Per-tag text table for the regional corpus.
df = stackDF(reg["pos"])

In [None]:
def clustFreq(df,category,freq="M"):
  """Stacked-area chart of how the topics of one dataset are distributed over time.

  Parameters
  ----------
  df : DataFrame with "topic", "dataset" and "datetime" columns.
  category : value of "dataset" whose rows are plotted.
  freq : pandas offset alias giving the width of the time bins.

  Returns
  -------
  A plotly figure. For every time bin the rows per topic are counted and
  shown as percentages of the bin (groupnorm="percent"). Topics 0..23 are
  mapped to fixed colours from the Light24 palette.
  """
  in_category = df.dataset == category
  topics = df.topic[in_category].rename("clust")
  counts = (
    pd.DataFrame({"dt": df.datetime[in_category], "cl": topics})
    .groupby([pd.Grouper(key="dt", freq=freq), topics])
    .count()
    .reset_index()
  )
  topic_ids = range(24)
  palette = dict(zip(topic_ids, px.colors.qualitative.Light24))
  return px.area(
    counts,
    x="dt", y="cl", groupnorm="percent",
    color="clust", line_group="clust",
    category_orders={"clust": list(topic_ids)},
    color_discrete_map=palette,
    range_y=(0,100), line_shape="spline",
  )


In [None]:
# One chart per corpus: monthly bins for the two small corpora, two-week bins for the rest.
# NOTE(review): `full` as built in the visible cells only carries the "preprocess"
# column, while clustFreq reads "topic", "dataset" and "datetime" — confirm where
# those columns are added.
for category, freq in (
  ("Esperti", "M"),
  ("Istituzioni", "M"),
  ("Regioni", "2W"),
  ("Tweet", "2W"),
  ("Notizie", "2W"),
):
  clustFreq(full, category, freq).show()