# SETUP

In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory: later cells open data and
# model files with paths relative to it.
# NOTE(review): the path is relative, so this assumes the current directory is
# the one that contains the `drive` mount point (i.e. the cell is run once).
%cd drive/MyDrive/TESI/

In [None]:
%%capture
# Install / upgrade the third-party packages imported by the next cell.
# %%capture (which must stay on the first line of the cell) hides pip's output.
# NOTE(review): no versions are pinned, so a fresh runtime may pull different
# releases than the ones this notebook was developed with.
!pip install -U plotly
!pip install -U scipy
!pip install umap-learn
# GSDMM is installed directly from a GitHub repository.
!pip install git+https://github.com/RaffaeleMorganti/gsdmm.git

In [None]:
import pandas as pd, numpy as np, numba as nb, pickle
import plotly.express as px, plotly.graph_objects as go
from umap.parametric_umap import load_ParametricUMAP
from gsdmm import GSDMM
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
%cd testi/parquet
esp = pd.read_parquet("ESPERTI_PS.pqt")
ist = pd.read_parquet("ISTITUZIONI_PS.pqt")
reg = pd.read_parquet("REGIONI_PS.pqt")
new = pd.read_parquet("NEWS_PS.pqt")
twi = twi = pd.concat((pd.read_parquet("TWEET_P.pqt"),pd.read_parquet("TWEET_S.pqt")),1)
%cd ../..

In [None]:
# Tag every frame with the label of the corpus it comes from; the label is used
# later to group the concatenated rows by dataset.
esp["dataset"] = "Esperti"
ist["dataset"] = "Istituzioni"
reg["dataset"] = "Regioni"
twi["dataset"] = "Tweet"
new["dataset"] = "Notizie"
# The news frame stores its timestamp under "date"; rename it so that it exposes
# the same "datetime" column that is selected from every frame afterwards.
new.rename(columns={"date":"datetime"},inplace=True)

In [None]:
def sub(x, r=None, c=None):
  """
  Return the columns shared by all corpora from frame ``x``.
  params:
      x: DataFrame to slice
      r: optional row selector passed to ``.loc`` (default None: all rows)
      c: list of column labels to keep (default None: the six analysis columns)
  """
  if c is None:
    # Must stay a list: a tuple would be read by .loc as a single MultiIndex key.
    c = ["dataset","datetime","preprocess","doc_vector","sentiment","emotion"]
  return x.loc[:, c] if r is None else x.loc[r, c]

# Stack all corpora into a single frame with a fresh 0..n-1 index.
full = pd.concat((sub(twi),sub(reg),sub(new),sub(esp),sub(ist)),ignore_index=True)
# Per-row weight = 1000 / (number of non-null `preprocess` rows in the same dataset),
# so that every dataset carries the same total weight regardless of its size.
# Only the `preprocess` column is counted instead of counting every column and
# then discarding all but one.
full["weight"] = 1e3 / full.groupby("dataset")["preprocess"].transform("count")

In [None]:
# Load the previously saved parametric UMAP model from the MODELS folder.
umap = load_ParametricUMAP("MODELS/UMAP")
# np.stack turns the column of per-document vectors into one 2-D array
# (one row per document) before projecting it with the loaded model.
fullEmb = umap.transform(np.stack(full.doc_vector))
# Keep the first two embedding coordinates as plotting coordinates.
# NOTE(review): presumably the model outputs exactly two components — confirm.
full["x"] = fullEmb[:,0]
full["y"] = fullEmb[:,1]

In [None]:
# Load the saved GSDMM topic model.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open("MODELS/GSDMM24.pkl","rb") as f:
  gsdmm = pickle.load(f)
# Store the model's prediction for every row's `preprocess` value as its topic.
full["topic"] = gsdmm.predict(full.preprocess)

# UMAP cluster

In [None]:
#@title NormalEllipse
 
class NormalEllipse:
    """
    Probability-contour ellipse of a bivariate normal distribution.

    The ellipse is centred on the mean and its axes follow the eigenvectors of
    the covariance matrix. For a probability mass ``p`` each semi-axis is
    ``sqrt(eigenvalue * -2 * log(1 - p))``; with the default
    ``p = 1 - exp(-1/2)`` that scale factor is 1, so the semi-axes equal the
    square roots of the eigenvalues (the one-standard-deviation contour).
    """

    def __init__(self, data=None, weights=None, mu=None, sigma=None):
        """
        initialize with binormal data, requires:
        data: 2d array with raw data (N x 2; an array whose second dimension
              is not 2 is used as is, i.e. treated as 2 x N)
        weights: optional per-observation weights, used for both the mean and
                 the covariance (passed to np.cov as `aweights`)
        or
        mu: means vector of size 2
        sigma: 2x2 variance-covariance matrix (array or nested list)
        raises:
            ValueError: if neither `data` nor both `mu` and `sigma` are given
        """
        if data is None:
            if mu is None or sigma is None:
                raise ValueError("NormalEllipse needs either `data` or both `mu` and `sigma`")
            # Convert to arrays: with a nested list, `sigma * -2` would be list
            # repetition (yielding an empty list) instead of a matrix scaling.
            self.__mu = np.asarray(mu)
            self.__cov = np.asarray(sigma)
        else:
            data = np.asarray(data)
            if data.shape[1] == 2:
                # observations are in rows: switch to one row per variable
                data = data.T
            self.__mu = np.average(data, axis=1, weights=weights)
            self.__cov = np.cov(data, aweights=weights)

        # Sentinel that never equals a valid p, so the first call computes the radii.
        self.__p = -1
        # eigh returns eigenvalues in ascending order, so column 0 is the
        # eigenvector of the smallest eigenvalue (the minor axis).
        _, V = np.linalg.eigh(self.__cov)
        self.__angle = np.arctan2(*V[::-1, 0])
        # The cosine is stored negated. The rotation built from these values is
        # therefore a reflected one, which is harmless here: an ellipse is
        # symmetric about its centre and axes, so both the contour and the
        # membership test are unaffected by the sign.
        self.__angles = {"cos": -np.cos(self.__angle), "sin": np.sin(self.__angle)}

    def params(self, p=1-np.exp(-1/2)):
        """
        return mu(x,y), radius(minor,major), angle(rad)
        The radii are the semi-axes in ascending order and the angle is the
        direction of the minor axis.
        params:
            p: probability mass enclosed by the ellipse
               (default 1 - exp(-1/2), the one-standard-deviation contour)
        """
        self.__params(p)
        return self.__mu, self.__radius, self.__angle

    def __params(self, p):
        # Recompute the semi-axes only when p differs from the cached one.
        if self.__p != p:
            self.__p = p
            self.__radius = np.sqrt(np.linalg.eigvalsh(self.__cov * -2 * np.log(1 - p)))

    def points(self, p=1-np.exp(-1/2), points=100):
        """
        return 2d array (n x 2) of points describing the closed contour ellipse
        params:
            p: probability mass enclosed by the ellipse
               (default 1 - exp(-1/2), the one-standard-deviation contour)
            points: maximum number of points to return (exact if points % 8 == 1);
                    8 * ceil(points / 8) - 7 points are produced
        """
        self.__params(p)

        # Squared, unit-scaled x coordinates on one quadrant: half of them are
        # sampled for x in [0, sqrt(0.5)] and mirrored to cover [0, 1].
        per_octant = int(np.ceil(points / 8))
        pt = np.linspace(0, np.sqrt(0.5), per_octant)**2
        pt = np.r_[pt[:-1], 1 - pt[::-1]]

        # First quadrant in the ellipse's own axes: x grows 0 -> r0 while y
        # shrinks r1 -> 0.
        x = np.sqrt(pt) * self.__radius[0]
        y = np.sqrt((1 - pt) * self.__radius[1]**2)
        # Chain the four quadrants, dropping the shared end point of each
        # segment except the last so the path closes on its starting point.
        x = np.r_[ x[:-1], x[::-1][:-1], -x[:-1], -x[::-1]]
        y = np.r_[ y[:-1], -y[::-1][:-1], -y[:-1], y[::-1]]

        # Rotate into data coordinates and translate to the mean.
        xpt = self.__mu[0] + (y * self.__angles["sin"] + x * self.__angles["cos"])
        ypt = self.__mu[1] + (y * self.__angles["cos"] - x * self.__angles["sin"])

        return np.c_[xpt,ypt]

    def inside(self, points, p=1-np.exp(-1/2)):
        """
        return boolean array, True for points inside (or on) the ellipse
        params:
            points: 2d array (n x 2) of coordinates; nested lists are accepted
            p: probability mass enclosed by the ellipse
               (default 1 - exp(-1/2), the one-standard-deviation contour)
        """
        self.__params(p)
        points = np.asarray(points)

        # Offsets from the centre.
        xc = points[:,0] - self.__mu[0]
        yc = points[:,1] - self.__mu[1]

        # Express the offsets in the ellipse's own axes.
        xct = xc * self.__angles["cos"] - yc * self.__angles["sin"]
        yct = xc * self.__angles["sin"] + yc * self.__angles["cos"]

        # Standard ellipse equation in its own axes.
        return ((xct/self.__radius[0])**2 + (yct/self.__radius[1])**2) <= 1


In [None]:
#@title Word Distribution in UMAP plot (sentiment and cluster)

# Figure-wide settings shared by the plots below; the dict is expanded as
# keyword arguments of go.Figure (plotly's "layout_<property>" shorthand).
layout = {
    # fixed axis ranges, light grid lines and no zero line on a dark background
    "layout_xaxis": {"range": (-13.5, 7), "gridcolor": "#666666", "zeroline": False},
    "layout_yaxis": {
        "range": (-16.5, 6.5),
        "gridcolor": "#666666",
        "zeroline": False,
        # lock the y scale to the x scale so ellipses are not distorted
        "scaleanchor": "x",
        "scaleratio": 1,
    },
    "layout_plot_bgcolor": "#333333",
    "layout_width": 800,
    "layout_height": 600,
}

def clustDistribution(coords,clust,weights=None,points=50,p=.9):
  """
  Fit one NormalEllipse per cluster.
  params:
      coords: 2d array (n x 2) of point coordinates
      clust: pandas Series with the cluster label of every point
      weights: optional array of per-point weights (default None: unweighted)
      points: maximum number of contour points per ellipse
      p: probability mass enclosed by each contour
  returns:
      clusters: distinct labels, in order of first appearance
      par: per cluster, flat array [mu_x, mu_y, radius_0, radius_1, angle]
      pts: per cluster, 2d array with the contour points
  """
  clusters = clust.unique()
  par = []
  pts = []
  for label in clusters:
    pos = clust == label
    # The default weights=None used to crash on `weights[pos]`; None now means
    # an unweighted fit.
    w = None if weights is None else weights[pos]
    n = NormalEllipse(coords[pos,:], w)
    pts.append(n.points(points=points,p=p))
    # NOTE(review): params() is called with its own default p, so the radii
    # stored here are not the ones of the contour drawn with `p` — confirm
    # this is intended (the plots only use the centre).
    par.append(np.r_[n.params()])
  return clusters, par, pts

def clustDistPlot(clust, imp, par=None, pts=None):
  """
  Plot one filled contour and one labelled centre marker per cluster.
  params:
      clust: sequence of integer cluster labels
      imp: mapping cluster label -> {key: importance}; the first (up to) five
           entries of each mapping are shown on hover with their share of the
           total importance
           (NOTE(review): presumably keys are words, sorted by importance — confirm)
      par: per cluster, array whose first two values are the centre (x, y);
           markers are skipped when None
      pts: per cluster, 2d array of contour points; contours are skipped when None
  returns:
      the plotly Figure
  """
  fig = go.Figure(**layout)
  names = ["%02d" % label for label in clust]
  palette = px.colors.qualitative.Light24
  if pts is not None:
    for i in range(len(names)):
      fig.add_trace(go.Scatter(
          x=pts[i][:,0],y=pts[i][:,1],
          opacity=.3, showlegend=False,
          name = names[i], text=names[i],
          line_color=palette[i%len(palette)],
          mode="lines", legendgroup="g%d"%i,fill='toself'
          ))
  if par is not None:
    for i,k in enumerate(clust):
      keys = list(imp[k].keys())
      # Convert to an array before normalising: dividing a plain list of
      # Python floats by their sum raises TypeError.
      scores = np.asarray(list(imp[k].values()), dtype=float)
      shares = scores / scores.sum()
      # Never reference more hover rows than the cluster actually has entries.
      rows = min(5, len(keys))
      hover = "<br>".join(["%%{meta[0][%d]}:%%{meta[1][%d]:.2f}" % (j,j) for j in range(rows)]) + "<extra>%{text}</extra>"
      fig.add_trace(go.Scatter(
          x=[par[i][0]], y=[par[i][1]],
          name = names[i], text=[names[i]], meta=(keys,shares),
          hovertemplate = hover,
          marker_color=palette[i%len(palette)],
          mode="markers+text", legendgroup="g%d"%i,
          textposition='middle right',textfont_color=palette[i%len(palette)]
          ))
  return fig

def sentiDistribution(coords,senti,weights=None,points=50,p=.9):
  """
  Fit one NormalEllipse per score column, using the scores as point weights.
  params:
      coords: 2d array (n x 2) of point coordinates
      senti: 2d array (n x k) with one score column per class
      weights: optional array of per-point weights multiplied into the scores
               (default None: the scores alone are used)
      points: maximum number of contour points per ellipse
      p: probability mass enclosed by each contour
  returns:
      par: per class, flat array [mu_x, mu_y, radius_0, radius_1, angle]
      pts: per class, 2d array with the contour points
  """
  nclust = senti.shape[1]
  par = []
  pts = []
  for i in range(nclust):
    # The default weights=None used to crash on `senti[:,i] * weights`.
    w = senti[:,i] if weights is None else senti[:,i] * weights
    n = NormalEllipse(coords, w)
    pts.append(n.points(points=points,p=p))
    # NOTE(review): params() is called with its own default p, so the radii
    # stored here are not the ones of the contour drawn with `p` — confirm
    # this is intended (the plots only use the centre).
    par.append(np.r_[n.params()])
  return par, pts

def sentiDistPlot(names, par=None, pts=None):
  """
  Draw, for every class in `names`, a filled contour and a centre marker.
  params:
      names: sequence of class labels (legend entries and hover text)
      par: per class, array whose first two values are the centre (x, y);
           markers are skipped when None
      pts: per class, 2d array of contour points; contours are skipped when None
  returns:
      the plotly Figure
  """
  fig = go.Figure(**layout)
  palette = px.colors.qualitative.Pastel
  # All contours are added before any marker, so markers are drawn on top.
  if pts is not None:
    for idx, label in enumerate(names):
      outline = pts[idx]
      fig.add_trace(go.Scatter(
          x=outline[:,0], y=outline[:,1],
          name=label, text=label, hoverinfo="text",
          mode="lines", fill='toself',
          line_color=palette[idx%11],
          opacity=.5, showlegend=False,
          legendgroup="g%d"%idx,
          ))
  if par is not None:
    for idx, label in enumerate(names):
      centre = par[idx]
      fig.add_trace(go.Scatter(
          x=[centre[0]], y=[centre[1]],
          name=label, text=label, hoverinfo="text",
          mode="markers",
          marker_color=palette[idx%11],
          legendgroup="g%d"%idx,
          ))
  return fig

In [None]:
# Sentiment: one ellipse per score column, weighted by score * dataset weight,
# drawn at the 99% contour.
par, pts = sentiDistribution(fullEmb,np.stack(full.sentiment),np.array(full.weight),p=.99)
# NOTE(review): the labels are assumed to follow the column order of
# `full.sentiment` — confirm against the model that produced the scores.
fig = sentiDistPlot(["Pos","Neu","Neg",], par, pts)
fig.show()
# Emotion: same plot for the emotion scores
# (labels: rabbia = anger, paura = fear, gioia = joy, tristezza = sadness).
par, pts = sentiDistribution(fullEmb,np.stack(full.emotion),np.array(full.weight),p=.99)
fig = sentiDistPlot(["rabbia","paura","gioia","tristezza",], par, pts)
fig.show()

In [None]:
# Fit one weighted ellipse per predicted topic (function defaults: p=.9, 50 points).
names, par, pts = clustDistribution(fullEmb,full.topic,np.array(full.weight))
# Per-topic importances from the GSDMM model, shown in the marker hover text.
imp = gsdmm.get_avg_importances()
fig = clustDistPlot(names, imp, par, pts)
# Uncomment the next line to also export the figure as a standalone HTML file.
#fig.write_html("../TEST.html")
fig.show()