# SETUP

In [None]:
# mount drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/TESI/

In [None]:
%%capture
!pip install -U plotly
!pip install umap-learn

In [None]:
import pandas as pd, numpy as np, numba as nb, pickle
import plotly.express as px, plotly.graph_objects as go
from umap.parametric_umap import ParametricUMAP, load_ParametricUMAP
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the corpora from parquet files. Paths are built from a prefix instead of
# changing directory and back, so a failed read cannot leave the kernel in the
# wrong working directory.
PARQUET_DIR = "testi/parquet"
esp = pd.read_parquet(f"{PARQUET_DIR}/ESPERTI_PS.pqt")
ist = pd.read_parquet(f"{PARQUET_DIR}/ISTITUZIONI_PS.pqt")
reg = pd.read_parquet(f"{PARQUET_DIR}/REGIONI_PS.pqt")
new = pd.read_parquet(f"{PARQUET_DIR}/NEWS_PS.pqt")
# The tweet corpus is stored in two files that are joined column-wise.
# `axis` must be passed by keyword: the positional form is deprecated and
# rejected by pandas >= 2.0.
twi = pd.concat(
    (pd.read_parquet(f"{PARQUET_DIR}/TWEET_P.pqt"),
     pd.read_parquet(f"{PARQUET_DIR}/TWEET_S.pqt")),
    axis=1,
)

In [None]:
# Tag every corpus with the label that identifies its source in later plots.
for _frame, _label in (
    (esp, "Esperti"),
    (ist, "Istituzioni"),
    (reg, "Regioni"),
    (twi, "Tweet"),
    (new, "Notizie"),
):
    _frame["dataset"] = _label

# The news corpus names its timestamp column "date"; align it with the others.
new.rename(columns={"date": "datetime"}, inplace=True)

In [None]:
def _daily_sample_index(frame, frac):
  """Return the index labels of a per-day random sample of `frame`.

  Rows are grouped by calendar day of the "datetime" column and a fraction
  `frac` of each day is drawn with a fixed seed (random_state=1).
  """
  by_day = [pd.Grouper(key="datetime", freq="D")]
  return frame.groupby(by_day).sample(frac=frac, random_state=1).index

newSub = _daily_sample_index(new, .5)
regSub = _daily_sample_index(reg, .5)
twiSub = _daily_sample_index(twi, .1)

# UMAP

In [None]:
# Training matrix for UMAP: the sub-sampled tweet / regional / news vectors
# followed by all expert and institutional vectors, stacked row-wise.
_training_vectors = (
    twi.doc_vector[twiSub],
    reg.doc_vector[regSub],
    new.doc_vector[newSub],
    esp.doc_vector,
    ist.doc_vector,
)
sampledSet = np.concatenate([np.stack(vectors) for vectors in _training_vectors])

In [None]:
# Fit the parametric UMAP on the sampled vectors and save it to disk
# so the next cell can reload it.
umap_settings = dict(
    metric="cosine",
    n_neighbors=5,
    disconnection_distance=1,
    verbose=True,
    random_state=2021,
    loss_report_frequency=1,
    n_training_epochs=20,
    batch_size=1000,
)
model = ParametricUMAP(**umap_settings)
model.fit(sampledSet)
model.save("MODELS/UMAP")

In [None]:
# Reload the saved model and project every (full) corpus with it.
e_umap = load_ParametricUMAP("MODELS/UMAP")
twiEmb, regEmb, newEmb, espEmb, istEmb = (
    e_umap.transform(np.stack(frame.doc_vector))
    for frame in (twi, reg, new, esp, ist)
)

# All embeddings stacked row-wise, in the same corpus order as above.
fitted = np.vstack((twiEmb, regEmb, newEmb, espEmb, istEmb))

In [None]:
# Per-axis minimum and maximum of the embedding (one line each).
print(fitted.min(axis=0), fitted.max(axis=0), sep="\n")

In [None]:
@nb.njit(error_model='numpy',parallel=True)
def pool2d_calc(m2d,shape,N):
  """Mean-pool `m2d` with an N x N window.

  Output cell (i, j) is the mean of m2d[i:i+N, j:j+N]; the output has the
  given `shape`. Rows are processed in parallel.
  """
  pooled = np.empty(shape)
  n_rows = shape[0]
  n_cols = shape[1]
  for row in nb.prange(n_rows):
      for col in range(n_cols):
          window = m2d[row:row+N, col:col+N]
          pooled[row,col] = window.mean()
  return pooled

def pool2d(matrix,size=3):
  """Mean-pool `matrix` with a size x size window, keeping its shape.

  The matrix is padded on every axis with floor((size-1)/2) cells before and
  floor(size/2) cells after (np.pad's default constant padding), so border
  cells are averaged together with the padding values.
  """
  pad_before = int(np.floor((size-1)/2))
  pad_after = int(np.floor(size/2))
  padded = np.pad(matrix,(pad_before,pad_after))
  return pool2d_calc(padded,matrix.shape,size)

def plotContour(coords,title="TITLE",zscale=.7,zmax=.1,nlabs=5,legend=False):
  """Smoothed 2-D density heatmap of embedded points.

  params:
      coords: 2d array, column 0 is x and column 1 is y
      title: figure title
      zscale: exponent applied to the pooled density before colouring
      zmax: upper bound of the colour scale, in untransformed density units
      nlabs: number of colour-bar ticks between 0 and zmax
      legend: whether the colour bar is shown
  returns a plotly Figure.

  The histogram uses a fixed 51 x 41 grid over x in [-16.5, 9] and
  y in [-14.5, 6], i.e. 0.5-wide cells whose first centres are x0/y0 below.
  """
  counts = np.histogram2d(coords[:,0], coords[:,1], bins=(51,41),
                          range=((-16.5,9),(-14.5,6)), density=True)
  hist = counts[0].T  # transpose so rows follow y, as Heatmap expects for z

  # Ticks sit at transformed positions but are labelled with raw values in percent.
  ticks = np.linspace(0,zmax,nlabs)
  colorbar = dict(tickmode="array", tickvals=ticks**zscale,
                  ticktext=["%.1f%%" % tick for tick in ticks*100])

  heatmap = go.Heatmap(z=pool2d(hist,2)**zscale, zsmooth='best',
                       zmax=zmax**zscale, zmin=0,
                       x0=-16.25, y0=-14.25, dx=.5, dy=.5, text=hist*100,
                       colorbar=colorbar,
                       hovertemplate = "x: %{x}<br>y: %{y}<br>Density: %{text:.2f}%<extra></extra>",
                       showscale=legend)
  return go.Figure(heatmap, layout_title=title, layout_title_x=.5, layout_title_y=.87,
                   layout_width=680, layout_height=600,
                   layout_yaxis = dict(scaleanchor = "x", scaleratio = 1))


In [None]:
# Tweet density map, this time with the colour bar displayed.
plotContour(twiEmb, title="Tweet", legend=True).show()

In [None]:
# One density map per corpus, all without colour bar.
for _embedding, _title in (
    (twiEmb, "Tweet"),
    (newEmb, "Notizie"),
    (istEmb, "Istituzioni"),
    (regEmb, "Regioni"),
    (espEmb, "Esperti"),
):
    plotContour(_embedding, _title).show()

In [None]:
def sub(x):
  """Keep only the "preprocess", "datetime" and "dataset" columns of a corpus."""
  return x.loc[:,["preprocess","datetime","dataset"]]

# All corpora stacked with a fresh 0..n-1 index.
full = pd.concat([sub(frame) for frame in (twi,reg,new,esp,ist)],ignore_index=True)


# WordMap

In [None]:
#@title NormalEllipse
 
class NormalEllipse:
    """Probability-contour ellipse of a bivariate normal distribution.

    The ellipse is centred on the mean; its semi-axes follow the eigenvectors
    of the covariance matrix and are scaled so that the ellipse encloses a
    share `p` of the probability mass (squared scale factor: -2*log(1-p)).
    """

    def __init__(self, data=None, weights=None, mu=None, sigma=None):
        """
        initialize with binormal data, requires:
        data: 2d array with raw data, shape (n, 2) or (2, n)
        weights: optional per-observation weights (only used with data)
        or
        mu: means vector of size 2
        sigma: 2x2 variance-covariance matrix

        raises ValueError when neither data nor both mu and sigma are given
        """
        if data is None:
            if mu is None or sigma is None:
                raise ValueError(
                    "NormalEllipse requires either `data` or both `mu` and `sigma`")
            self.__mu = mu
            self.__cov = sigma
        else:
            data = np.asarray(data)
            # Work with variables in rows; an (n, 2) input is transposed.
            if data.shape[1] == 2:
                data = data.T
            self.__mu = np.average(data, axis=1, weights=weights)
            self.__cov = np.cov(data, aweights=weights)

        # Sentinel: no radius has been computed yet for any p.
        self.__p = -1
        _, V = np.linalg.eigh(self.__cov)
        # Orientation of the first eigenvector (smallest eigenvalue), which is
        # paired with radius[0] because eigvalsh returns ascending eigenvalues.
        self.__angle = np.arctan2(*V[::-1, 0])
        # The sign flip on cos mirrors an axis; the ellipse is symmetric about
        # its own axes, so points() and inside() are unaffected.
        self.__angles = {"cos": -np.cos(self.__angle), "sin": np.sin(self.__angle)}

    def params(self, p=1-np.exp(-1/2)):
        """
        return mu(x,y), radius(x,y), angle(rad)
        params:
            p: probability mass enclosed by the ellipse, 0 <= p < 1
               (default 1-exp(-1/2), which gives a scale factor of 1)
        """
        self.__params(p)
        return self.__mu, self.__radius, self.__angle

    def __params(self, p):
        """Compute (and cache by p) the semi-axis lengths for probability p."""
        if not 0 <= p < 1:
            # Outside this range the log below yields nan or inf radii.
            raise ValueError("p must satisfy 0 <= p < 1, got %r" % (p,))
        if self.__p != p:
            self.__p = p
            self.__radius = np.sqrt(np.linalg.eigvalsh(self.__cov * -2 * np.log(1 - p)))

    def points(self, p=1-np.exp(-1/2), points=100):
        """
        return 2d array of points to draw contour ellipsis (closed curve:
        the last point repeats the first)
        params:
            p: probability mass enclosed by the ellipse, 0 <= p < 1
               (default 1-exp(-1/2))
            points: maximum number of points to return (exact if points % 8 == 1)
        """
        self.__params(p)

        # Parameter grid over one quadrant.
        pt = np.ceil(points / 8)
        pt = np.linspace(0, np.sqrt(0.5), int(pt))**2
        pt = np.r_[pt[:-1], 1 - pt[::-1]]

        # First quadrant in the ellipse's own axes, then mirrored to the other
        # three; shared end points are dropped so they are not duplicated.
        x = np.sqrt(pt) * self.__radius[0]
        y = np.sqrt((1 - pt) * self.__radius[1]**2)
        x = np.r_[ x[:-1], x[::-1][:-1], -x[:-1], -x[::-1]]
        y = np.r_[ y[:-1], -y[::-1][:-1], -y[:-1], y[::-1]]

        # Rotate into data coordinates and translate to the mean.
        xpt = self.__mu[0] + (y * self.__angles["sin"] + x * self.__angles["cos"])
        ypt = self.__mu[1] + (y * self.__angles["cos"] - x * self.__angles["sin"])

        return np.c_[xpt,ypt]

    def inside(self, points, p=1-np.exp(-1/2)):
        """
        return binary array, True for points inside ellipsis (boundary included)
        params:
            points: 2d array of coordinates, one (x, y) row per point
            p: probability mass enclosed by the ellipse, 0 <= p < 1
               (default 1-exp(-1/2))
        """
        self.__params(p)

        # Centre on the mean, then rotate into the ellipse's own axes.
        xc= points[:,0] - self.__mu[0]
        yc= points[:,1] - self.__mu[1]

        xct = xc * self.__angles["cos"] - yc * self.__angles["sin"]
        yct = xc * self.__angles["sin"] + yc * self.__angles["cos"]

        return ((xct/self.__radius[0])**2 + (yct/self.__radius[1])**2) <= 1


In [None]:
# Load the pickled GSDMM model from the project folder.
# SECURITY: pickle.load can execute arbitrary code; only load files you created
# or otherwise trust.
with open("MODELS/GSDMM.pkl","rb") as f:
  gsdmm = pickle.load(f)

In [None]:
def sub(x):
  """Keep only the "pos" column of a corpus (as a one-column DataFrame)."""
  return x.loc[:,["pos"]]

# "pos" column of every corpus stacked with a fresh 0..n-1 index.
merged = pd.concat([sub(frame) for frame in (twi,reg,new,esp,ist)],ignore_index=True)

In [None]:
def stackDF(pos):
  """Turn per-document tag dictionaries into a table of space-joined strings.

  params:
      pos: iterable with one mapping per document
           (assumes tag -> list of token strings; TODO confirm against the data)
  returns a DataFrame with one row per document and one column per remaining
  tag; each cell is the document's tokens for that tag joined by a space
  (NaN where the document has no token with that tag).

  The tags listed below are discarded. Tags that never occur in `pos` are
  skipped instead of raising KeyError.
  """
  dropped = ["PUNCT", "SPACE", "NUM", "SYM", "INTJ", "PRON", "X"]
  df = pd.DataFrame(list(pos)).drop(columns=dropped, errors="ignore")
  for col in df:
    df[col] = df[col].str.join(" ")
  return df

In [None]:
# Per-tag text table built from the `pos` column of the regional corpus.
df = stackDF(reg.pos)

In [None]:
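# NOTE(review): bare figures, no header in the original. Presumably
# "<number of words kept>  <min_df threshold>" for the CountVectorizer — confirm.
#2554 .001
# 618 .005
# 253 .01
#  16 .05
#   6 .1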

In [None]:
#@title Word Distribution in UMAP plot
def wordDistribution(words,coords,weights=None,points=50,p=.9,**kwargs):
  """Fit one NormalEllipse per vocabulary word over the documents containing it.

  params:
      words: iterable of document strings, passed to CountVectorizer
      coords: 2d array with one (x, y) row per document, same order as words
      weights: optional per-document weights (None = unweighted)
      points: number of outline points requested for each ellipse
      p: probability mass enclosed by each outline
      **kwargs: forwarded to CountVectorizer (e.g. min_df, token_pattern)
  returns (names, par, pts):
      names: list of vocabulary words
      par: per word, array [mu_x, mu_y, radius_0, radius_1, angle]
      pts: per word, 2d array of outline points

  NOTE(review): params() is called with its own default p, so the radii in
  `par` are not the ones of the outline drawn with `p` — confirm intended.
  NOTE(review): a word found in a single document has no defined covariance.
  """
  cv = CountVectorizer(**kwargs)
  # Column-oriented sparse matrix: each word's column is read on its own, so
  # the full document-term matrix is never densified.
  mat = cv.fit_transform(words).tocsc()
  # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
  # on versions that predate get_feature_names_out().
  if hasattr(cv,"get_feature_names_out"):
    names = list(cv.get_feature_names_out())
  else:
    names = cv.get_feature_names()
  if weights is not None:
    weights = np.asarray(weights)

  par, pts = [], []
  for i in range(mat.shape[1]):
    # Boolean mask of the documents that contain word i.
    pos = mat[:,i].toarray().ravel() != 0
    n = NormalEllipse(coords[pos,:], None if weights is None else weights[pos])
    pts.append(n.points(points=points,p=p))
    par.append(np.r_[n.params()])
  return names, par, pts

def wordDistrPlot(names, par=None, pts=None):
  """Plot word ellipses and/or their centres on a fixed-range scatter figure.

  params:
      names: sequence of words, one per ellipse
      par: optional per-word parameter arrays; items 0 and 1 are used as the
           centre's x and y (drawn as a marker)
      pts: optional per-word 2d arrays of outline points (drawn as a line)
  returns a plotly Figure. Outline and marker of the same word share a
  legend group and a colour (the palette repeats every 26 words).
  """
  palette = px.colors.qualitative.Alphabet
  fig = go.Figure(layout_xaxis_range=(-13.5,7), layout_yaxis_range=(-16.5,6.5))
  fig.update_yaxes(scaleanchor = "x", scaleratio = 1)

  # All outlines are added first, then all centre markers.
  if pts is not None:
    for i, word in enumerate(names):
      outline = pts[i]
      fig.add_trace(go.Scatter(
          x=outline[:,0], y=outline[:,1],
          opacity=.3, showlegend=False,
          name=word, hoverinfo="text", text=word,
          line_color=palette[i%26],
          mode="lines", legendgroup="g%d"%i
          ))
  if par is not None:
    for i, word in enumerate(names):
      centre = par[i]
      fig.add_trace(go.Scatter(
          x=[centre[0]], y=[centre[1]],
          name=word, hoverinfo="text", text=[word],
          marker_color=palette[i%26],
          mode="markers", legendgroup="g%d"%i
          ))

  return fig

In [None]:
def scatterBy(coords,data,grouper):
  """Scatter plot of embedded points coloured by a metadata column.

  params:
      coords: 2d array with one (x, y) row per document
      data: Series or DataFrame with one row per document, in the same order
            as coords
      grouper: name of the column of `data` used for colouring
  returns a plotly Figure with fixed axis ranges and a 1:1 aspect ratio.
  """
  xy = pd.DataFrame(coords).rename(columns={0:"x",1:"y"})
  # coords carries no index, so rows are matched by position: drop the index
  # of `data`, otherwise concat would align on labels and pair rows wrongly
  # whenever that index is not 0..n-1.
  meta = data.reset_index(drop=True)
  # `axis` by keyword: the positional form is rejected by pandas >= 2.0.
  data = pd.concat((xy,meta),axis=1)
  fig = px.scatter(data, x="x", y="y", color=grouper, range_x=(-16.5,9),range_y=(-14.5,6))
  fig.update_yaxes(dict(scaleanchor = "x", scaleratio = 1))
  return fig

In [None]:
# Institutional embeddings coloured by the `user` column; the figure is the cell output.
scatterBy(istEmb,ist.user,"user")

In [None]:
# Regional embeddings coloured by the `user` column; the figure is the cell output.
scatterBy(regEmb,reg.user,"user")

In [None]:
def sub(x,r=None,c=["preprocess","dataset","doc_vector"]):
  """Select columns `c` of `x`, restricted to row labels `r` when given."""
  if r is None:
    return x.loc[:,c]
  return x.loc[r,c]

# Sub-sampled tweets / regions / news plus all experts and institutions.
full = pd.concat(
    (sub(twi,twiSub),sub(reg,regSub),sub(new,newSub),sub(esp),sub(ist)),
    ignore_index=True,
)
# Weight of a document = 1000 / number of documents of its dataset
# (counted on the "preprocess" column).
full["weight"] = 1e3/full.groupby("dataset").transform("count")["preprocess"]
fullEmb = e_umap.transform(np.stack(full.doc_vector))

In [None]:
# Ellipses for every word that passes the vectorizer's min_df and token-pattern filters.
names, par, pts = wordDistribution(
    words=full.preprocess,
    coords=fullEmb,
    weights=np.array(full.weight),
    min_df=1e-3,
    token_pattern=r"(?u)\b(?<!\.|\/|\?|#)\w{3,}(?!:|\.)\b",
)

In [None]:
# Export the selected vocabulary to CSV, one row per word.
# NOTE(review): the path is relative to the current working directory, which
# earlier cells change with %cd — confirm the file lands where intended.
pd.DataFrame(names).to_csv("../../parole.csv")

In [None]:
# plot only outliers
# Only `par` is passed, so just the centre marker of each word is drawn (no outlines).
fig = wordDistrPlot(names, par)
#fig.write_html("../TEST.html")
fig.show()