# SETUP

In [None]:
# mount drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/TESI/

In [None]:
%%capture
# Upgrade plotly and scipy in the runtime; %%capture hides pip's output.
# NOTE(review): versions are not pinned, so results may differ between runs
# of the notebook on different dates -- consider pinning once known-good
# versions are identified.
!pip install -U plotly
!pip install -U scipy

In [None]:
import pandas as pd, numpy as np, numba as nb
import plotly.express as px, plotly.graph_objects as go
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt, warnings, pickle

In [None]:
# Load the five corpora from testi/parquet (relative to the project folder).
# Paths are spelled out instead of cd-ing in and out of the directory, so a
# failed read cannot leave the kernel in the wrong working directory.
esp = pd.read_parquet("testi/parquet/ESPERTI_PS.pqt")
ist = pd.read_parquet("testi/parquet/ISTITUZIONI_PS.pqt")
reg = pd.read_parquet("testi/parquet/REGIONI_PS.pqt")
new = pd.read_parquet("testi/parquet/NEWS_PS.pqt")
# The tweet corpus is stored as two files that are joined column-wise.
# `axis` is passed by keyword: recent pandas no longer accepts it positionally.
twi = pd.concat(
  (
    pd.read_parquet("testi/parquet/TWEET_P.pqt"),
    pd.read_parquet("testi/parquet/TWEET_S.pqt"),
  ),
  axis=1,
)

In [None]:
# Tag every corpus with the label used to colour/group it in the plots.
for _frame, _label in (
  (esp, "Esperti"),
  (ist, "Istituzioni"),
  (reg, "Regioni"),
  (twi, "Tweet"),
  (new, "Notizie"),
):
  _frame["dataset"] = _label
del _frame, _label  # keep loop temporaries out of the notebook namespace

# The plotting helpers group on a "datetime" column; the news frame calls
# it "date", so it is renamed in place.
new.rename(columns={"date":"datetime"},inplace=True)

# SENTIMENT

In [None]:
#@title Parole di rilievo

def wordRelevance(words,scores,**kwargs):
  """Score how strongly each vocabulary word shifts the distribution of `scores`.

  A Beta distribution on [0, 1] is fitted to all scores and, for every word,
  to the scores of the documents that contain that word. The relevance of a
  word is ``(skew_all - skew_with_word) * log(n_docs_with_word)``.

  Args:
    words: iterable of documents (strings) passed to ``CountVectorizer``.
    scores: numpy array with one score per document, in the same order as
      `words`. It is indexed with a boolean mask, so it must support that.
    **kwargs: forwarded unchanged to ``CountVectorizer``.

  Returns:
    dict mapping each vocabulary word to its relevance value.
  """
  def betaSkew(x):
    """Skewness of a Beta fitted to `x` (moments first, then MLE); 0 if both fail."""
    for method in ("mm", "mle"):
      try:
        beta = stats.beta.fit(x,floc=0,fscale=1,method=method)
      except Exception:
        # Only fitting failures are retried/absorbed; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        continue
      return stats.beta.stats(*beta,moments="s")
    print("Impossible fit beta on data:")
    print(x)
    return 0

  cv = CountVectorizer(**kwargs)
  mat = cv.fit_transform(words)
  sel = mat.toarray()
  # get_feature_names() was removed from recent scikit-learn releases in
  # favour of get_feature_names_out(); support both.
  if hasattr(cv, "get_feature_names_out"):
    names = list(cv.get_feature_names_out())
  else:
    names = cv.get_feature_names()
  with warnings.catch_warnings():
    # The fits routinely emit RuntimeWarnings; they are silenced here.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    base = betaSkew(scores)
    par = [0]*sel.shape[1]
    print("Estimating importance of %d words" % len(names))
    for i in range(sel.shape[1]):
      pos = sel[:,i] != 0  # documents in which word i occurs
      word_skew = betaSkew(scores[pos])
      par[i] = (base - word_skew) * np.log(pos.sum())
  return dict(zip(names, par))

def wordRelPlot(val, cloud={"background_color": "white"}, plot={}):
  """Render a word cloud from a word -> weight mapping.

  Args:
    val: mapping of word to weight, passed to
      ``WordCloud.generate_from_frequencies``.
    cloud: keyword arguments for the ``WordCloud`` constructor.
    plot: keyword arguments for ``plt.figure``.

  Returns:
    The matplotlib figure the cloud was drawn on.
  """
  # The dict defaults are only unpacked, never modified.
  generator = WordCloud(**cloud)
  canvas = plt.figure(**plot)
  rendered = generator.generate_from_frequencies(val)
  plt.imshow(rendered, interpolation='bilinear')
  plt.suptitle("Relevant words")
  return canvas

In [None]:
#@title SentiPlot
def plotSentiment(data,freq="W"):
  """Stacked-area chart of the share of each sentiment class over time.

  Each row is assigned the class with the highest value in its `sentiment`
  vector; rows are then counted per time bin and class, and each count is
  divided by the total of its bin.

  Args:
    data: DataFrame with a `datetime` column and a `sentiment` column whose
      entries stack into a 2-D array with one column per class, assumed to
      be ordered (positive, neutral, negative) to match the labels below.
      Side effect: a categorical `senti` column is added to (or overwritten
      in) `data`.
    freq: pandas offset alias used to bin `datetime`.

  Returns:
    A plotly figure.
  """
  labels = ["Positivo","Neutrale","Negativo"]
  codes = np.stack(data.sentiment).argmax(1)
  # from_codes keeps all three categories even when one class never wins the
  # argmax; renaming the observed categories would fail in that case.
  data["senti"] = pd.Categorical.from_codes(codes, categories=labels)
  df = (
    data.groupby([pd.Grouper(key="datetime",freq=freq),"senti"], observed=False)
        .size()
        .reset_index(name="N")
  )
  # Denominator = total rows in the same time bin. The bins are already
  # materialised in `datetime`, so group on that column (works for any
  # `freq`) and sum only N rather than every column.
  df["D"] = df.groupby("datetime")["N"].transform("sum")
  df["freq"] = df.N / df.D
  # Shift the bin labels back by roughly half a bin so that points sit near
  # the middle of the period they summarise.
  if freq =="W":
    df["datetime"] = df["datetime"] - pd.DateOffset(days=3)
  if freq =="2W":
    df["datetime"] = df["datetime"] - pd.DateOffset(days=7)
  fig = px.area(df, x="datetime", y="freq", color="senti", line_group="senti", line_shape="spline",
                color_discrete_sequence=('#00CC96', '#636EFA', '#EF553B'), range_y=(0,1))
  # X ticks are hard-coded to month starts from 2020-02 to 2021-08, with
  # every other month labelled (Italian abbreviations).
  fig.update_layout(
    yaxis_title='Frequenza',
    legend_title="Sentiment",
    xaxis_title='',
    xaxis = dict(
      ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
      tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
      tickmode = "array"
    ),
    width=1100,height=420)
  return fig

def plotSentiments(data,freq="W"):
  """Line chart of the mean (positive - negative) sentiment per dataset over time.

  Args:
    data: DataFrame with `dataset`, `datetime` and `sentiment` columns; the
      first and third entries of each sentiment vector are used as the
      positive and negative scores. Side effect: a numeric `senti` column is
      added to (or overwritten in) `data`.
    freq: pandas offset alias used to bin `datetime`.

  Returns:
    A plotly figure with one line per dataset.
  """
  probs = np.stack(data.sentiment)
  # Assign positionally from the stacked array: going through an
  # intermediate DataFrame would align on index labels and silently yield
  # NaN/misaligned values whenever `data` does not have a 0..n-1 index.
  data["senti"] = probs[:,0] - probs[:,2]
  binned = (
    data.groupby(["dataset",pd.Grouper(key="datetime",freq=freq)])
        .agg(senti=("senti","mean"))
        .reset_index()
  )
  fig = px.line(binned, x="datetime", y="senti", line_shape="spline", color="dataset", line_group="dataset", range_y=(-1,1))
  return fig

def plotFreq(df,freq="W"):
  """Line chart of how each dataset's documents are distributed over time.

  Args:
    df: DataFrame with `dataset` and `datetime` columns (one row per document).
    freq: pandas offset alias used to bin `datetime`.

  Returns:
    A plotly figure; y is the percentage of a dataset's rows that fall in
    each time bin, and the raw count N is shown on hover.
  """
  counts = (
    df.groupby(["dataset",pd.Grouper(key="datetime",freq=freq)])
      .size()
      .reset_index(name="N")
  )
  # Sum only N per dataset: transforming the whole frame would also try to
  # sum the datetime column, which recent pandas rejects.
  counts["D"] = counts.groupby("dataset")["N"].transform("sum")
  counts["freq"] = counts.N / counts.D * 100
  fig = px.line(counts, x="datetime", y="freq",hover_data=["N"], color="dataset", line_group="dataset", line_shape="spline")
  return fig

def plotEmotion(data,freq="W"):
  """Stacked-area chart of the mean emotion scores over time.

  Args:
    data: DataFrame with a `datetime` column and an `emotion` column whose
      entries are sequences; positions 0-3 are labelled Rabbia, Paura, Gioia
      and Tristezza respectively.
    freq: pandas offset alias used to bin `datetime`.

  Returns:
    A plotly figure; the number of rows in each bin is shown on hover.
  """
  df = pd.DataFrame(data.emotion.tolist()).rename(columns={0:"Rabbia",1:"Paura",2:"Gioia",3:"Tristezza"})
  # The frame above has a fresh 0..n-1 index; drop the index of the dates so
  # they are attached by position instead of being aligned on labels.
  df["date"] = data.datetime.reset_index(drop=True)
  grouped = df.groupby([pd.Grouper(key="date",freq=freq)])
  # `axis` must be passed by keyword on recent pandas.
  df = pd.concat((grouped.date.count().rename("count"),grouped.mean()),axis=1).reset_index()
  # Shift the bin labels back by roughly half a bin so that points sit near
  # the middle of the period they summarise.
  if freq =="W":
    df["date"] = df["date"] - pd.DateOffset(days=3)
  if freq =="2W":
    df["date"] = df["date"] - pd.DateOffset(days=7)
  fig = px.area(df, x="date", y=["Rabbia","Paura","Tristezza","Gioia"],hover_data=["count"], line_shape="spline", range_y=(0,1))

  # X ticks are hard-coded to month starts from 2020-02 to 2021-08, with
  # every other month labelled (Italian abbreviations).
  fig.update_layout(
    yaxis_title='Frequenza',
    legend_title="Emotion",
    xaxis_title='',
    xaxis = dict(
      ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
      tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
      tickmode = "array"
    ),
    width=1100,height=420)

  return fig

In [None]:
# Word relevance against column 2 of the experts' sentiment vectors (the
# column labelled "negativo" in plotSentiments). Only words occurring in at
# least 100 documents are kept, and the custom token pattern keeps tokens of
# 3+ word characters that are not preceded by . / ? # nor followed by : or .
words = wordRelevance(esp.preprocess, np.stack(esp.sentiment)[:,2],min_df=100,token_pattern=r"(?u)\b(?<!\.|\/|\?|#)\w{3,}(?!:|\.)\b")
# The returned figure is the cell output.
wordRelPlot(words)

In [None]:
# Same analysis against column 0 of the sentiment vectors (the column
# labelled "positivo" in plotSentiments).
# NOTE(review): unlike the previous cell, no token_pattern is passed here,
# so CountVectorizer's default tokenisation applies -- confirm this
# difference is intentional.
words = wordRelevance(esp.preprocess, np.stack(esp.sentiment)[:,0],min_df=100)
wordRelPlot(words)

In [None]:
def sub(x):
  """Keep only the columns needed to count documents per dataset over time."""
  return x.loc[:,["datetime","dataset"]]

# Stack all corpora into one long frame with a fresh 0..n-1 index.
full = pd.concat([sub(frame) for frame in (twi, reg, new, esp, ist)],ignore_index=True)

plotFreq(full,"2W").show()

In [None]:
# One sentiment-share chart per corpus. The experts and institutions frames
# use two-week bins, the others the default weekly bins.
# Note: plotSentiment writes a `senti` column into the frame it is given.
plotSentiment(twi).show()
plotSentiment(esp,"2W").show()
plotSentiment(ist,"2W").show()
plotSentiment(reg).show()
plotSentiment(new).show()

In [None]:
def sub(x):
  """Keep only the columns needed to compare sentiment across datasets."""
  return x.loc[:,["datetime","dataset","sentiment"]]

# Stack all corpora into one long frame with a fresh 0..n-1 index.
full = pd.concat([sub(frame) for frame in (twi, reg, new, esp, ist)],ignore_index=True)

plotSentiments(full,"M").show()

In [None]:
# One emotion chart per corpus. The institutions and experts frames use
# two-week bins, the others the default weekly bins.
plotEmotion(ist,"2W").show()
plotEmotion(reg).show()
plotEmotion(new).show()
plotEmotion(esp,"2W").show()
plotEmotion(twi).show()

In [None]:
def tfidft(docs,dates,freq="M",**kwargs):
  """Compute a per-period TF-IDF matrix.

  For every time bin, the term frequency is the total count of the term in
  the bin's documents divided by the number of documents in the bin. The
  inverse document frequency is ``log(n_docs / n_docs_containing_term)``
  computed over the whole corpus.

  Args:
    docs: iterable of documents (strings) passed to ``CountVectorizer``.
    dates: one timestamp per document, in the same order as `docs`.
    freq: pandas offset alias used to bin `dates`.
    **kwargs: forwarded unchanged to ``CountVectorizer``.

  Returns:
    Tuple ``(tfidf, names, periods)``: a float array with one row per time
    bin and one column per term, the list of terms, and the list of bin
    labels. A bin with no documents yields NaN (0/0) in its row.
  """
  cv = CountVectorizer(**kwargs)
  tdm = cv.fit_transform(docs).toarray()
  # Group members are reported as index labels and are used below as row
  # positions into `tdm`, so force a 0..n-1 index regardless of the index
  # `dates` arrived with.
  positional_dates = pd.Series(dates).reset_index(drop=True)
  groups = pd.DataFrame({"dt":positional_dates}).groupby([pd.Grouper(key="dt",freq=freq)]).groups
  idfd = (tdm!=0).sum(0)  # documents containing each term
  idfn = tdm.shape[0]     # total number of documents
  tfn = np.zeros((len(groups),tdm.shape[1]))
  tfd = np.zeros((len(groups),1))
  for i,k in enumerate(groups.values()):
    tfn[i,:] = tdm[k,:].sum(0)
    tfd[i,0] = len(k)

  tfidf = (tfn / tfd) * np.log(idfn / idfd)

  # get_feature_names() was removed from recent scikit-learn releases in
  # favour of get_feature_names_out(); support both and keep returning a list.
  if hasattr(cv, "get_feature_names_out"):
    names = list(cv.get_feature_names_out())
  else:
    names = cv.get_feature_names()

  return tfidf, names, list(groups.keys())


In [None]:
# Monthly TF-IDF for the institutions corpus. min_df=1e-2 is forwarded to
# CountVectorizer, which treats a float as a proportion of documents.
# x has one row per month and one column per term.
x,names,dates = tfidft(ist.preprocess,ist.datetime,"M",min_df=1e-2)

In [None]:
# Draw one semi-transparent line per term showing its TF-IDF over time.
# The legend is disabled; the term is shown as hover text instead.
fig = go.Figure()
for i in range(len(names)):
  fig.add_trace(go.Scatter(
      x=list(dates),y=x[:,i],
      opacity=.3, showlegend=False,
      name = names[i], hoverinfo ="text", text=names[i],
      # Cycle through the Alphabet palette; 26 is presumably its length --
      # TODO confirm against the installed plotly version.
      line_color=px.colors.qualitative.Alphabet[i%26],
      mode="lines",line_shape='spline'
      ))
fig.show()