SETUP

In [None]:
# mount drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/TESI/

In [None]:
%%capture
# Install the notebook's runtime dependencies; %%capture hides the installer output.
!pip install datatable
!pip install -U spacy
# Italian spaCy model, loaded later with spacy.load("it_core_news_lg").
!python -m spacy download it_core_news_lg
!pip install transformers

In [None]:
import pandas as pd, numpy as np, re
import datatable as dt
import plotly.express as px
import spacy,transformers
from tqdm.notebook import tqdm
# Registers the pandas `progress_apply` methods that process() relies on.
tqdm.pandas()

In [None]:
# spaCy setup: Italian pipeline loaded without the NER component.
nlp = spacy.load("it_core_news_lg", exclude=["ner"])

# Tokenizer and stop-word adjustments.
# Extra stop words (each argument is iterated character by character by set.update).
nlp.Defaults.stop_words.update("è","l","i","o","e")

# Also split a leading '@' or '-' off the following token.
custom_prefixes = ['@','-'] + nlp.Defaults.prefixes
nlp.Defaults.prefixes = custom_prefixes
nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(custom_prefixes).search

# Extra infix rules: underscore runs, digit runs, and punctuation between word boundaries.
custom_infixes = [r"_+|\d+",r"\b[\(\)\"”?!.:;,*+]\b"] + nlp.Defaults.infixes
nlp.Defaults.infixes = custom_infixes
nlp.tokenizer.infix_finditer = spacy.util.compile_infix_regex(custom_infixes).finditer

# Function-word POS tags that are always treated as stop words.
_STOP_POS = ("DET","ADP","AUX","CCONJ","SCONJ")

def _is_stop_extended(token):
  """
  Broader stop-word test exposed as `token._.is_stop`.
  True when spaCy flags the token, when its lower-cased lemma is in the
  language stop list, or when its POS tag is one of _STOP_POS.
  """
  if token.is_stop:
    return True
  if token.lemma_.lower() in nlp.Defaults.stop_words:
    return True
  return token.pos_ in _STOP_POS

# force=True lets the cell be re-run without an "extension already exists" error.
spacy.tokens.Token.set_extension('is_stop', getter=_is_stop_extended, force=True)

In [None]:
%%capture
# Transformers setup (for sentiment)
# NEED GPU FOR SENTIMENT (text processed: GPU 50/s vs CPU 5/s)
def hfClassifier(pretrained, device=0):
  """
  Build a Hugging Face classification pipeline that returns the score of every label.
  params:
    pretrained  model name/path handed to both `from_pretrained` calls
    device      device index forwarded to transformers.pipeline; defaults to 0
                (the previously hard-coded value, i.e. a GPU is expected).
                NOTE(review): -1 should select the CPU - confirm against the
                installed transformers version.
  """
  model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained)
  # model_max_length=512 gives truncation=True (used by the callers) a limit to cut at.
  tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, model_max_length=512)
  return transformers.pipeline('sentiment-analysis', return_all_scores=True, binary_output=True,
                               device=device, model=model, tokenizer=tokenizer)

# Pipeline called by getSentiment(), which reads its label scores 0-2.
sentiment = hfClassifier("neuraly/bert-base-italian-cased-sentiment")
# Pipeline called by getEmotion(), which reads its label scores 0-3.
emotion   = hfClassifier("MilaNLProc/feel-it-italian-emotion")

In [None]:
# Helper functions
def getSentiment(text):
  """
  Score `text` with the module-level `sentiment` pipeline.
  return 3 values: (pos, neu, neg), i.e. the scores of labels 2, 1 and 0
  (assumes the model orders its labels neg, neu, pos - TODO confirm).
  """
  # Possible refinement: score every sentence/span of the doc separately, so
  # that texts mixing several sentiments are captured better.
  label_scores = sentiment(text, truncation=True)[0]
  return np.array([label_scores[idx]["score"] for idx in (2, 1, 0)])

def getEmotion(text):
  """
  Score `text` with the module-level `emotion` pipeline.
  return 4 values: (anger, fear, joy, sadness), i.e. the scores of labels 0-3
  (assumes the model orders its labels this way - TODO confirm).
  """
  # Possible refinement: score every sentence/span of the doc separately, so
  # that texts mixing several emotions are captured better.
  label_scores = emotion(text, truncation=True)[0]
  return np.array([label_scores[idx]["score"] for idx in range(4)])

def doc_vector(doc,stop=True,punct=True,num=True,lemma=True,lower=True,length=2,unkn=False):
  """
  return document vector (300) as the mean of the kept word vectors
  (a zero vector when no word is kept)
  params:
    stop   remove stopwords (custom `token._.is_stop`)
    punct  remove punctuation
    num    remove number-like tokens
    lemma  look up the lemma instead of the surface text
    lower  lower-case the word before the lookup
    length remove tokens whose text has `length` characters or fewer
    unkn   include words without a vector
  """
  # Collect the vectors in a list and average once: appending to an ndarray
  # inside the loop re-copies the whole array at every step.
  vectors = []
  for token in doc:
    if (stop and token._.is_stop) or \
      (punct and token.is_punct) or \
      (num and token.like_num) or \
      len(token.text) <= length:
        continue
    word = token.lemma_ if lemma else token.text
    word = word.lower() if lower else word
    lexeme = token.vocab[word]
    if lexeme.has_vector or unkn:
      vectors.append(lexeme.vector)

  if not vectors:
    return np.zeros(300)
  # float64 accumulation, as with the previous float64 buffer.
  return np.array(vectors, dtype=np.float64).mean(0)

def preprocess(doc,stop=True,punct=True,num=True,lemma=True,lower=True,length=2):
  """
  return the document as a cleaned, whitespace-stripped string
  params:
    stop   remove stopwords (custom `token._.is_stop`)
    punct  remove punctuation
    num    remove number-like tokens
    lemma  emit the lemma followed by one space instead of the original text
    lower  lower-case the emitted words
    length remove tokens whose text has `length` characters or fewer
  """
  out = " "  # leading space keeps `out` non-empty; stripped on return
  for tok in doc:
    dropped = (stop and tok._.is_stop) or \
              (punct and tok.is_punct) or \
              (num and tok.like_num) or \
              len(tok.text) <= length
    if dropped:
      # Keep the dropped token's trailing whitespace only when the text so far
      # does not already end with a space, so kept words stay separated.
      if not out.endswith(" "):
        out += tok.whitespace_
      continue
    piece = f"{tok.lemma_} " if lemma else tok.text_with_ws
    if lower:
      piece = piece.lower()
    out += piece

  return out.strip()

def token_by_pos(doc,stop=False,lemma=True,lower=True):
  """
  return a dict mapping each POS tag to the list of its tokens, in document order
  params:
    stop   include stopwords (custom `token._.is_stop`)
    lemma  use the lemma instead of the original text
    lower  lower-case the words
  """
  by_pos = {}
  for tok in doc:
    if not stop and tok._.is_stop:
      continue
    form = tok.lemma_ if lemma else tok.text
    if lower:
      form = form.lower()
    by_pos.setdefault(tok.pos_, []).append(form)

  return by_pos

In [None]:
def process(data,proc=True,sent=True):
  """
  Add the derived text columns to `data` (modified in place) and return it.
  params:
    data  DataFrame with a `plain_text` column
    proc  run spaCy and add `nlp` (parsed docs), `preprocess`, `doc_vector`, `pos`
    sent  run the transformer models and add `sentiment`, `emotion`
  """
  if proc:
    print("spacy:nlp")
    # n_process=-1: parse across multiple processes.
    docs = nlp.pipe(data.plain_text, n_process=-1)
    data["nlp"] = list(tqdm(docs, total=len(data)))
    spacy_steps = (
      ("spacy:strproc", "preprocess", preprocess),
      ("spacy:doc2vec", "doc_vector", doc_vector),
      ("spacy:postag", "pos", token_by_pos),
    )
    for tag, column, func in spacy_steps:
      print(tag)
      data[column] = data.nlp.progress_apply(func)
  if sent:
    transformer_steps = (
      ("trans:senti", "sentiment", getSentiment),
      ("trans:emo", "emotion", getEmotion),
    )
    for tag, column, func in transformer_steps:
      print(tag)
      data[column] = data.plain_text.progress_apply(func)

  return data


In [None]:
# List the input files available in testi/ (the datasets below are read from there).
!ls testi

# DATASET ESPERTI


In [None]:
esperti = dt.fread("testi/ESPERTI.csv.zip").to_pandas()

def quoted(title):
  """Join every double-quoted passage of `title` (quotes stripped) with '. '."""
  return ". ".join([x.group(0).strip('"') for x in re.finditer('".+?"', title)])

# Drop uninteresting rows: keep articles that have a body and whose title
# contains the last word of the expert's name.
esperti = esperti.loc[
            (esperti.body.notna())&
            ([row.expert.split(" ")[-1] in row.title for row in esperti.itertuples()])
          ,:].reset_index(drop=True)
# Tidy up the variables.
# Single .loc assignment: the chained form `esperti.expert[mask] = ...` may write
# to a temporary copy and leave the frame unchanged.
esperti.loc[esperti.expert=="Lopalco","expert"] = "Pierluigi Lopalco"
esperti["expert"] = esperti.expert.astype("category")
# Combine the calendar date and the time string into one timestamp.
esperti["datetime"] = pd.to_datetime(esperti.date.dt.strftime("%Y-%m-%d") + "T" + esperti.time)
esperti["q_title"] = esperti.title.apply(quoted)
# Text to analyse: quotes found in the title, then the `quoted` column
# (presumably the quotes extracted from the body - verify against the CSV schema).
esperti["plain_text"] = esperti.q_title.fillna("") + ". " + esperti.quoted.fillna("")

In [None]:
# Short display labels for the experts.
# rename_categories replaces assignment to `.cat.categories`, which recent
# pandas releases no longer allow.
# NOTE(review): the mapping is positional - the list must follow the current
# category order of `expert` (apparently the full names sorted alphabetically).
esperti["expert"] = esperti.expert.cat.rename_categories(
      ['Zangrillo A.', 'Crisanti A.', 'Viola A.',
       'Pregliasco F.', 'Locatelli F.', 'Gorini G.',
       'Palù G.', 'Rezza G.', 'Ippolito G.', 'Capua I.',
       'Gismondo M.R.', 'Galli M.', 'Bassetti M.',
       'Cartabellotta N.', 'Lopalco P.', 'Sileri P.',
       'Burioni R.', 'Brusaferro S.', 'Ricciardi W.'])

In [None]:
# Round-trip through the string dtype so the categorical is rebuilt from the
# current labels (presumably to re-derive the category order from the short names).
esperti["expert"] = esperti.expert.astype("string").astype("category")

In [None]:
# Number of articles per expert and calendar month (bins labelled by month start).
exp_month = esperti.groupby(["expert",pd.Grouper(key="datetime",freq="MS")])["expert"].agg(["count"]).reset_index()
# Move every point from the first day of the month to mid-month.
exp_month.datetime += pd.DateOffset(days=15)

# split("=")[-1] keeps the bare label whether or not the trace name carries a
# "column=" prefix; [1] raised IndexError for names without "=".
fig = px.line(exp_month, x="datetime", y="count", color="expert", line_group="expert", line_shape="spline",
              color_discrete_sequence=px.colors.qualitative.Alphabet). \
        for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))
fig.update_layout(yaxis_title='Numero articoli',
                  xaxis_title='',
                  xaxis = dict(
                    # One tick per month start; only every other month gets a label.
                    ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
                    tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
                    tickmode = "array"
                  ), width=1100,height=520)
fig.show()

In [None]:
# Fixed-seed sample of 100 articles exported for manual checking.
esperti[["expert","title","body"]].sample(100,random_state=2021).to_csv("../ESPERTI_100.csv")
# Recorded results (presumably from reviewing the sample; CI = confidence interval):
# body: wrong or multiple attribution 13% - CI 7.9|19.9
# title: wrong attribution 10% - CI 5.5|16.4
# completely wrong 7% - CI 3.3 - 12.7
esperti.to_parquet("../ESPERTI_O.pqt")
esperti.shape

In [None]:
# Run the full pipeline, then keep only the columns of interest
# (everything else can be recovered from the original file).
esperti = process(esperti)[["expert","datetime","preprocess","doc_vector","pos","sentiment","emotion"]]
esperti.to_parquet("../ESPERTI_PS.pqt")

# DATASET NEWS


In [None]:
news = dt.fread("testi/NEWS.csv.zip").to_pandas()
# Regular expression marking an article as COVID-related; the line breaks are
# removed so the alternatives form one pattern.
covid_args = """
(prima|seconda|terza|nuova) ondata|zona (rossa|arancione|gialla|bianca)|
covid|coronavirus|virus|cts|pandemia|tampon|lockdown|coprifuoco|quarantena|
mascherin|variant|vaccin|contagi|green ?pass|fase (1|2|3|uno|due|tre)
""".replace("\n","")
news["plain_text"] = news.title + ". " + news.body
# Drop rows with an empty body; .copy() so the column added below is written to
# an independent frame rather than to a slice of the original.
news = news.loc[news.body!="",:].copy()
# na=False: rows with missing text count as "not about COVID" instead of
# producing NaN, which cannot be used as a boolean mask afterwards.
news["aboutCovid"] = news.plain_text.str.lower().str.contains(covid_args, na=False)

In [None]:
# Weekly article counts split by the aboutCovid flag, one column per flag value.
# pivot() is called with keyword arguments: recent pandas releases reject the
# positional form.
news_week = news.groupby([pd.Grouper(key="date",freq="W"),"aboutCovid"])["aboutCovid"].agg(["count"]). \
                reset_index().pivot(index="date",columns="aboutCovid",values="count").reset_index().rename(columns={True:'yes',False:'no'})
# Percentage of COVID articles: 1/(1 + no/yes) == yes/(yes + no).
news_week["ratio"] = 1/(1 + news_week.no/news_week.yes) *100

# Shift the weekly bin label back by 3 days (presumably to centre the point on the week).
news_week.date -= pd.DateOffset(days=3)

fig = px.line(news_week, x="date", y="ratio", line_shape="spline")
fig.update_layout(yaxis_title='Articoli di interesse(%)',
                  xaxis_title='',
                  xaxis = dict(
                    # One tick per month start; only every other month gets a label.
                    ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
                    tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
                    tickmode = "array"
                  ), width=1100,height=520)
fig.show()

In [None]:
# Keep only the articles flagged as COVID-related.
news = news[news.aboutCovid].reset_index(drop=True)
# Fixed-seed sample of 100 articles exported for manual checking.
news.sample(100,random_state=2021)[["title","body"]].to_csv("../NEWS_100.csv")
# Recorded results (presumably from reviewing the sample; CI = confidence interval):
# not strictly about COVID 22% - CI 15.4|29.9
# not about COVID at all 4% - CI 1.4|8.9
news.to_parquet("../NEWS_O.pqt")
news.shape

In [None]:
# Run the full pipeline, then keep only the columns of interest
# (everything else can be recovered from the original file).
news = process(news)[["date","preprocess","doc_vector","pos","sentiment","emotion"]]
news.to_parquet("../NEWS_PS.pqt")

# DATASET ISTITUZIONI

In [None]:
istituzioni = dt.fread("testi/ISTITUZIONI.csv.zip").to_pandas()
# Regular expression marking a tweet as COVID-related; the line breaks are
# removed so the alternatives form one pattern.
covid_args = """
(prima|seconda|terza|nuova) ?ondata|fase ?(1|2|3|uno|due|tre)|distanziament|
virus|coronavirus|covid|pandemi|tampon|lockdown|coprifuoco|quarantena|
pnrr|cts|dpcm|sostegn|epidemi|certifica|immun|mascherin|variant|vaccin|
contagi|sintom|green ?pass|zona ?(rossa|arancione|gialla|bianca)
""".replace("\n","")
# Flag the COVID-related tweets (no rows are removed here).
# na=False: a missing text counts as "not about COVID" instead of producing NaN,
# which cannot be used as a boolean mask afterwards.
istituzioni["aboutCovid"] = istituzioni.text.str.lower().str.contains(covid_args, na=False)
# process() expects the text in a column named `plain_text`.
istituzioni = istituzioni.rename(columns={"text":"plain_text"})

In [None]:
# Absolute number of COVID tweets per user and month (bins labelled by month start).
ist_month = istituzioni.loc[istituzioni.aboutCovid,:].groupby(["user",pd.Grouper(key="datetime",freq="MS")])["user"].agg(["count"]).reset_index()
# Move every point from the first day of the month to mid-month.
ist_month.datetime += pd.DateOffset(days=15)
# split("=")[-1] keeps the bare label whether or not the trace name carries a
# "column=" prefix; [1] raised IndexError for names without "=".
fig = px.line(ist_month, x="datetime", y="count", color="user", line_group="user", line_shape="spline"). \
        for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))
fig.update_layout(yaxis_title='Numero di tweet di interesse',
                  xaxis_title='',
                  xaxis = dict(
                    # One tick per month start; only every other month gets a label.
                    ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
                    tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
                    tickmode = "array"
                  ), width=1100,height=520)

fig.show()

# Share of COVID tweets per user and month.
# pivot() is called with keyword arguments: recent pandas releases reject the
# positional form.
ist_month = istituzioni.groupby(["user",pd.Grouper(key="datetime",freq="M"),"aboutCovid"])["aboutCovid"].agg(["count"]). \
                reset_index().pivot(index=["user","datetime"],columns="aboutCovid",values="count").reset_index(). \
                rename(columns={True:'yes',False:'no'}).fillna(0)
# 1/(1 + no/yes) == yes/(yes + no); months without COVID tweets give 0.
ist_month["ratio"] = 1/(1 + ist_month.no/ist_month.yes)

fig = px.line(ist_month, x="datetime", y="ratio", color="user", line_group="user", line_shape="spline"). \
        for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))
fig.show()

In [None]:
# Keep only the tweets flagged as COVID-related.
istituzioni = istituzioni[istituzioni.aboutCovid].reset_index(drop=True)
# Fixed-seed sample of 100 tweets exported for manual checking.
istituzioni.sample(100,random_state=2021)[["user","plain_text"]].to_csv("../ISTITUZIONI_100.csv")
# Recorded result (presumably from reviewing the sample; CI = confidence interval):
# not about COVID 6% - CI 2.6|11.5
istituzioni.to_parquet("../ISTITUZIONI_O.pqt")
istituzioni.shape

In [None]:
# Run the full pipeline, then keep only the columns of interest
# (everything else can be recovered from the original file).
istituzioni = process(istituzioni)[["user","datetime","preprocess","doc_vector","pos","sentiment","emotion"]]
istituzioni.to_parquet("../ISTITUZIONI_PS.pqt")

# DATASET REGIONI

In [None]:
regioni = dt.fread("testi/REGIONI.csv.zip").to_pandas()
# Regular expression marking a tweet as COVID-related; the line breaks are
# removed so the alternatives form one pattern.
covid_args = """
(prima|seconda|terza|nuova) ?ondata|fase ?(1|2|3|uno|due|tre)|distanziament|
virus|coronavirus|covid|pandemi|tampon|lockdown|coprifuoco|quarantena|
pnrr|cts|dpcm|sostegn|epidemi|certifica|immun|mascherin|variant|vaccin|
contagi|sintom|green ?pass|zona ?(rossa|arancione|gialla|bianca)
""".replace("\n","")
# Flag the COVID-related tweets (no rows are removed here).
# na=False: a missing text counts as "not about COVID" instead of producing NaN,
# which cannot be used as a boolean mask afterwards.
regioni["aboutCovid"] = regioni.text.str.lower().str.contains(covid_args, na=False)
# process() expects the text in a column named `plain_text`.
regioni = regioni.rename(columns={"text":"plain_text"})

In [None]:
# Absolute number of COVID tweets per user and month (bins labelled by month start).
reg_month = regioni.loc[regioni.aboutCovid,:].groupby(["user",pd.Grouper(key="datetime",freq="MS")])["user"].agg(["count"]).reset_index()
# Move every point from the first day of the month to mid-month.
reg_month.datetime += pd.DateOffset(days=15)

# split("=")[-1] keeps the bare label whether or not the trace name carries a
# "column=" prefix; [1] raised IndexError for names without "=".
fig = px.line(reg_month, x="datetime", y="count", color="user", line_group="user", line_shape="spline"). \
        for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))
fig.update_layout(yaxis_title='Numero di tweet di interesse',
                  xaxis_title='',
                  xaxis = dict(
                    # One tick per month start; only every other month gets a label.
                    ticktext = ["","Mar20","","Mag20","","Lug20","","Set20","","Nov20","","Gen21","","Mar21","","Mag21","","Lug21",""],
                    tickvals = pd.date_range("2020-02-01","2021-08-01",freq='MS'),
                    tickmode = "array"
                  ), width=1100,height=500)
fig.show()

# Share of COVID tweets per user and month.
# pivot() is called with keyword arguments: recent pandas releases reject the
# positional form.
reg_month = regioni.groupby(["user",pd.Grouper(key="datetime",freq="M"),"aboutCovid"])["aboutCovid"].agg(["count"]). \
                reset_index().pivot(index=["user","datetime"],columns="aboutCovid",values="count").reset_index(). \
                rename(columns={True:'yes',False:'no'}).fillna(0)
# 1/(1 + no/yes) == yes/(yes + no); months without COVID tweets give 0.
reg_month["ratio"] = 1/(1 + reg_month.no/reg_month.yes)

fig = px.line(reg_month, x="datetime", y="ratio", color="user", line_group="user", line_shape="spline"). \
        for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))
fig.show()

In [None]:
# Keep only the tweets flagged as COVID-related.
regioni = regioni[regioni.aboutCovid].reset_index(drop=True)
# Fixed-seed sample of 100 tweets exported for manual checking.
regioni.sample(100,random_state=2021)[["user","plain_text"]].to_csv("../REGIONI_100.csv")
# Recorded result (presumably from reviewing the sample; CI = confidence interval):
# not about COVID 1% - CI 0.0|4.7
regioni.to_parquet("../REGIONI_O.pqt")
regioni.shape

In [None]:
# Run the full pipeline, then keep only the columns of interest
# (everything else can be recovered from the original file).
regioni = process(regioni)[["user","datetime","preprocess","doc_vector","pos","sentiment","emotion"]]
regioni.to_parquet("../REGIONI_PS.pqt")

# DATASET TWEET

In [None]:
# Load the tweets; process() expects the text in a column named `plain_text`.
tweet = dt.fread("testi/TWEET.csv.zip").to_pandas().rename(columns={"text": "plain_text"})

In [None]:
# Tweet counts per client application, most frequent first.
sources = tweet["source"].value_counts()
# Positional pick of six sources; .iloc states the positional intent explicitly
# (plain [] with integers on a label-indexed Series is deprecated).
# NOTE(review): positions 0,1,2,3,7,11 are data-dependent - presumably the six
# official Twitter clients kept by the `source` filter in the next cell; verify.
tw_source = sources.iloc[[0,1,2,3,7,11]].copy()
tw_source["Altro"] = sources.sum() - tw_source.sum()

# Name the two output columns explicitly instead of renaming whatever
# reset_index() produces: those default names differ between pandas versions
# and the old rename could yield two "count" columns.
tw_source = tw_source.rename_axis("source").reset_index(name="count")

fig = px.pie(tw_source, names="source", values="count")
fig.show()

In [None]:
# Keep only the tweets posted from these Twitter clients.
tweet = tweet[tweet["source"].isin(('Twitter Web App','Twitter Web Client','Twitter for Android',
                                'Twitter for Mac','Twitter for iPad','Twitter for iPhone'))].reset_index(drop=True)

# Fixed-seed sample of 100 tweets exported for manual checking.
tweet.sample(100,random_state=2021)[["user","plain_text"]].to_csv("../TWEET_100.csv")
# Recorded result (presumably from reviewing the sample; CI = confidence interval):
# not about COVID 2% - CI 0.0|6.2
tweet.to_parquet("../TWEET_O.pqt")
tweet.shape

In [None]:
# spaCy steps only (sent=False), then keep only the columns of interest
# (everything else can be recovered from the original file).
tweet = process(tweet,sent=False)[["datetime","preprocess","doc_vector","pos"]]
tweet.to_parquet("../TWEET_P.pqt")

In [None]:
# Transformer steps only (proc=False). They read `plain_text`, which the spaCy
# cell above drops from `tweet`, so reload the filtered tweets saved as
# TWEET_O.pqt; this works whether or not that cell has already run.
tweet = process(pd.read_parquet("../TWEET_O.pqt"),proc=False)
# Keep only the columns of interest (everything else can be recovered from the original file).
tweet = tweet.loc[:,["sentiment","emotion"]]
tweet.to_parquet("../TWEET_S.pqt")

In [None]:
# Put the spaCy columns and the sentiment/emotion columns side by side
# (axis=1 joins the two frames on their index).
tweet = pd.concat(
    (
        pd.read_parquet("../TWEET_P.pqt"),
        pd.read_parquet("../TWEET_S.pqt"),
    ),
    axis=1,
)