# Semantic change detection task 2021-2022<br/>

## 1.0_File_txt_Preprocessing and disambiguation<br/>

Matteo Cesaro - matteo.t.cesaro@gmail.com<br/>

# Mounting drive and libraries

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
# force_remount=True asks Colab to mount again even if a mount already exists in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Installing GENRE for disambiguation

https://github.com/facebookresearch/GENRE

In [None]:
# Clone the fairseq fork (branch fixing_prefix_allowed_tokens_fn) and install it in editable mode.
# NOTE(review): when ./fairseq already exists, `git clone` aborts with "destination path
# 'fairseq' already exists" (see the recorded output); the editable install still runs.
!git clone --branch fixing_prefix_allowed_tokens_fn https://github.com/nicola-decao/fairseq
%cd fairseq
!pip install --editable ./

fatal: destination path 'fairseq' already exists and is not an empty directory.
/content/fairseq
Obtaining file:///content/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Installing collected packages: fairseq
  Running setup.py develop for fairseq
Successfully installed fairseq


In [None]:
!pip install --upgrade git+git://github.com/facebookresearch/GENRE.git

Collecting git+git://github.com/facebookresearch/GENRE.git
  Cloning git://github.com/facebookresearch/GENRE.git to /tmp/pip-req-build-76nssi5x
  Running command git clone -q git://github.com/facebookresearch/GENRE.git /tmp/pip-req-build-76nssi5x


In [None]:
# Install additional packages with pip.
!pip install unidecode
!pip install requests
# NOTE(review): the recorded output of this cell shows pip finding no distribution named
# "kilt", so the next line fails as written — confirm whether the package is required
# and, if so, where it should be installed from.
!pip install kilt
#!pip install fairseq
!pip install transformers

[31mERROR: Could not find a version that satisfies the requirement kilt (from versions: none)[0m
[31mERROR: No matching distribution found for kilt[0m


In [None]:
# load the prefix tree (trie)
# Load the prefix tree (trie) of KILT titles; it is later passed to GENRE through
# prefix_allowed_tokens_fn to constrain which entity names can be generated.
# The imports are repeated here so this cell also works when the notebook is run
# top to bottom (the shared "Libraries" cell appears further down).
import pickle

from genre.trie import Trie

# NOTE: pickle.load can execute arbitrary code contained in the file — only load a
# trie file obtained from a trusted source.
with open("/.../kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [None]:
# Load the pretrained GENRE entity-disambiguation model and switch it to evaluation mode.
# GENRE is imported here so this cell also works when the notebook is run top to
# bottom (the shared "Libraries" cell appears further down).
from genre.fairseq_model import GENRE

model = GENRE.from_pretrained("/.../fairseq_entity_disambiguation_blink").eval()

1042301B [00:00, 2294631.45B/s]
456318B [00:00, 1372051.52B/s]


Parameters


In [None]:
 # Global parameters shared by every preprocessing run below.
 # nltk and its stop-word list are set up here so this cell also works when the
 # notebook is run top to bottom (the shared "Libraries" cell appears further down).
 import nltk
 nltk.download('stopwords')
 from nltk.corpus import stopwords

 # English stop words removed from the lemma stream.
 stop_words = set(stopwords.words('english'))
 # Two-group regexes; each match is rewritten as "<group1>-<group2>" (e.g. climate-change).
 word_phrases_target = ["(climate) (change)", "(global) (warming)", "(renewable) (energy)", "(carbon) (footprint)"]
 # Lemmas whose occurrences are checked with GENRE.
 words_to_disambiguate = ["climate", "warming", "temperature", "emission", "pollution"]

## Libraries

In [None]:
#Text preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#File managing
import glob
import csv
import re
import pandas as pd
import string

#GENRE
import pickle
from genre.trie import Trie
from genre.fairseq_model import GENRE

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing

``Disambiguation with GENRE``: this step identifies the occurrences of the target words that are actually used in the context of climate change.
For each word to be disambiguated, GENRE returns a list of associated entities, ordered by confidence score.
An appropriate threshold for accepting the entities proposed by GENRE was identified through manual validation.
That validation yielded an average coefficient of -1.6, which we use as our confidence threshold.

In [None]:
def GENRE_disambiguation(df_wlp, word_to_disambiguate, score_threshold=-1.6):
  """Relabel climate-related occurrences of one lemma using GENRE.

  For every row whose lemma equals ``word_to_disambiguate`` (lower-cased), the
  surrounding sentence (the rows between the neighbouring "." lemmas, or the
  start/end of the frame when there is no such neighbour) is sent to GENRE with
  that single occurrence wrapped in [START_ENT]...[END_ENT]. If one of the
  entities returned with a score above ``score_threshold`` belongs to the
  climate-related titles configured for the lemma, the row's lemma is replaced
  by the corresponding label (e.g. "climate" -> "climate-change").

  Sentences that already contain "climate change" (for "climate") or
  "global warming" (for "warming") are skipped: those word phrases are joined
  with a hyphen later in the pipeline, so no model call is needed.

  Relies on the notebook-level globals ``model`` (GENRE model) and ``trie``
  (prefix tree of allowed entity titles).

  Args:
    df_wlp: DataFrame with at least the columns "word" and "lemma". It is
      modified in place: its index is replaced by 0..len-1 (named "id") and
      matching lemmas are relabelled.
    word_to_disambiguate: lemma to look for (case-insensitive).
    score_threshold: minimum GENRE score for an entity to be trusted.

  Returns:
    The same DataFrame object, after relabelling.
  """
  # lemma -> (label to assign, lower-cased entity titles that trigger the label)
  rules = {
    "climate": ("climate-change",
                ("climate change", "global warming")),
    "warming": ("global-warming",
                ("climate change", "global warming", "human impact on the environment")),
    "pollution": ("pollution_climate-change",
                  ("human impact on the environment", "air pollution", "marine pollution",
                   "greenhouse gas", "water pollution")),
    "emission": ("emission_climate-change",
                 ("air pollution", "greenhouse gas", "carbon dioxide in earth's atmosphere",
                  "exhaust gas")),
    "temperature": ("temperature_climate-change",
                    ("atmosphere of earth", "atmospheric temperature")),
  }
  # Word phrases that make the model call unnecessary when present in the sentence.
  skip_phrases = {"climate": "climate change", "warming": "global warming"}

  word_to_disambiguate_low = word_to_disambiguate.lower()

  # New positional index, so that row labels and row positions coincide.
  df_wlp["id"] = range(0, len(df_wlp))
  df_wlp.set_index("id", inplace=True)

  rule = rules.get(word_to_disambiguate_low)
  if rule is None:
    # No label is defined for this lemma, so nothing could ever be relabelled.
    return df_wlp
  label, accepted_entities = rule
  skip_phrase = skip_phrases.get(word_to_disambiguate_low)

  # Snapshot of the lemmas: plain-list lookups are far cheaper than df.loc per row,
  # and relabelling never creates or removes a "." sentence delimiter.
  lemmas = df_wlp["lemma"].tolist()
  words = df_wlp["word"]
  last_row = len(lemmas) - 1

  # Occurrences of the word to disambiguate.
  occurrences = [row for row, lemma in enumerate(lemmas) if lemma == word_to_disambiguate_low]

  for row in occurrences:
    # Sentence span from "." to "." (exclusive), clamped to the frame boundaries so
    # that a first/last sentence without a delimiter does not run off the index.
    start = row
    while start > 0 and lemmas[start - 1] != ".":
      start -= 1
    end = row
    while end < last_row and lemmas[end + 1] != ".":
      end += 1

    # Raw sentence, used to detect an already explicit word phrase.
    raw_text = words.iloc[start:end + 1].str.cat(sep=' ')
    if skip_phrase and re.search(skip_phrase, raw_text, re.IGNORECASE):
      continue

    # Mark only the occurrence being examined; other occurrences (or words that
    # merely contain it) must stay unmarked for the disambiguation model.
    left_context = words.iloc[start:row].str.cat(sep=' ')
    right_context = words.iloc[row + 1:end + 1].str.cat(sep=' ')
    marked_word = f"[START_ENT]{words.iloc[row]}[END_ENT]"
    input_GENRE = ' '.join(part for part in (left_context, marked_word, right_context) if part)

    output_GENRE = model.sample(input_GENRE, prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),)

    # Entities returned with a score above the confidence threshold.
    entities_confident = {res["text"].lower() for res in output_GENRE if res["score"].item() > score_threshold}

    # Replace the lemma with the fixed label when a climate-related entity was found.
    if entities_confident.intersection(accepted_entities):
      df_wlp.loc[row, "lemma"] = label
  return df_wlp

``Preprocessing``:


*   Lemmatization
*   Removing HTML tags
*   Removing punctuation
*   Removing stop-words
*   Normalization


In [None]:
def pre_processing(files_list, stop_words_set, word_phrases, words_to_disambiguate):
  """Turn word/lemma/PoS files into one cleaned, space-separated lemma string.

  Each file is read as a tab-separated table with the columns word, lemma and
  pos, then processed on its own (one file at a time, to limit memory use):
  "<p>" tag rows are dropped, the requested lemmas are disambiguated with
  GENRE_disambiguation, rows tagged "y", "ge" (Saxon genitive) or "..." are
  removed, "n't" is normalised to "not", and punctuation and stop words are
  filtered out. The remaining lemmas of all files are joined with single
  spaces, and finally every word phrase is rewritten as "<group1>-<group2>".

  Args:
    files_list: paths of the files to process; they are handled in sorted
      order and the list itself is left untouched.
    stop_words_set: lemmas to discard.
    word_phrases: regexes with two capture groups to be joined by a hyphen.
    words_to_disambiguate: lemmas passed one by one to GENRE_disambiguation.

  Returns:
    The preprocessed text as a single string ("" when nothing is left).
  """
  punctuation_marks = list(string.punctuation)
  dropped_pos_tags = ["y", "ge", "..."]  # ge -> Saxon genitive
  cleaned_parts = []

  for path in sorted(files_list):
    print(f"Appending file {path}")

    # Read the file as a word / lemma / pos table.
    df = pd.read_csv(path, sep="\t", names=["word", "lemma", "pos"], quoting=csv.QUOTE_NONE, encoding="unicode_escape")

    # Drop the HTML tag rows; copy so later in-place edits act on an independent frame.
    df = df[df["word"] != "<p>"].copy()

    # Disambiguation.
    for word in words_to_disambiguate:
      df = GENRE_disambiguation(df, word)

    # Cleaning: remove punctuation tags, normalise the negation, drop stop words.
    df = df[~df["pos"].isin(dropped_pos_tags)].copy()
    df.loc[df["lemma"] == "n't", "lemma"] = "not"
    mask = df["lemma"].isin(punctuation_marks) | df["lemma"].isin(stop_words_set)

    file_text = df.loc[~mask, "lemma"].str.cat(sep=' ')
    if file_text:
      cleaned_parts.append(file_text)

  # Join with a space so the last lemma of one file never fuses with the first
  # lemma of the next one.
  full_preproc = ' '.join(cleaned_parts)

  # Concatenate the word phrases.
  for wp in word_phrases:
    full_preproc = re.sub(wp, "\\1-\\2", full_preproc)

  return full_preproc

# Newspapers preprocessing and disambiguation 


In [None]:
# Newspaper word/lemma/PoS files matching the pattern, in sorted path order.
files_newsppr_wlp = sorted(glob.glob("/.../wlp_news_znw*/*.txt"))
files_newsppr_wlp

['/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1990.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1991.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1992.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1993.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1994.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1995.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1996.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1997.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1998.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Newspaper/wlp_news_znw/wlp_news_1999.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COC

In [None]:
# Preprocess every newspaper file on its own and write the result to
# news_<year>.txt; the j-th file (in sorted order) gets the j-th year of 1990..2019.
years = range(1990, 2020)
for j, wlp_path in enumerate(files_newsppr_wlp):
  text = pre_processing([wlp_path], stop_words, word_phrases_target, words_to_disambiguate)
  with open(f"/.../news_{years[j]}.txt", "w") as out_file:
    out_file.write(text)
  print(f"news_{years[j]}.txt")
  print("-----------------------------------------------------------")

# Blog preprocessing and disambiguation 

In [None]:
# Blog word/lemma/PoS files matching the pattern, in sorted path order.
files_blogs_wlp = sorted(glob.glob("/.../wlp_blog_qie*/*.txt"))
files_blogs_wlp

['/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_01.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_02.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_03.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_04.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_05.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_06.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_07.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_08.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_09.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_10.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Blogs/wlp_blog_qie/wlp_blog_11.txt',
 '/content/drive/MyDr

In [None]:
# Preprocess every blog file on its own and write the result to blog_<n>.txt,
# where n is the 1-based position of the file in sorted order.
years = range(1, len(files_blogs_wlp) + 1)
for j, wlp_path in enumerate(files_blogs_wlp):
  text = pre_processing([wlp_path], stop_words, word_phrases_target, words_to_disambiguate)
  with open(f"/.../blog_{years[j]}.txt", "w") as out_file:
    out_file.write(text)
  print(f"blog_{years[j]}.txt")
  print("-----------------------------------------------------------")

# Academic preprocessing and disambiguation 


In [None]:
# Academic word/lemma/PoS files matching the pattern, in sorted path order.
files_acad_wlp = sorted(glob.glob("/.../wlp_acad_vuw*/*.txt"))
files_acad_wlp

['/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1990.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1991.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1992.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1993.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1994.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1995.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1996.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1997.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1998.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic/wlp_acad_vuw/wlp_acad_1999.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Academic

In [None]:
# Preprocess every academic file on its own and write the result to
# acad_<year>.txt; the j-th file (in sorted order) gets the j-th year of 1990..2019.
years = range(1990, 2020)
for j, wlp_path in enumerate(files_acad_wlp):
  text = pre_processing([wlp_path], stop_words, word_phrases_target, words_to_disambiguate)
  with open(f"/.../acad_{years[j]}.txt", "w") as out_file:
    out_file.write(text)
  print(f"acad_{years[j]}.txt")
  print("-----------------------------------------------------------")

# Magazine preprocessing and disambiguation 

In [None]:
# Magazine word/lemma/PoS files matching the pattern, in sorted path order.
files_mag_wlp = sorted(glob.glob("/.../wlp_mag_dhk*/*.txt"))
files_mag_wlp

['/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1990.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1991.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1992.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1993.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1994.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1995.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1996.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1997.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1998.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag_1999.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Magazine/wlp_mag_dhk/wlp_mag

In [None]:
# Preprocess every magazine file on its own and write the result to
# mag_<year>.txt; the j-th file (in sorted order) gets the j-th year of 1990..2019.
years = range(1990, 2020)
for j, wlp_path in enumerate(files_mag_wlp):
  text = pre_processing([wlp_path], stop_words, word_phrases_target, words_to_disambiguate)
  with open(f"/.../mag_{years[j]}.txt", "w") as out_file:
    out_file.write(text)
  print(f"mag_{years[j]}.txt")
  print("-----------------------------------------------------------")

# Spoken preprocessing and disambiguation 

In [None]:
# Spoken word/lemma/PoS files matching the pattern, in sorted path order.
files_spok_wlp = sorted(glob.glob("/.../wlp_spok_cud*/*.txt"))
files_spok_wlp

['/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1990.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1991.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1992.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1993.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1994.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1995.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1996.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1997.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1998.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok_1999.txt',
 '/content/drive/MyDrive/WORK AND PROJECTS/NLP/COCA/Spoken/wlp_spok_cud/wlp_spok

In [None]:
# Preprocess every spoken file on its own and write the result to
# spok_<year>.txt; the j-th file (in sorted order) gets the j-th year of 1990..2019.
years = range(1990, 2020)
for j, wlp_path in enumerate(files_spok_wlp):
  text = pre_processing([wlp_path], stop_words, word_phrases_target, words_to_disambiguate)
  with open(f"/.../spok_{years[j]}.txt", "w") as out_file:
    out_file.write(text)
  print(f"spok_{years[j]}.txt")
  print("-----------------------------------------------------------")