In [None]:
!pip install scikit-learn
!pip install pandas
!pip install matplotlib_venn
!pip install beautifulsoup4
!pip install plotly
!pip install pickle
!pip install transformers

In [None]:
import os
import pickle
import string
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import sklearn
import torch
from bs4 import BeautifulSoup, Tag
from matplotlib_venn import venn2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline


In [None]:
# Paths to folders. The three input paths are empty placeholders and must be
# filled in before running: each one is passed to os.chdir() further down.
# Folder with the cleaned Vikidia pages (files whose name starts with "viki")
PATH_CLEANED_VIKIDIA_PAGES = ''
# Folder whose sub-folders contain the Terence ".txt" files
PATH_TERENCE = ''
# Folder containing the "itAoA.xlsx" spreadsheet
PATH_AOA = ""
# Path destination in 'Final': where the resulting pickle is written.
# It is relative, so it is resolved against the working directory in effect
# at that point (the notebook has moved to PATH_AOA by then).
PATH_DESTINATION_PATH = "../../../final_code"

In [None]:
# Display the current working directory (sanity check before changing it)
os.getcwd()

In [None]:
# Move to the folder holding the cleaned Vikidia pages; the following cells
# list and open files relative to the working directory.
os.chdir(PATH_CLEANED_VIKIDIA_PAGES)

In [None]:
# Alphabetically sorted names of every entry in the current working directory
list_all_files = sorted(os.listdir())

In [None]:
def word_in_letter_set(word, acceptable_chars):
    """Tell whether `word` is spelled only with characters from `acceptable_chars`.

    This is a character-level check, not a dictionary lookup.

    Args:
        word: String (or any iterable of hashable items) to validate.
        acceptable_chars: Iterable of the permitted characters.

    Returns:
        True when every distinct character of `word` is permitted (an empty
        `word` is therefore accepted), False otherwise.
    """
    return set(word).issubset(acceptable_chars)
   

In [None]:
# Italian NER model: tokenizer + token-classification head wrapped in a pipeline.
NER_MODEL_NAME = "osiria/bert-italian-uncased-ner"

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
ner_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizerFast.from_pretrained(NER_MODEL_NAME)
model = BertForTokenClassification.from_pretrained(NER_MODEL_NAME).to(ner_device)
# Pass the device explicitly (0 = first GPU, -1 = CPU) so the pipeline places
# its inputs on the same device as the model.
ner = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
    device=0 if ner_device == "cuda" else -1,
)

In [None]:
def find_subarray(arr1, arr2):
    """Locate the first occurrence of `arr2` as a contiguous run inside `arr1`.

    Every element of `arr1` is converted to a lower-cased string before the
    comparison; `arr2` is compared as given.

    Args:
        arr1: Sequence to search in.
        arr2: List to look for.

    Returns:
        The index in `arr1` where the match starts, or -1 when there is none.
    """
    haystack = [str(item).lower() for item in arr1]
    width = len(arr2)
    last_start = len(haystack) - width

    # Slide a window of len(arr2) over the normalised haystack and stop at the first hit
    return next(
        (
            start
            for start in range(last_start + 1)
            if haystack[start:start + width] == arr2
        ),
        -1,
    )

In [None]:
# Labels used to check the type of a word (presumably part-of-speech tags; not referenced again in this notebook)
check_word_in = "AUX", "VERB", "ADJ", "ADV", ""
# Characters a word may contain: lowercase ASCII letters, accented vowels and the space
acceptable_chars = set(string.ascii_lowercase) | set("àèìòùáéíóú") | {" "}

In [None]:
simple_file_all = []

# Load all text of Vikidia: one lower-cased string per page, headings removed.
for name_file in list_all_files:
    # Only entries whose name starts with "viki" are loaded
    if not name_file.startswith("viki"):
        continue

    with open(name_file, 'r', encoding="utf8") as f:
        file_content = f.read()

    # The file content is wrapped in a minimal document so that a <body> element exists
    page_html_format = "<html><head></head><body>"+ file_content +"</body></html>"
    page_to_bs = BeautifulSoup(page_html_format, 'html.parser')

    # Drop section headings (h2 to h5) so that only the running text is kept
    for heading_name in ("h2", "h3", "h4", "h5"):
        for tag in page_to_bs.find_all(heading_name):
            tag.decompose()

    all_full_text = page_to_bs.find("body").get_text()
    simple_file_all.append(all_full_text.strip().lower())

In [None]:
# Scrape data from Terence
def scrape_simplified_data(name_file, tag_parent, id_control = None):
    """Extract the lower-cased text of each child of a tag in a Terence file.

    Args:
        name_file: Path of the UTF-8 file to parse.
        tag_parent: Name of the element whose children are collected.
        id_control: When equal to True, the second `tag_parent` element of the
            file is used; otherwise the first one.

    Returns:
        A list with the lower-cased text of every child of the selected
        element, skipping children whose text is exactly a newline. The list
        is empty when the requested element is not present.
    """
    with open(name_file, 'r', encoding="utf8") as f:
        file_content = f.read()

    # Wrap the content in a single, properly closed root element for the XML parser
    page_to_bs = BeautifulSoup("<file>" + file_content + "</file>", 'xml')

    # Equality with True (rather than truthiness) is kept for backward compatibility
    if id_control == True:
        matches = page_to_bs.find_all(tag_parent)
        semplified_text = matches[1] if len(matches) > 1 else None
    else:
        semplified_text = page_to_bs.find(tag_parent)

    # No such element in this file: nothing to collect
    if semplified_text is None:
        return []

    arr_simpl_text = []
    for child in semplified_text.children:
        text = child.get_text()
        # Children that are only a line break carry no content
        if text != "\n":
            arr_simpl_text.append(text.lower())
    return arr_simpl_text


In [None]:
# Move to the Terence corpus folder; the next cells list its entries
# relative to the working directory.
os.chdir(PATH_TERENCE)

In [None]:
# Entries of the current (Terence) folder; each one is later treated as a
# sub-folder of corpus files.
list_all_folder = os.listdir()

In [None]:
# Display the current working directory (sanity check)
os.getcwd()

In [None]:
all_terence = []
# Load text of Terence: collect the "semplificato" section of every ".txt"
# file found in each sub-folder of the current working directory.
for folder in list_all_folder:
    # Skip stray plain files: only directories can contain corpus files
    if not os.path.isdir(folder):
        continue
    for file_simpl in os.listdir(folder):
        if ".txt" not in file_simpl:
            continue
        # Paths are joined instead of changing directory, so an error in the
        # middle of the loop cannot leave the kernel inside a sub-folder.
        file_to_scrape = os.path.join(folder, file_simpl)
        all_terence.extend(scrape_simplified_data(file_to_scrape, "semplificato"))


In [None]:
# Preview only the first few Terence texts instead of dumping the whole corpus in the output
all_terence[:5]

In [None]:
# Italian stop words (articles, prepositions, pronouns, common verb forms)
# followed by punctuation marks, passed to CountVectorizer as `stop_words`.
# Every entry is followed by a comma: adjacent string literals without one are
# silently concatenated by Python into a single bogus entry.
italian_stopwords = [
    "a", "ad", "agl", "agli", "alle", "allo",
    "c", "che", "chi", "ci", "coi", "col", "con", "cui",
    "da", "dagl", "dagli", "dai", "dal", "dall", "dalla", "dalle", "dallo",
    "degl", "degli", "dei", "del", "dell", "della", "delle", "dello",
    "di", "e", "ed", "era", "eri", "ero", "fu", "gli", "ha", "ho",
    "i", "il", "in", "io",
    "l", "la", "le", "lei", "li", "lo", "lui",
    "ma", "mi",
    "ne", "negl", "negli", "nei", "nel", "nell", "nella", "nelle", "nello",
    "noi", "non",
    "o", "per", "piu",
    "se", "sei", "si", "sia", "sta", "sto",
    "su", "sugl", "sugli", "sui", "sul", "sull", "sulla", "sulle", "sullo",
    "ti", "tra", "tu", "un", "una", "uno", "vi", "voi",
    # Punctuation
    ".", ",", "'", "\"", "!", "@", "#", "?",
    "(", ")", "/", "\\", "[", "]", "{", "}",
    ":", ";", "^", "“", "”",
]

In [None]:
def list_set_to_set_big(list_set):
    """Merge a collection of sets into one set holding all of their elements.

    Args:
        list_set: Iterable of sets (or other iterables of hashable items).

    Returns:
        A new set with every element found in any member of `list_set`;
        an empty set when `list_set` is empty.
    """
    return set().union(*list_set)

# VIKIDIA ALL + AGE + TERENCE

In [None]:
# Display the current working directory (sanity check)
os.getcwd()

In [None]:
# Move to the AoA folder: "itAoA.xlsx" is read next with a relative file name
os.chdir(PATH_AOA)

In [None]:
# Age of Acquisition data: keep the word, its M_AoA value and its word class,
# then retain only the words whose M_AoA is at most 11 (the bound is inclusive).
aoa_dataset = pd.read_excel("itAoA.xlsx")[["Ita_Word", "M_AoA", "WordClass"]]
aoa_dataset_filterd_by_age = aoa_dataset.loc[
    aoa_dataset["M_AoA"] <= 11,
    ["Ita_Word", "WordClass"],
]

In [None]:
# Age-filtered AoA words as a plain Python list, and how many there are
all_aoa = aoa_dataset_filterd_by_age["Ita_Word"].tolist()
len(all_aoa)

In [None]:
# Single list with every document: Vikidia pages, then Terence texts, then AoA words
final_file_all = [*simple_file_all, *all_terence, *all_aoa]

In [None]:
# Learn the shared vocabulary over all three sources (accents stripped, stop words removed)
count_vectorizer_all = CountVectorizer(strip_accents="unicode", analyzer="word", stop_words= italian_stopwords)
matrix_final_count_all = count_vectorizer_all.fit_transform(final_file_all)
# TfidfTransformer accepts the sparse count matrix directly; converting it to a
# dense array first only multiplies the memory footprint.
tfidf_vectorizer_all = TfidfTransformer()
tfidf_vectorizer_all.fit(matrix_final_count_all)
# Vocabulary terms, in the column order of the count matrix
feature_names_all = count_vectorizer_all.get_feature_names_out()
len_feature_names_all = len(feature_names_all)

In [None]:
# Term counts of the Vikidia pages over the shared vocabulary
matrix_count_simple_all = count_vectorizer_all.transform(simple_file_all)
# One row per page: the term itself where it occurs in the page, '' elsewhere
simple_matrix_term_all = np.where(
    matrix_count_simple_all.toarray() > 0,
    feature_names_all,
    '',
)
# Number of rows (pages)
simple_matrix_term_all.shape[0]

In [None]:
# The count matrix is not used again: drop the reference
del matrix_count_simple_all

In [None]:
# Term counts of the Terence texts over the shared vocabulary
matrix_count_terence = count_vectorizer_all.transform(all_terence)
# One row per text: the term itself where it occurs in the text, '' elsewhere
terence_matrix_term_all = np.where(
    matrix_count_terence.toarray() > 0,
    feature_names_all,
    '',
)
# Number of rows (texts)
terence_matrix_term_all.shape[0]

In [None]:
# The count matrix is not used again: drop the reference
del matrix_count_terence

In [None]:
# Distinct entries of each row, i.e. one set of terms per Vikidia page ...
list_set_simple_all = [set(row) for row in simple_matrix_term_all]

# ... and one set of terms per Terence text
list_set_terence = [set(row) for row in terence_matrix_term_all]

In [None]:
# Collapse the per-document term sets into a single set per corpus
every_set_terence = list_set_to_set_big(list_set_terence)
every_set_simple_all = list_set_to_set_big(list_set_simple_all)

In [None]:
# Keep only real terms made of acceptable characters. The empty string must be
# rejected explicitly: it is the '' placeholder that np.where put in the term
# matrices for absent terms, and it would otherwise pass the character check
# and end up in the final vocabulary.
cleaned_every_set_terence = {
    item for item in every_set_terence
    if item and word_in_letter_set(item, acceptable_chars)
}
cleaned_every_set_simple_all = {
    item for item in every_set_simple_all
    if item and word_in_letter_set(item, acceptable_chars)
}
# NOTE(review): the AoA words are taken as-is (no lower-casing, accent
# stripping or character filtering, unlike the two sets above) - confirm this is intended.
cleaned_every_set_aoa = set(aoa_dataset_filterd_by_age["Ita_Word"].tolist())

In [None]:
# Vocabulary sizes, in order: Terence, Vikidia, AoA
print(len(cleaned_every_set_terence))
print(len(cleaned_every_set_simple_all))
print(len(cleaned_every_set_aoa))

In [None]:
# Final vocabulary: union of the Terence, Vikidia and AoA word sets
cleaned_every_set_total_all = cleaned_every_set_terence | cleaned_every_set_simple_all | cleaned_every_set_aoa

In [None]:
# Size of the final vocabulary
len(cleaned_every_set_total_all)

In [None]:
# Two-set Venn diagram: Vikidia vocabulary against the AoA word list
venn = venn2(
    [cleaned_every_set_simple_all, cleaned_every_set_aoa],
    set_labels=('Vikidia-all', 'itAoA' ),
)

# Shrink the region counts; a region without a label (None, empty intersection) is skipped
for text in venn.subset_labels:
    if not text:
        continue
    text.set_fontsize(8)

plt.title("Word Intersection")
plt.show()

In [None]:
# Move to the output folder. PATH_DESTINATION_PATH is relative, so it is
# resolved against the current working directory (the AoA folder at this point).
os.chdir(PATH_DESTINATION_PATH)

In [None]:
# Serialize the final vocabulary set to a pickle file in the current directory
with open('cleaned_every_set_total_all.pickle', 'wb') as file:
    pickle.dump(cleaned_every_set_total_all, file)