# Spanish morphology dataset creator

## Installations and imports

In [None]:
# Install Hugging Face transformers (provides BertTokenizer used below).
!pip install transformers
# Fetch the BETO "cased_2M" archive of PyTorch weights plus its vocabulary and config files.
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json
# Unpack the archive; presumably it creates the local `pytorch/` directory used below — TODO confirm.
!tar -xzvf pytorch_weights.tar.gz
# Place config and vocabulary inside `pytorch/` so the directory can be passed to from_pretrained().
!mv config.json pytorch/.
!mv vocab.txt pytorch/.

# Upgrade spaCy and download the medium Spanish pipeline loaded later with spacy.load().
!pip install -U spacy
!python -m spacy download es_core_news_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
--2023-03-27 16:45:08--  https://users.dcc.uchile.cl/~j

In [None]:
from transformers import BertTokenizer
import pandas as pd
import spacy
import re
import os



## Mount Google Drive

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so the data paths
# used in the following cells (all under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Create the tokenizer and load its vocabulary

In [None]:
# Build the tokenizer from the local `pytorch/` directory (vocab.txt and config.json
# were moved there by the installation cell); casing is preserved (do_lower_case=False).
tokenizer = BertTokenizer.from_pretrained("pytorch/", do_lower_case=False)
# Token -> id mapping; only its keys are used later, to check that a lemma is a vocabulary entry.
vocab = tokenizer.get_vocab() # len = 31002 tokens

## Optional: create the first lists of affixes from MorphyNet

In [None]:
# derivatives_path = '/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/spa.derivational.v1.tsv'
# affix_df = pd.read_csv(derivatives_path, sep = "\t", header = None, usecols = [4,5])
# pref_list = []
# suf_list = []
# for index, row in affix_df.iterrows():
#   if row[5] == 'prefix':
#     if row[4] in vocab.keys():
#       pref_list.append(row[4])
#   if row[5] == 'suffix':
#     if "##"+row[4] in vocab.keys():
#       suf_list.append(row[4])
# pref_list = list(set(pref_list))
# suf_list = list(set(suf_list))
# pd.DataFrame(pref_list).to_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/all-prefixes.csv")
# pd.DataFrame(suf_list).to_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/all-suffixes.csv")

## Read the manually created lists of prefixes and suffixes

In [None]:
# Load the curated affix lists from Drive. The files are read without a header row,
# so columns are labelled by position; the extraction loop reads the affix from column 0.
final_pref = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/es-final-pref.csv", header=None)
final_suf = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/es-final-suf.csv", header=None)

## Load the spaCy pipeline

In [None]:
# Spanish spaCy pipeline downloaded in the installation cell; used below to
# lemmatize the base extracted from each derivative.
nlp = spacy.load("es_core_news_md")

## Iterate through all texts and extract both datasets 

In [None]:
import time

# Column layout shared by both output datasets.
DATASET_COLUMNS = ['derivative', 'base', 'sentence']
# A lemma is kept only when it is strictly longer than this many characters.
MIN_LEMMA_LEN = 3
# One or more characters that may form the rest of a word next to an affix.
WORD_CHARS = r'[a-zA-Z0-9ñÑáéíóúÁÉÍÓÚüÜñÑ]+'

rootdir = '/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA'

# Compile every affix pattern once instead of rebuilding it for every sentence.
# The named group `base` captures the part of the word next to the affix, which
# replaces the second look-around search the loop used to run per match.
# NOTE(review): affixes are interpolated into the pattern unescaped; this assumes
# the CSV entries are plain strings without regex metacharacters — TODO confirm.
prefix_patterns = [
    re.compile(r'\b' + prefix + '(?P<base>' + WORD_CHARS + ')')
    for prefix in final_pref[0]
]
suffix_patterns = [
    re.compile('(?P<base>' + WORD_CHARS + ')' + suffix + r'\b')
    for suffix in final_suf[0]
]

# Memoized lemmas: the same base recurs constantly across sentences, so each
# distinct base is sent through the spaCy pipeline only once.
lemma_cache = {}


def clean_text(line):
    """Normalize one raw sentence fragment.

    Runs of hyphens become a space, every run of characters other than
    unaccented/accented Spanish letters, commas and spaces becomes a space,
    repeated spaces are collapsed, and a final period is appended.
    """
    line = re.sub(r'-+', ' ', line)
    line = re.sub(r'[^a-zA-Z, áéíóúÁÉÍÓÚüÜñÑ]+', " ", line)
    line = re.sub(r'[ ]+', " ", line)
    return line + "."


def lemmatize(word):
    """Return the spaCy lemma of the first token of `word`, cached per distinct word."""
    if word not in lemma_cache:
        lemma_cache[word] = nlp(word)[0].lemma_
    return lemma_cache[word]


def find_example(line, patterns):
    """Return the first usable affix example found in `line`, or None.

    Patterns are tried in order. For each pattern only its first match in the
    sentence is considered; the match is accepted when the lemma of its base is
    a tokenizer vocabulary entry longer than MIN_LEMMA_LEN characters. On
    acceptance the search stops and a dict with the derivative, the base lemma
    and the masked sentence is returned.
    """
    for pattern in patterns:
        match = pattern.search(line)
        if match is None:
            continue
        lemma = lemmatize(match.group('base'))
        if lemma in vocab and len(lemma) > MIN_LEMMA_LEN:
            derivative = match.group(0)
            # Mask whole-word occurrences only: plain str.replace would also
            # rewrite the derivative when it is embedded in a longer word.
            sentence = re.sub(r'\b' + re.escape(derivative) + r'\b', '[MASK]', line)
            return {'derivative': derivative, 'base': lemma, 'sentence': sentence}
    return None


start = time.time()

# Rows are accumulated in plain lists and converted to DataFrames once at the
# end; concatenating a DataFrame per row is quadratic in the number of rows.
prefix_records = []
suffix_records = []

for subdir, dirs, files in os.walk(rootdir):
    for filename in files:
        path = os.path.join(subdir, filename)
        print(path)

        # Preprocessing: split the document on periods and clean each fragment.
        # The encoding is explicit because the corpus contains accented characters.
        with open(path, 'r', encoding='utf-8') as f:
            fragments = f.read().split(".")

        for fragment in fragments:
            line = clean_text(fragment)

            # At most one prefix example and one suffix example per sentence.
            prefix_example = find_example(line, prefix_patterns)
            if prefix_example is not None:
                prefix_records.append(prefix_example)

            suffix_example = find_example(line, suffix_patterns)
            if suffix_example is not None:
                suffix_records.append(suffix_example)

        # Cumulative elapsed time since the start of the whole run.
        end = time.time()
        print("Document processed")
        print(end - start)

prefix_dataset = pd.DataFrame(prefix_records, columns=DATASET_COLUMNS)
suffix_dataset = pd.DataFrame(suffix_records, columns=DATASET_COLUMNS)

/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_76
Document processed
122.79730749130249
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_20
Document processed
268.5384976863861
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_17
Document processed
405.95993733406067
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_12
Document processed
529.8944702148438
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_16
Document processed
657.037778377533
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_13
Document processed
784.2273368835449
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_11
Document processed
915.7347667217255
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA/wiki_10
Document processed
1038.8724496364594
/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/text/AA

## Save datasets as CSV

In [None]:
# Persist the prefix dataset to Drive as UTF-8 CSV (the DataFrame index is written too, as to_csv does by default).
prefix_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/prefix_dataset.csv", encoding="utf-8")

In [None]:
# Persist the suffix dataset to Drive as UTF-8 CSV (the DataFrame index is written too, as to_csv does by default).
suffix_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/spanish-morphology/data/suffix_dataset.csv", encoding="utf-8")