In [1]:
import re
from collections import defaultdict

import pandas as pd
import sentencepiece as spm
from uralicNLP import uralicApi

In [2]:
# Load the lexeme/stem table; the cells below read its `lexeme`, `pos` and `contlex` columns.
data = pd.read_csv("./data/lexeme_stems.csv")

In [3]:
# Summarise columns, non-null counts and dtypes as a quick check of the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28283 entries, 0 to 28282
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lexeme     28283 non-null  object
 1   language   28283 non-null  object
 2   pos        28283 non-null  object
 3   stem_text  28283 non-null  object
 4   contlex    28283 non-null  object
dtypes: object(5)
memory usage: 1.1+ MB


In [4]:
# Frequency of each `contlex` value, most common first.
data["contlex"].value_counts()

contlex
N_SAJOS                   2865
N_SAJOS_USESPELLNOSUGG    1640
N_MAINSTUMMUSH            1546
N_SAJOS_ERRORTH           1417
ADV_                       872
                          ... 
A_PUEAQRDES                  1
V_JOAQTTED_ERRORTH           1
NUM_KUEQHTT                  1
NUM_TAQHTT                   1
N_SIYKKK_SEMGAME             1
Name: count, Length: 1233, dtype: int64

In [5]:
# Number of distinct `contlex` values (a NaN, if present, is counted as a value too).
data["contlex"].nunique(dropna=False)

1233

In [6]:
# Inspect every row assigned to one example `contlex` value.
data.loc[data["contlex"].eq("V_LAEULLAD")]

Unnamed: 0,lexeme,language,pos,stem_text,contlex
4,läukkad,sms,V,lä%^1VOWukk,V_LAEULLAD
413,äinnad,sms,V,ä%^1VOWinn,V_LAEULLAD
760,njõikkad,sms,V,njõ%^1VOWi%{ʹØ%}kk,V_LAEULLAD
1034,härvvad,sms,V,hä%^1VOWrvv,V_LAEULLAD
1776,häuddad,sms,V,hä%^1VOWudd,V_LAEULLAD
1958,åimmad,sms,V,å%^1VOWimm,V_LAEULLAD
2251,äiddad,sms,V,ä%^1VOWidd,V_LAEULLAD
2717,tåiddad,sms,V,tå%^1VOWidd,V_LAEULLAD
2884,läuddad,sms,V,lä%^1VOWudd,V_LAEULLAD
3331,näuddad,sms,V,nä%^1VOWudd,V_LAEULLAD


## Prepare data for BPE

In [7]:
# List the distinct `pos` (part-of-speech) tags present in the data.
data["pos"].unique()

array(['V', 'N', 'A', 'Adv', 'CS', 'CC', 'Adp', 'Pcle', 'Det', 'Interj',
       'Num', 'Pron', 'Po'], dtype=object)

In [8]:
# Restrict the table to rows tagged as nouns ("N") or verbs ("V").
selected_pos_data = data.loc[data["pos"].isin(["N", "V"])]

In [9]:
# Keep rows whose contlex has the shape "<UPPERCASE letters>_<rest>", e.g. "V_MAINSTED".
pattern = re.compile(r'^[A-Z]+_.+')

# .copy() makes `filtered_data` an independent frame, so that adding the
# `forms` column later modifies this frame itself rather than a selection of
# `selected_pos_data` (which pandas reports as SettingWithCopyWarning).
matches_pattern = selected_pos_data['contlex'].map(lambda label: bool(pattern.match(label)))
filtered_data = selected_pos_data[matches_pattern].copy()

In [10]:
# Preview the rows that passed both the POS and the contlex-pattern filters.
filtered_data

Unnamed: 0,lexeme,language,pos,stem_text,contlex
0,taibsted,sms,V,taaibâst,V_MAINSTED
1,ääʹll,sms,N,ää%{ʹØ%}ll,N_SAAQMM
2,njââʹllvaaldõs,sms,N,njââʹllvaaldõ%^1VOW%{ʹØ%}s,N_SAJOS
4,läukkad,sms,V,lä%^1VOWukk,V_LAEULLAD
5,laukkõõllâd,sms,V,laukkõõ%{ʹØ%}ll,V_LAUKKOOLLYD
...,...,...,...,...,...
28278,jieʹllidåhttar,sms,N,jieʹlli#dåhttar,N_AANAR
28279,nuõrrǥaž,sms,N,nuõrrǥ,N_MEERSAZH
28280,looǥǥâlm,sms,N,looǥǥâlm,N_COOGGYLM
28281,paneelsaǥstõõllmõš,sms,N,panẹẹl#saǥ»stõõll»mõ%^1VOW%{ʹØ%}š,N_SAJOS


In [11]:
# Morphological tag sequences to generate per part of speech. Each tag is
# appended to a lexeme as "<lexeme>+<tag>" when the forms are generated.
# An adjective entry ("A": ["A+Attr"]) is currently disabled.
miniparadigms = {
    "V": [
        "V+Ind+Prs+ConNeg",
        "V+Ind+Prs+Sg3",
        "V+Ind+Prt+Sg1",
        "V+Ind+Prt+Sg3",
        "V+Inf",
        "V+Ind+Prs+Sg1",
        "V+Pass+PrfPrc",
        "V+Ind+Prs+Pl3",
        "V+Imprt+Sg3",
        "V+Imprt+Pl3",
    ],
    "N": [
        "N+Sg+Loc",
        "N+Sg+Ill",
        "N+Pl+Gen",
        "N+Sg+Nom",
        "N+Sg+Gen",
        "N+Sg+Loc+PxSg3",
        "N+Ess",
        "N+Der/Dimin+N+Sg+Nom",
        "N+Der/Dimin+N+Sg+Gen",
        "N+Sg+Ill+PxSg1",
    ],
}

In [22]:
# Generate the mini-paradigm surface forms for every lexeme and store them,
# space-separated, in a `forms` column.
#
# The strings are collected in a list and attached once with `assign` instead
# of being written row by row with `.loc` inside `iterrows()`: `filtered_data`
# is a boolean-mask selection of another frame, so per-row writes trigger
# SettingWithCopyWarning. Rebinding the name also keeps the cell idempotent.
# NOTE: one generator call is made per (lexeme, tag) pair, so this can be slow.
forms_per_lexeme = []
for lexeme, pos in zip(filtered_data["lexeme"], filtered_data["pos"]):
    all_forms = []
    # A POS without a mini-paradigm yields an empty string (previously NaN,
    # which the corpus-writing cell would emit as the literal text "nan").
    for form in miniparadigms.get(pos, []):
        result = uralicApi.generate(f"{lexeme}+{form}", "sms", dictionary_forms=False)
        if result:
            # Item [0] of each result entry is taken as the surface form
            # (presumably (form, weight) pairs; confirm against uralicNLP docs).
            all_forms.extend(_f[0] for _f in result)
    forms_per_lexeme.append(" ".join(all_forms))

filtered_data = filtered_data.assign(forms=forms_per_lexeme)

In [23]:
# Persist the filtered table, including the `forms` column, without writing the index.
filtered_data.to_csv('./data/lexeme_data_with_forms_3.csv', index=False)

In [24]:
# Write the BPE training corpus: one line per lexeme, holding the lexeme
# followed by its generated forms, with no newline after the last line.
with open('./data/bpe_lexeme_text_3.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_lines = (
        f"{lexeme_text} {forms_text}"
        for lexeme_text, forms_text in zip(filtered_data['lexeme'], filtered_data['forms'])
    )
    corpus_file.write('\n'.join(corpus_lines))

In [25]:
# Train a BPE SentencePiece model on the corpus written above; the model files
# are named after the `skolt_bpe_3` prefix. The adjacent string literals are
# concatenated into a single command-line style argument string.
spm.SentencePieceTrainer.Train(
    '--input=./data/bpe_lexeme_text_3.txt '
    '--model_prefix=skolt_bpe_3 '
    '--vocab_size=2000 '
    '--model_type=bpe'
)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=./data/bpe_lexeme_text_3.txt --model_prefix=skolt_bpe_3 --vocab_size=2000 --model_type=bpe
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/bpe_lexeme_text_3.txt
  input_format: 
  model_prefix: skolt_bpe_3
  model_type: BPE
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 