In [1]:
from itertools import cycle
import pandas as pd
import numpy as np
import os
import json

In [2]:
# Names of the CSV files found in the `conjugations` directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the order in which the files are later concatenated reproducible.
files = sorted(file for file in os.listdir('conjugations') if file.endswith('.csv'))

In [34]:
_columns = {
    'pess': 'person',
    'conj': 'conjugation'
}

def _person(s):
    _person = {
        'eu': '1s',
        'tu': '2s',
        'ele/ela': '3s',
        'nós': '1p',
        'vós': '2p',
        'eles/elas': '3p',
        None: None,
        np.nan: np.nan
    }
    return _person[s]

# Short codes for the labels found in the `time` column. The mapping is
# applied with Series.map, so a label that is not listed here becomes NaN.
_time = {
    'futuro': 'fdpres',
    'condicional': 'fdpret',
    'pret. mais-que-perfeito': 'pmpf',
    'presente': 'p',
    'pretérito perfeito': 'pp',
    'pretérito imperfeito': 'pi',
    'infinitivo pessoal': 'ip',
    'afirmativo': 'af',
    'negativo': 'n'
}



def _mode(row):
    _mode = {
        'indicativo': 'ind',
        'conjuntivo subjuntivo (br)': 'sub',
        'imperativo': 'imp',
        'gerundio': 'ger',
        'participio': 'par',
        np.nan: np.nan
    }
    if row.time != 'ip':
        return _mode[row['mode']]
    else:
        return 'ip'

# Person values kept by the `person == @_person_filter` query: first and
# third person, singular and plural ('2s' and '2p' are not listed, so those
# rows are dropped). None and NaN are listed so that rows without a person
# value are kept as well.
_person_filter = ['1s', '3s', '1p', '3p', None, np.nan]

In [35]:
# Preview: concatenate every CSV, rename the columns and recode `time` and
# `mode` only; the `person` labels are left as loaded.
(
    pd.concat([pd.read_csv(f'conjugations/{name}') for name in files])
    .rename(columns=_columns)
    .assign(
        time=lambda frame: frame['time'].map(_time),
        mode=lambda frame: frame.apply(_mode, axis=1),
    )
)

Unnamed: 0,conjugation,infinitive,mode,person,time
0,removendo,remover,ger,,
1,removido,remover,par,,
2,removo,remover,ind,eu,p
3,removi,remover,ind,eu,pp
4,removia,remover,ind,eu,pi
5,removes,remover,ind,tu,p
6,removeste,remover,ind,tu,pp
7,removias,remover,ind,tu,pi
8,remove,remover,ind,ele/ela,p
9,removeu,remover,ind,ele/ela,pp


In [36]:
def try_strip(s):
    """Strip surrounding whitespace from `s` when it has a `strip` method.

    Parameters
    ----------
    s : object
        Usually a string; may also be a value without `strip`, such as None
        or a float NaN.

    Returns
    -------
    object
        `s.strip()` when available, otherwise `s` unchanged.
    """
    try:
        return s.strip()
    except AttributeError:
        # Only the "no strip method" case is expected here. Anything else
        # (including KeyboardInterrupt) should propagate instead of being
        # silently swallowed by a bare `except`.
        return s

# Cleaned table: concatenate every CSV, rename the columns, strip and recode
# the person labels, recode time and mode, then keep only the rows whose
# person is listed in _person_filter.
conjugations = (
    pd.concat([pd.read_csv(f'conjugations/{name}') for name in files])
    .rename(columns=_columns)
    .assign(
        person=lambda frame: frame['person'].apply(try_strip).map(_person),
        time=lambda frame: frame['time'].map(_time),
        mode=lambda frame: frame.apply(_mode, axis=1),
    )
    .query('person == @_person_filter')
)

conjugations.head(20)

Unnamed: 0,conjugation,infinitive,mode,person,time
0,removendo,remover,ger,,
1,removido,remover,par,,
2,removo,remover,ind,1s,p
3,removi,remover,ind,1s,pp
4,removia,remover,ind,1s,pi
8,remove,remover,ind,3s,p
9,removeu,remover,ind,3s,pp
10,removia,remover,ind,3s,pi
11,removemos,remover,ind,1p,p
12,removemos,remover,ind,1p,pp


In [10]:
# Number of distinct infinitives in the cleaned table.
len(conjugations['infinitive'].unique())

997

In [11]:
# Distinct time codes present after recoding.
conjugations['time'].unique()

array([nan, 'p', 'pp', 'pi', 'pmpf', 'fdpres', 'fdpret', 'ip', 'af', 'n'],
      dtype=object)

In [12]:
# Distinct mode codes present after recoding.
conjugations['mode'].unique()

array(['ger', 'par', 'ind', 'sub', 'imp'], dtype=object)

In [39]:
def _first_form(frame, key):
    """Map each value of column `key` to the first conjugation in its group."""
    return {
        label: group.conjugation.values[0]
        for label, group in frame.groupby(key)
    }


# Nested lookup keyed by infinitive, then mode. 'ger' and 'par' hold a single
# form, 'ip' holds person -> form, every other mode holds
# time -> person -> form.
conj = {}
for verb, verb_df in conjugations.groupby('infinitive'):
    by_mode = conj[verb] = {}
    for mode, mode_df in verb_df.groupby('mode'):
        if mode in ('ger', 'par'):
            by_mode[mode] = mode_df.conjugation.values[0]
        elif mode == 'ip':
            by_mode[mode] = _first_form(mode_df, 'person')
        else:
            by_mode[mode] = {
                time: _first_form(time_df, 'person')
                for time, time_df in mode_df.groupby('time')
            }

In [40]:
# Number of verbs (top-level keys) in the nested lookup.
len(conj)

997

In [41]:
# Write the nested lookup to disk as JSON indented by two spaces.
with open('conjugations.json', mode='w') as out:
    json.dump(conj, fp=out, indent=2)

In [16]:
# Load the `verbs` column of the verb list as an array and show its length.
verbs = pd.read_csv('top_1000_verbs_pt.csv')['verbs'].values
len(verbs)

997

In [17]:
# Keep only the listed verbs that have an entry in `conj`, in their original
# order, and write them one per row without header or index.
covered_verbs = [verb for verb in verbs if verb in conj]
pd.DataFrame(covered_verbs).to_csv('verbs.csv', index=False, header=False)