In [1]:
import os
import pandas as pd
from typing import *
from collections import defaultdict

In [None]:
# Project root. Defaults to the original author's checkout; set the
# PERSIAN_PROJECT_DIR environment variable to run the notebook from another
# location without editing this cell.
main_dir = os.environ.get('PERSIAN_PROJECT_DIR', '/Users/marjan/Desktop/project/')

# NOTE: despite the `_dir` suffix, every name below is a *file* path.
# Input: CoNLL-U file that get_UD_data() is called on.
UD_dir = os.path.join(main_dir, 'data', 'UD_Persian-PerDT', 'fa_perdt-ud-train.conllu')
# Outputs: one CSV per table written by the cells further down.
persian_adj_dir = os.path.join(main_dir, 'output', 'Persian', 'persian_adjs.csv')
persian_adv_dir = os.path.join(main_dir, 'output', 'Persian', 'persian_advs.csv')
persian_intersection_dir = os.path.join(main_dir, 'output', 'Persian', 'persian_inter.csv')

In [None]:
def ADJ_ADV_intersection(adj_df: pd.DataFrame, adv_df: pd.DataFrame) -> pd.DataFrame:
    """Build a table of the tokens that occur in both ``adj_df`` and ``adv_df``.

    Both inputs are expected to be indexed by token and to carry a count
    column (``adj_count`` / ``adv_count``) plus an example-sentence column
    labelled ``0`` (the first recorded example for the token).

    Parameters
    ----------
    adj_df : table of adjective tokens.
    adv_df : table of adverb tokens.

    Returns
    -------
    A new DataFrame indexed by the shared tokens, in the order they appear in
    ``adv_df``, with the columns ``adj_count``, ``adv_count``,
    ``adj_example``, ``adv_example``. Neither input is modified.

    Raises
    ------
    KeyError
        If a required count column or example column ``0`` is missing.
    """
    # Tokens present in both tables, keeping adv_df's row order.
    shared = adv_df.index[adv_df.index.isin(adj_df.index)]
    adj_rows = adj_df.loc[shared]
    adv_rows = adv_df.loc[shared]

    # Column 0 is the first example on both sides. It is always populated,
    # whereas column 1 is missing for tokens seen only once and may not exist
    # at all when no token has a second example.
    return pd.DataFrame(
        {
            'adj_count': adj_rows['adj_count'],
            'adv_count': adv_rows['adv_count'],
            'adj_example': adj_rows[0],
            'adv_example': adv_rows[0],
        },
        index=shared,
    )

In [None]:
def create_df(dct: Dict[str, List[str]], category='adj') -> pd.DataFrame:
    """Turn a token -> example-sentences mapping into a table with a count column.

    Parameters
    ----------
    dct : mapping from a token to the list of sentences collected for it.
    category : prefix of the count column, which is named ``<category>_count``.

    Returns
    -------
    A DataFrame indexed by token. Its first column, ``<category>_count``,
    holds the number of sentences listed for the token; the remaining columns
    ``0, 1, 2, ...`` hold the sentences themselves, with missing values where
    a token has fewer sentences than the longest list.
    """
    # 'index' is the documented spelling of this orient value.
    df = pd.DataFrame.from_dict(dct, orient='index')

    # Compute every count in one pass and attach them as a single column,
    # instead of assigning cell by cell with .loc. Looking tokens up through
    # df.index keeps counts aligned with the rows whatever their order.
    counts = pd.Series(
        [len(dct[token]) for token in df.index],
        index=df.index,
        dtype='int64',
    )
    df.insert(0, category + '_count', counts)
    return df

In [None]:
def get_UD_data(dir: str) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """Collect example sentences for every ADJ and ADV word form in a CoNLL-U file.

    Parameters
    ----------
    dir : path of the CoNLL-U *file* to read (the name is kept for
        backward compatibility even though it is not a directory).

    Returns
    -------
    ``(ADJs, ADVs)``: two ``defaultdict(list)`` objects mapping a word form
    (column 2, FORM) to the list of sentence texts in which that form carries
    the UPOS tag ``ADJ`` or ``ADV`` (column 4). A sentence is listed once per
    matching token, so it can appear several times for the same form.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    ADJs = defaultdict(list)
    ADVs = defaultdict(list)
    by_tag = {'ADJ': ADJs, 'ADV': ADVs}
    sentence = ''

    # CoNLL-U files are UTF-8; do not depend on the platform default encoding.
    # 'utf-8-sig' also drops a leading byte-order mark if one is present.
    with open(dir, encoding='utf-8-sig') as file:
        for raw in file:
            line = raw.strip()

            if not line:
                # Blank line = sentence boundary. Forget the text so a
                # following sentence without a "# text" comment does not
                # inherit the previous sentence's text.
                sentence = ''
                continue

            if line.startswith('#'):
                # Comment lines look like "# key = value".
                key, _, value = line[1:].partition('=')
                if key.strip() == 'text':
                    # Collapse runs of whitespace to single spaces.
                    sentence = ' '.join(value.split())
                continue

            # Columns are tab-separated and FORM may itself contain spaces, so
            # split on tabs; fall back to whitespace for tab-less input.
            fields = line.split('\t') if '\t' in line else line.split()

            # Only plain integer IDs are word lines; this skips multiword
            # ranges ("1-2"), empty nodes ("1.1") and malformed short lines.
            if len(fields) < 4 or not fields[0].isdigit():
                continue

            examples = by_tag.get(fields[3])
            if examples is not None:
                examples[fields[1]].append(sentence)

    return ADJs, ADVs

In [None]:
# Parse the treebank file once. Each result maps a word form to the list of
# sentence texts in which it was tagged ADJ / ADV respectively.
persian_adjs, persian_advs = get_UD_data(UD_dir)

In [None]:
# One table per part of speech: a `<category>_count` column followed by the
# example sentences for each token.
persian_adj_df = create_df(persian_adjs, category='adj')
persian_adv_df = create_df(persian_advs, category='adv')

In [None]:
# to_csv does not create missing folders, so make sure the output
# directories exist before writing (a no-op when they already do).
for csv_path in (persian_adj_dir, persian_adv_dir):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

persian_adj_df.to_csv(persian_adj_dir)
persian_adv_df.to_csv(persian_adv_dir)

In [None]:
# Tokens that occur both as adjective and as adverb, with counts and one
# example sentence for each reading.
persian_intersection = ADJ_ADV_intersection(persian_adj_df, persian_adv_df)

# to_csv does not create missing folders; make sure the target one exists.
os.makedirs(os.path.dirname(persian_intersection_dir), exist_ok=True)
persian_intersection.to_csv(persian_intersection_dir)