In [1]:
import os
import pandas as pd
from typing import List, Tuple, Dict
from collections import defaultdict

In [2]:
# Project root that holds the `data/` and `output/` folders used by the cells below.
# Set the ADJ_ADV_PROJECT_ROOT environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
root_dir = os.environ.get('ADJ_ADV_PROJECT_ROOT', '/Users/marjan/Desktop/project/')

In [3]:
def ADJ_ADV_intersection(adjs: pd.DataFrame, advs: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of both frames whose word occurs as adjective AND as adverb.

    Parameters
    ----------
    adjs : pd.DataFrame
        Adjective rows. Between them, `adjs` and `advs` must provide the columns
        'word', 'tag', 'adj_count', 'adv_count', 'sent', 'source',
        'src_sent_count' and 'src_word_count'.
    advs : pd.DataFrame
        Adverb rows.

    Returns
    -------
    pd.DataFrame
        The rows of `adjs` whose word also appears in `advs`, followed by the
        rows of `advs` whose word also appears in `adjs`, sorted by
        ('word', 'tag'). For every word, 'adj_count' / 'adv_count' are
        overwritten with the number of its rows kept from `adjs` / `advs`,
        stored as integers. Original index labels are preserved, so the
        result may contain duplicate index values.
    """
    shared_adjs = adjs[adjs['word'].isin(advs['word'])]
    shared_advs = advs[advs['word'].isin(adjs['word'])]

    df = pd.concat([shared_adjs, shared_advs])
    df = df[['word', 'tag', 'adj_count', 'adv_count', 'sent', 'source', 'src_sent_count', 'src_word_count']]
    df = df.sort_values(by=['word', 'tag'])

    # Count the rows per word once per frame and broadcast the totals, instead
    # of re-scanning both frames with a boolean mask for every shared word.
    # Every word in `df` is present in both filtered frames, so no NaN is left.
    df['adj_count'] = df['word'].map(shared_adjs['word'].value_counts())
    df['adv_count'] = df['word'].map(shared_advs['word'].value_counts())

    df['adj_count'] = df['adj_count'].astype(int)
    df['adv_count'] = df['adv_count'].astype(int)

    return df

In [4]:
def create_df(lst: List[Tuple[str, str]], sent_count: int, word_count: int, category: str = 'adj', threshold: int = 1) -> pd.DataFrame:
    """Build a frequency table from (word, sentence) pairs.

    Parameters
    ----------
    lst : List[Tuple[str, str]]
        (word, sentence) pairs; exact duplicate pairs are dropped.
    sent_count : int
        Stored unchanged in the 'src_sent_count' column of every row.
    word_count : int
        Stored unchanged in the 'src_word_count' column of every row.
    category : str
        Value of the 'tag' column and prefix of the count column
        ('<category>_count').
    threshold : int
        Rows are kept only when their word has at least this many rows
        (i.e. distinct sentences) after de-duplication.

    Returns
    -------
    pd.DataFrame
        Columns 'word', '<category>_count', 'tag', 'sent', 'source',
        'src_sent_count', 'src_word_count', sorted by the count column in
        descending order. 'source' is always 'UD'.
    """
    count = category + '_count'

    df = pd.DataFrame(lst, columns=['word', 'sent']).drop_duplicates()

    # Rows per word computed in a single pass (value_counts) and broadcast back
    # to the rows, instead of filtering the whole frame once per unique word.
    # The cast keeps an integer column even when `lst` is empty.
    df[count] = df['word'].map(df['word'].value_counts()).astype('int64')
    df['tag'] = category
    df['source'] = 'UD'
    df['src_sent_count'] = sent_count
    df['src_word_count'] = word_count

    df = df[df[count] >= threshold].sort_values(by=[count], ascending=False)

    return df[['word', count, 'tag', 'sent', 'source', 'src_sent_count', 'src_word_count']]

In [5]:
def get_UD_data(dirs: List[str], adj_tags=['ADJ'], adv_tags= ['ADV']) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]], int, int]:
    """Collect adjective and adverb tokens, with their sentences, from CoNLL-U files.

    Parameters
    ----------
    dirs : List[str]
        Paths of the files to read; they are decoded as UTF-8.
    adj_tags, adv_tags : sequence of str
        Values of the 4th column that mark a token as adjective / adverb.
        The default lists are only read, never modified.

    Returns
    -------
    (ADJs, ADVs, sent_counter, word_counter)
        ADJs / ADVs are lists of (token, sentence) pairs, where sentence is the
        whitespace-normalised text of the latest '# text' line;
        sent_counter is the number of '# text' lines seen and word_counter the
        number of token lines whose first column is a plain integer.

    Side effects
    ------------
    Prints the sorted set of tags seen, the word total and the sentence total.
    """
    ADJs = []
    ADVs = []
    tagset = set()
    # Text of the most recent '# text' line. It is not reset between files.
    sentence = ''
    sent_counter = 0
    word_counter = 0
    for file_name in dirs:
        # Decode explicitly as UTF-8 so that non-ASCII (e.g. Persian) text does
        # not depend on the platform's default encoding.
        with open(file_name, encoding='utf-8') as file:
            for i, raw_line in enumerate(file):
                # NOTE(review): the first line of every file is never inspected
                # (behaviour kept from the original `i > 0` test). If a file
                # starts directly with '# text', that sentence is not counted
                # -- confirm this is intended.
                if i == 0:
                    continue

                parts = raw_line.split()
                if not parts:
                    continue

                if parts[0] == '#':
                    # The length guard keeps a bare '#' comment line from
                    # raising IndexError.
                    if len(parts) > 1 and parts[1] == 'text':
                        # parts[2] is expected to be the '=' sign (not checked);
                        # everything after it is the sentence.
                        sentence = ' '.join(parts[3:])
                        sent_counter += 1
                    continue

                # Token columns are tab-separated and the word form itself may
                # contain spaces, so splitting on any whitespace would shift the
                # tag column. Fall back to whitespace only for tab-less lines.
                if '\t' in raw_line:
                    fields = [field.strip() for field in raw_line.split('\t')]
                else:
                    fields = parts

                # Only plain integer ids count as words; ranges such as '1-2'
                # and decimals such as '1.1' fail isdigit(). Lines with fewer
                # than four columns are skipped instead of raising IndexError.
                if len(fields) < 4 or not fields[0].isdigit():
                    continue

                token = fields[1]
                tag = fields[3]
                tagset.add(tag)
                word_counter += 1

                if tag in adj_tags:
                    ADJs.append((token, sentence))
                elif tag in adv_tags:
                    ADVs.append((token, sentence))

    print('tagset:', sorted(tagset))
    print('number of words:', word_counter)
    print('number of sentences:', sent_counter)

    return ADJs, ADVs, sent_counter, word_counter

In [6]:
def input_paths(files: List[str], root= root_dir, lang_dir= 'PER_files') -> List[str]:
    """Turn bare file names into full paths under `<root>/data/<lang_dir>`.

    The returned list has one path per entry of `files`, in the same order.
    """
    base_folder = os.path.join(root, 'data', lang_dir)
    return [os.path.join(base_folder, file_name) for file_name in files]

In [7]:
#######################################################################

In [8]:
# Input files for Persian (looked up in the default 'PER_files' data folder)
Persian_file_names = [
    'fa_perdt-ud-dev.conllu',
    'fa_seraji-ud-train.conllu',
    'fa_seraji-ud-test.conllu',
    'fa_seraji-ud-dev.conllu',
]
PERfile_paths = input_paths(files=Persian_file_names)

In [9]:
# Output CSV paths for Persian: adjectives, adverbs, and their intersection
persian_adj_dir, persian_adv_dir, persian_intersection_dir = (
    os.path.join(root_dir, 'output', 'Persian', csv_name)
    for csv_name in ('persian_adjs.csv', 'persian_advs.csv', 'persian_inter.csv')
)

In [10]:
# Input files for English (looked up in the 'ENG_files' data folder)
English_file_names = [
    'en_partut-ud-dev.conllu',
    'en_partut-ud-train.conllu',
    'en_partut-ud-test.conllu',
    'en_pud-ud-test.conllu',
    'en_lines-ud-dev.conllu',
    'en_lines-ud-train.conllu',
    'en_lines-ud-test.conllu',
]
ENGfile_paths = input_paths(files=English_file_names, lang_dir='ENG_files')

In [11]:
# Output CSV paths for English: adjectives, adverbs, and their intersection
English_adj_dir, English_adv_dir, English_intersection_dir = (
    os.path.join(root_dir, 'output', 'English', csv_name)
    for csv_name in ('English_adjs.csv', 'English_advs.csv', 'English_inter.csv')
)

In [12]:
# Parse the Persian treebank files: (token, sentence) pairs for adjectives and adverbs,
# plus the sentence and word totals. The call also prints the tagset and both totals.
persian_adjs, persian_advs, pers_sent_count, pers_word_count = get_UD_data(PERfile_paths)

tagset: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']
number of words: 178067
number of sentences: 7453


In [13]:
# Parse the English treebank files: (token, sentence) pairs for adjectives and adverbs,
# plus the sentence and word totals. The call also prints the tagset and both totals.
english_adjs, english_advs, eng_sent_count, eng_word_count = get_UD_data(ENGfile_paths)

tagset: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
number of words: 165027
number of sentences: 8333


In [28]:
# Keep words found in at least 4 distinct sentences, then cap each
# count-sorted table at its first 3333 rows.
persian_adj_df = create_df(persian_adjs, pers_sent_count, pers_word_count, threshold=4).head(3333)
persian_adv_df = create_df(persian_advs, pers_sent_count, pers_word_count, category='adv', threshold=4).head(3333)

In [29]:
# Distinct Persian adverbs that remain in the table, in order of first appearance
persian_adv_df['word'].unique()

array(['پس', 'چه', 'بعد', 'پیش', 'خیلی', 'هیچ', 'حتی', 'بسیار', 'نه',
       'البته', 'مثل', 'چرا', 'چنین', 'فقط', 'حالا', 'تنها', 'هم', 'هنوز',
       'امروز', 'قبل', 'شاید', 'دیگر', 'همچنان', 'اکنون', 'کنون', 'چنان',
       'بالا', 'اینجا', 'آنجا', 'نیز', 'مثلا', 'همیشه', 'غیر', 'دیروز',
       'دور', 'مانند', 'چگونه', 'سپس', 'بیرون', 'چی', 'متأسفانه', 'امسال',
       'نزدیک', 'هرگز', 'اصلا', 'هنگام', 'الان', 'نظیر', 'حداقل',
       'اخیراً', 'خوب', 'کاملاً', 'کمی', 'چون', 'همچون', 'همواره', 'کجا',
       'معمولاً', 'گاهی', 'هرچه', 'بویژه', 'واقعاً', 'هم\u200cاکنون',
       'اتفاقا', 'مگر', 'چقدر', 'صرفاً', 'همزمان', 'قبلاً', 'سرانجام',
       'حال', 'روزانه', 'دوباره', 'به\u200cخوبی', 'به\u200cشدت',
       'احتمالاً', 'گویی', 'ظاهراً', 'بشدت', 'واقعا', 'قطعاً',
       'خوشبختانه', 'ناگهان', 'آخر', 'حتماً', 'نه\u200cتنها',
       'به\u200cزودی', 'به\u200cتدریج', 'هیچگاه', 'عملاً', 'دقیقاً',
       'تقریباً', 'اینک'], dtype=object)

In [30]:
# Keep words found in at least 4 distinct sentences, then cap each
# count-sorted table at its first 3333 rows.
english_adj_df = create_df(english_adjs, eng_sent_count, eng_word_count, threshold=4).head(3333)
english_adv_df = create_df(english_advs, eng_sent_count, eng_word_count, category='adv', threshold=4).head(3333)

In [31]:
# Distinct English adverbs that remain in the table, in order of first appearance
english_adv_df['word'].unique()

array(['out', 'also', 'up', 'so', 'more', 'only', 'now', 'very', 'then',
       'back', 'just', 'as', 'too', 'there', 'again', 'where', 'down',
       'how', 'well', 'even', 'still', 'here', 'never', 'off', 'at',
       'most', 'on', 'already', 'always'], dtype=object)

In [None]:
# Lift the column-width limit so that full sentences are displayed instead of truncated text
pd.set_option("display.max_colwidth", None)

In [38]:
# Problem with phrasal verbs: the particles ['out', 'up', 'back', 'down', 'off', 'at', 'on']
# show up among the adverbs. Inspect the sentences collected for 'up'.
english_adv_df.loc[english_adv_df['word'] == 'up', 'sent']

3767                                                                                                                                                                                                               Perhaps my father was a treasure chest because he seemed to be able to lay up for himself inexhaustible riches.
5542                                                                                                                                                                                                                    You know yourself that the agreement with South Africa was held up because of the fisheries question, too.
4532                                                                                                                                                                                                                    Shrugging, he gives up and I turn to the twice disagreeable chicken and eat guiltily, my appetite spoiled.
3945    Each time he took a wal

In [41]:
# Sentences collected for the adverb 'out'
english_adv_df.loc[english_adv_df['word'] == 'out', 'sent']

3976    Nevertheless, as time wore on he found himself...
5224    I flung out of his hut (he lived all alone in ...
5185    Such a suspicion made one pause – for out ther...
7460                               So we try it out here.
5176    He had served three terms of three years out t...
                              ...                        
4438    At last he called the operator and asked wheth...
7698        Shaking, Harry let Dobby out of the wardrobe.
4475    Visibility in the queue is poor because of the...
4451    In the top of the third St. Louis scored on a ...
7867    We cooked breakfast in the remains of the fryi...
Name: sent, Length: 222, dtype: object

In [40]:
# Restore the default column-width limit for the remaining outputs
pd.reset_option("display.max_colwidth")

In [17]:
# First three adjective rows for the word 'پیدا'
persian_adj_df.loc[persian_adj_df['word'] == 'پیدا'].head(3)

Unnamed: 0,word,adj_count,tag,sent,source,src_sent_count,src_word_count
8637,پیدا,112,adj,پیداست که این‌ها نفوذیند؛ این‌ها جزو آن دسته‌ی...,UD,7453,178067
10878,پیدا,112,adj,وی تأکید کرد: اکتفا به حمایت از فرمایشات ایشان...,UD,7453,178067
3022,پیدا,112,adj,بعد از این که سریال را به من پیشنهاد دادند که ...,UD,7453,178067


In [18]:
# Rows of words that occur in both the Persian adjective table and the Persian adverb table
persian_intersection = ADJ_ADV_intersection(persian_adj_df, persian_adv_df)

In [19]:
# Distinct Persian words found in both tables
persian_intersection['word'].unique()

array(['خوب', 'دیگر'], dtype=object)

In [20]:
# Rows of words that occur in both the English adjective table and the English adverb table
english_intersection = ADJ_ADV_intersection(english_adj_df, english_adv_df)

In [21]:
# Distinct English words found in both tables
english_intersection['word'].unique()

array(['more'], dtype=object)

In [22]:
# Check whether 'very' appears in the intersection with the adjective tag
english_intersection.loc[
    english_intersection['word'].eq('very') & english_intersection['tag'].eq('adj')
]

Unnamed: 0,word,tag,adj_count,adv_count,sent,source,src_sent_count,src_word_count


In [23]:
# Persian: share of intersection rows among all rows of the adjective and adverb tables
# (the denominator is adjective rows + adverb rows, not adverb rows alone)
len(persian_intersection) / (len(persian_adj_df) + len(persian_adv_df))

0.050405040504050404

In [24]:
# English: share of intersection rows among all rows of the adjective and adverb tables
len(english_intersection) / (len(english_adj_df) + len(english_adv_df))

0.03660366036603661

In [25]:
# Write the Persian adjective and adverb tables to their CSV paths.
# No `index` argument is passed, so the DataFrame index is written as the first column.
persian_adj_df.to_csv(persian_adj_dir)
persian_adv_df.to_csv(persian_adv_dir)

In [26]:
# Write the Persian intersection table to its CSV path (index column included)
persian_intersection.to_csv(persian_intersection_dir)