In [1]:
import os
import pandas as pd
from typing import List, Tuple, Dict
from collections import defaultdict

In [2]:
root_dir = '/Users/marjan/Desktop/project/'

In [3]:
#input directiries for Persian
Persian_file_names = ['fa_perdt-ud-dev.conllu',
                 'fa_seraji-ud-train.conllu', 'fa_seraji-ud-test.conllu', 'fa_seraji-ud-dev.conllu']

In [4]:
#output directories for Persian
persian_adj_dir = os.path.join(root_dir, 'output', 'Persian', 'persian_adjs.csv')
persian_adv_dir = os.path.join(root_dir, 'output', 'Persian', 'persian_advs.csv')
persian_intersection_dir = os.path.join(root_dir, 'output', 'Persian', 'persian_inter.csv')

In [5]:
#input directiries for Enlglish
English_file_names = ['en_partut-ud-dev.conllu', 'en_partut-ud-train.conllu', 'en_partut-ud-test.conllu', 
                 'en_pud-ud-test.conllu', 'en_lines-ud-dev.conllu', 'en_lines-ud-train.conllu', 'en_lines-ud-test.conllu']

In [6]:
#output directories for English
English_adj_dir = os.path.join(root_dir, 'output', 'English', 'English_adjs.csv')
English_adv_dir = os.path.join(root_dir, 'output', 'English', 'English_advs.csv')
English_intersection_dir = os.path.join(root_dir, 'output', 'English', 'English_inter.csv')

In [7]:
#functions

In [8]:
def create_df(lst: List[Tuple[str, str, str]], sent_count: int, word_count: int, tagset=['ADV', 'ADJ'], src = 'UD')-> pd.DataFrame:
    
    df = pd.DataFrame(lst, columns =['word', 'tag', 'sent'])
    df = df.drop_duplicates()
    
    for tag in tagset:
        df[tag+'_count'] = 0
        
    df['source'] = src
    df['src_sent_count'] = sent_count
    df['src_word_count'] = word_count
    
    for token in df['word'].unique():  
        mask = df['word'] == token
        curr_tags = df[mask]['tag'].unique()
        for tag in curr_tags:
            if tagset[0] in curr_tags:
                df.loc[mask, tagset[0]+'_count'] = len(df[mask & (df['tag'] == tagset[0])])
            if tagset[1] in curr_tags:
                df.loc[mask, tagset[1]+'_count'] = len(df[mask & (df['tag'] == tagset[1])])
        
    
    return df

In [9]:
def overlap(df, tagset= ['ADJ', 'ADV'], thresholds=[1,1], sortby= ['ADV_count', 'ADJ_count']):
    for idx, tag in enumerate(tagset):
        df = df[df[tag+'_count'] >= thresholds[idx]]
    if sortby:
        df = df.sort_values(by=sortby+['word'], ascending=False)
    return df

In [10]:
def get_tag_data(dirs: List[str], tags=['ADJ','ADV']) -> (List[Tuple[str, str]], List[Tuple[str, str]]):
    population = []
    tagset = []
    text = []
    sent_counter = 0
    word_counter = 0
    
    for file_name in dirs[:2]:
        with open(file_name) as file:
            lines = file.readlines() 
            for i, line in enumerate(lines):
                line = line.split()
                if line and i > 0:
                    if line[0] == '#' and line[1] == 'text':
                        text = line[3:]
                        sent_counter += 1

                    if line[0].isdigit():
                        token = line[1]
                        tag = line[3]
                        tagset += [tag]
                        word_counter += 1

                        if tag in tags:
                            population += [(token, tag, ' '.join(text))]
                            

    print('tagset:', sorted(set(tagset)))
    print('total number of words:', word_counter)
    print('total number of sentences:', sent_counter)
            
    return population, sent_counter, word_counter

In [11]:
def input_paths(files: List[str], root= root_dir, lang_dir= 'PER_files') -> List[str]:
    lst = []
    directory = os.path.join(root, 'data', lang_dir)
    for name in files:
        lst.append(os.path.join(directory, name))
    return lst

In [12]:
#######################################################################

In [13]:
PERfile_paths = input_paths(Persian_file_names)

In [14]:
ENGfile_paths = input_paths(English_file_names, lang_dir='ENG_files' )

In [15]:
pers_adjv, pers_sent_count, pers_word_count = get_tag_data(PERfile_paths)

tagset: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']
total number of words: 146211
total number of sentences: 6254


In [32]:
eng_adjv, eng_sent_count, eng_word_count = get_tag_data(ENGfile_paths)

tagset: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
total number of words: 46226
total number of sentences: 1937


In [33]:
pers_df = create_df(pers_adjv, pers_sent_count, pers_word_count)
eng_df = create_df(eng_adjv, eng_sent_count, eng_word_count)

In [34]:
pers_df.head()

Unnamed: 0,word,tag,sent,ADV_count,ADJ_count,source,src_sent_count,src_word_count
0,دینی,ADJ,به گزارش خبرنگار مهر در گرگان، بر اساس باورهای...,0,39,UD,6254,146211
1,بعد,ADJ,به گزارش خبرنگار مهر در گرگان، بر اساس باورهای...,104,13,UD,6254,146211
2,عزادار,ADJ,به گزارش خبرنگار مهر در گرگان، بر اساس باورهای...,0,1,UD,6254,146211
3,ایرانی,ADJ,این میهمانی به منظور آشنایی هم‌تیمی‌های او با ...,0,75,UD,6254,146211
4,نزدیک,ADJ,با نزدیک شدن مهلت پیشنهادی دولت روسیه، مبنی بر...,24,10,UD,6254,146211


In [46]:
pers_overlap = overlap(pers_df, thresholds=[4,4])
eng_overlap = overlap(eng_df, thresholds=[4,4])

In [47]:
pers_overlap

Unnamed: 0,word,tag,sent,ADV_count,ADJ_count,source,src_sent_count,src_word_count
1,بعد,ADJ,به گزارش خبرنگار مهر در گرگان، بر اساس باورهای...,104,13,UD,6254,146211
395,بعد,ADV,اول سفیدی را پوشش می‌دهیم بعد ابرو را تیغ می‌ز...,104,13,UD,6254,146211
441,بعد,ADV,اول جیب‌هایش را گشت، بعد روی کارتن‌ها و زمین را.,104,13,UD,6254,146211
452,بعد,ADJ,دیگر هیچ کارگری، امیدی به روز بعد خود نداشت.,104,13,UD,6254,146211
517,بعد,ADV,"ننه فانوس را خاموش کرد و بعد گفت: ""بعد از آن ح...",104,13,UD,6254,146211
...,...,...,...,...,...,...,...,...
6862,زودتر,ADV,بنابراین به مبتلایان به دیابت توصیه می‌شود حدا...,4,5,UD,6254,146211
7487,زودتر,ADV,دبیر شورای نگهبان پریروز در مصاحبه‌ای با سیما ...,4,5,UD,6254,146211
9763,زودتر,ADJ,وزیر امور خارجه در گفت و گو با خبرگزاری جمهوری...,4,5,UD,6254,146211
10058,زودتر,ADJ,در این دیدار، آقای احمد مسجدجامعی، قائم‌مقام و...,4,5,UD,6254,146211


In [48]:
eng_overlap

Unnamed: 0,word,tag,sent,ADV_count,ADJ_count,source,src_sent_count,src_word_count
100,more,ADV,"In fact, we're more closely related to fungi t...",82,14,UD,1937,46226
115,more,ADJ,"In a single cubic inch of soil, there can be m...",82,14,UD,1937,46226
225,more,ADV,But when innovation affects an automobile's qu...,82,14,UD,1937,46226
229,more,ADV,Women who have completed secondary or tertiary...,82,14,UD,1937,46226
266,more,ADV,Some critics consider Balzac's writing exempla...,82,14,UD,1937,46226
...,...,...,...,...,...,...,...,...
3604,late,ADJ,"In fact, South Korean women participate in the...",4,5,UD,1937,46226
4565,late,ADV,"Although he married late in life, Balzac had a...",4,5,UD,1937,46226
4569,late,ADJ,"""In late April the newly-weds set off for Paris.",4,5,UD,1937,46226
4931,late,ADV,Most playwrights of the period typically colla...,4,5,UD,1937,46226


In [49]:
pers_overlap['word'].unique()

array(['بعد', 'پیش', 'بسیار', 'تنها', 'قبل', 'دیگر', 'نزدیک', 'دور',
       'بالا', 'خوب', 'کمی', 'همزمان', 'روزانه', 'دوباره', 'آخر', 'باز',
       'درست', 'زیاد', 'دیر', 'بهتر', 'تازه', 'سخت', 'زودتر'],
      dtype=object)

In [50]:
eng_overlap['word'].unique()

array(['more', 'most', 'just', 'much', 'later', 'long', 'early', 'first',
       'better', 'late'], dtype=object)

In [51]:
pd.set_option("display.max_colwidth", None)

In [24]:
eng_df[eng_df['word'] == 'bad'].head()

Unnamed: 0,word,tag,sent,ADV_count,ADJ_count,source,src_sent_count,src_word_count
2515,bad,ADV,"Rolf Bolin, who was a professor at the Hopkin's Marine Station where I work, wrote in the 1940s that ""The fumes from the scum floating on the inlets of the bay were so bad they turned lead-based paints black"".",2,2,UD,1937,46226
2652,bad,ADJ,"For the most part, that is not a bad thing.",2,2,UD,1937,46226
4381,bad,ADV,"In Saintsbury's view, ""They are curiously, interestingly, almost enthrallingly bad.",2,2,UD,1937,46226
5108,bad,ADJ,"Alfred Pollard termed some of them ""bad quartos"" because of their adapted, paraphrased or garbled texts, which may in places have been reconstructed from memory.",2,2,UD,1937,46226


In [26]:
pd.reset_option("display.max_colwidth")