# collect and count adjacent words
given a list of keywords, this routine considers each word in turn, crawls through the corpus and finds all adjacent words (of the chosen type)  to that word. adjacent word means the word immediately preceding or following the target word, in the same sentence as the target word.

for each keyword, it produces a data frame of adjacent words, whether the adjacent word preceeded or followed, and the frequency of such pairing. 

In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
# input
data_dir = Path("/Users/oholm/annad/ordtidni/Gigaword/output/")
results_dir = Path("/Users/oholm/annad/ordtidni/data/adjacent_adjectives")
input_file_path = data_dir.glob('*/**/*')
filepaths = [item for item in input_file_path if (item.is_file() and str(item).endswith('.parquet'))]
# input: nouns as keywords frequencies
noun_freq_filepath = data_dir/'word_occurrence_counts.parquet'
# output
noun_adj_filepath = data_dir/'noun_adj/nouns_and_pre_post_adj_freq.parquet'

In [3]:
noun_frequencies =\
    pd.read_parquet(noun_freq_filepath)\
    .astype(int)\
    .sort_values(by='occurance_count', ascending=False)

noun_frequencies.head(10)

Unnamed: 0,occurance_count
ár,5743010
mál,3226065
maður,3118502
dagur,2824253
land,1972793
lag,1967438
leikur,1812649
Ísland,1758909
vegur,1701757
tími,1554961


In [4]:
# iterate for each keyword in the list
for keyword in noun_frequencies.index[:3]:
    print(keyword)
keyword = noun_frequencies.index[0]

ár
mál
maður


In [5]:
keywords =noun_frequencies.index[0:30]
print(keywords)

Index(['ár', 'mál', 'maður', 'dagur', 'land', 'lag', 'leikur', 'Ísland',
       'vegur', 'tími', 'lið', 'fólk', 'staður', 'króna', 'forseti', 'barn',
       'félag', 'fyrirtæki', 'stefnandi', 'mark', 'kona', 'leið', 'Reykjavík',
       'hönd', 'starf', 'ríki', 'mánuður', 'hluti', 'milljón', 'þáttur'],
      dtype='object')


In [6]:
%%time
print('crawling through corpus. processing', len(filepaths),\
          'corpus files', len(keywords), ' times')

for keyword in tqdm(keywords):
    
    all_matches = pd.DataFrame()
    print('----'*9, keyword, '----'*9)
    for filepath in tqdm(filepaths):
#         print(filepath.parent)
        corpus_doc = pd.read_parquet(filepath)
        # mark up corpus terms by properties of the previous word
        corpus_doc['preceding_same_sentence'] =\
            ((corpus_doc['Sentence'].shift(-1) == corpus_doc['Sentence'])\
            & (corpus_doc['Paragraph'].shift(-1) == corpus_doc['Paragraph']))
        corpus_doc['preceding_lemma'] = corpus_doc['Lemma'].shift(-1)
        corpus_doc['preceding_isadj'] = corpus_doc['POS'].shift(-1).str.startswith('l')
        # mark up corpus terms by properties of the following word
        corpus_doc['following_same_sentence'] =\
            ((corpus_doc['Sentence'].shift(1) == corpus_doc['Sentence'])\
            & (corpus_doc['Paragraph'].shift(1) == corpus_doc['Paragraph']))
        corpus_doc['following_lemma'] = corpus_doc['Lemma'].shift(1)
        corpus_doc['following_isadj'] = corpus_doc['POS'].shift(1).str.startswith('l')
#         display(corpus_doc.head(100).tail(10)) 

        # focus on nouns that are preceded by or followed by an adjective.
        # 1. first filter on targer keyword.
        # 2. then ensure preceding word is in same sentence
        # 3. then ensure preceding word is adjective
        # (repeat for following word)
        targets_with_matching_preceding =\
            (corpus_doc['Lemma']==keyword)\
            & (corpus_doc['preceding_same_sentence'] )\
            & (corpus_doc['preceding_isadj'])
        targets_with_matching_following =\
            (corpus_doc['Lemma']==keyword)\
            & (corpus_doc['following_same_sentence'] )\
            & (corpus_doc['following_isadj'])
        # collect matching preceding lemmas only
        target_preceded_occurences_df =\
            corpus_doc[['preceding_lemma']][targets_with_matching_preceding]
        target_preceded_occurences_df['placement'] = 'preceding'
        target_preceded_occurences_df\
            .rename(columns={'preceding_lemma':'adjacent_lemma'},
                    inplace=True)
#         display(target_preceded_occurences_df.head(3))
        # collect matching following lemmas only
        target_followed_occurences_df =\
            corpus_doc[['following_lemma']][targets_with_matching_following]
        target_followed_occurences_df['placement'] = 'following'
        target_followed_occurences_df\
            .rename(columns={'following_lemma':'adjacent_lemma'},
                    inplace=True)
        # combine all found so far:
        all_matches =\
            pd.concat([all_matches,
                       target_preceded_occurences_df,
                       target_followed_occurences_df], 
                      axis=0)
        
    all_matches['target_lemma'] = keyword
    display(all_matches)
    print('found', all_matches.shape[0], 'matching pairs')
    output_file = results_dir/ (keyword+'_adjacent_adjectives.parquet')
    all_matches.to_parquet(output_file)




  0%|          | 0/30 [00:00<?, ?it/s]
  0%|          | 0/661 [00:00<?, ?it/s][A

crawling through corpus. processing 661 corpus files 30  times
------------------------------------ ár ------------------------------------



  0%|          | 1/661 [00:01<11:48,  1.07s/it][A
  0%|          | 2/661 [00:02<12:18,  1.12s/it][A
  0%|          | 3/661 [00:03<12:10,  1.11s/it][A
  1%|          | 4/661 [00:04<12:51,  1.17s/it][A
  1%|          | 5/661 [00:05<11:02,  1.01s/it][A
  1%|          | 6/661 [00:05<08:16,  1.32it/s][A
  1%|          | 7/661 [00:05<06:52,  1.59it/s][A
  1%|          | 8/661 [00:06<06:08,  1.77it/s][A
  1%|▏         | 9/661 [00:06<04:38,  2.34it/s][A
  2%|▏         | 10/661 [00:06<05:12,  2.08it/s][A
  2%|▏         | 11/661 [00:08<07:18,  1.48it/s][A
  2%|▏         | 12/661 [00:08<06:59,  1.55it/s][A
  2%|▏         | 13/661 [00:09<07:44,  1.39it/s][A
  2%|▏         | 14/661 [00:10<09:03,  1.19it/s][A
  2%|▏         | 15/661 [00:10<06:59,  1.54it/s][A
  3%|▎         | 17/661 [00:11<05:14,  2.05it/s][A
  3%|▎         | 18/661 [00:11<04:30,  2.38it/s][A
  3%|▎         | 19/661 [00:11<04:02,  2.65it/s][A
  3%|▎         | 20/661 [00:11<03:39,  2.92it/s][A
  3%|▎         | 21/

Unnamed: 0,adjacent_lemma,placement,target_lemma
377,nýr,preceding,ár
1219,undanfarinn,preceding,ár
1318,undanfarinn,preceding,ár
6520,síðari,preceding,ár
8370,rúmur,preceding,ár
...,...,...,...
51522,efnahagslegur,following,ár
52807,gamall,following,ár
53750,gamall,following,ár
58606,liðinn,following,ár


found 1403097 matching pairs


  3%|▎         | 1/30 [6:33:33<190:13:03, 23613.23s/it]
  0%|          | 0/661 [00:00<?, ?it/s][A

------------------------------------ mál ------------------------------------



  0%|          | 1/661 [00:01<16:07,  1.47s/it][A
  0%|          | 2/661 [00:03<16:39,  1.52s/it][A
  0%|          | 3/661 [00:08<30:54,  2.82s/it][A
  1%|          | 4/661 [00:10<25:02,  2.29s/it][A
  1%|          | 5/661 [00:10<19:27,  1.78s/it][A
  1%|          | 6/661 [00:11<14:58,  1.37s/it][A
  1%|          | 7/661 [00:11<13:26,  1.23s/it][A
  1%|          | 8/661 [00:13<13:06,  1.20s/it][A
  1%|▏         | 9/661 [00:13<10:08,  1.07it/s][A
  2%|▏         | 10/661 [00:14<11:03,  1.02s/it][A
  2%|▏         | 11/661 [00:16<13:07,  1.21s/it][A
  2%|▏         | 12/661 [00:17<11:44,  1.09s/it][A
  2%|▏         | 13/661 [00:19<16:11,  1.50s/it][A
  2%|▏         | 14/661 [00:24<26:49,  2.49s/it][A
  2%|▏         | 15/661 [00:24<19:56,  1.85s/it][A
  2%|▏         | 16/661 [00:24<14:16,  1.33s/it][A
  3%|▎         | 17/661 [00:25<11:55,  1.11s/it][A
  3%|▎         | 18/661 [00:26<11:00,  1.03s/it][A
  3%|▎         | 19/661 [00:27<10:21,  1.03it/s][A
  3%|▎         | 20/

Unnamed: 0,adjacent_lemma,placement,target_lemma
1869,smávægilegur,preceding,mál
6029,talaður,preceding,mál
11127,lítill,preceding,mál
16368,ofangreindur,preceding,mál
21135,einn,preceding,mál
...,...,...,...
48502,traustur,following,mál
49574,snúinn,following,mál
52978,margur,following,mál
55476,innfæddur,following,mál


found 434269 matching pairs


  7%|▋         | 2/30 [7:50:25<139:19:23, 17912.99s/it]
  0%|          | 0/661 [00:00<?, ?it/s][A

------------------------------------ maður ------------------------------------



  0%|          | 1/661 [00:01<15:37,  1.42s/it][A
  0%|          | 2/661 [00:03<16:28,  1.50s/it][A
  0%|          | 3/661 [00:04<15:54,  1.45s/it][A
  1%|          | 4/661 [00:05<15:11,  1.39s/it][A
  1%|          | 5/661 [00:06<12:21,  1.13s/it][A
  1%|          | 6/661 [00:06<09:05,  1.20it/s][A
  1%|          | 7/661 [00:06<07:09,  1.52it/s][A
  1%|          | 8/661 [00:06<06:05,  1.79it/s][A
  2%|▏         | 10/661 [00:07<05:07,  2.12it/s][A
  2%|▏         | 11/661 [00:08<06:28,  1.68it/s][A
  2%|▏         | 12/661 [00:08<06:09,  1.76it/s][A
  2%|▏         | 13/661 [00:09<06:43,  1.61it/s][A
  2%|▏         | 14/661 [00:10<07:42,  1.40it/s][A
  2%|▏         | 15/661 [00:10<05:43,  1.88it/s][A
  3%|▎         | 17/661 [00:10<04:16,  2.52it/s][A
  3%|▎         | 18/661 [00:10<03:35,  2.98it/s][A
  3%|▎         | 19/661 [00:11<03:08,  3.40it/s][A
  3%|▎         | 20/661 [00:11<02:45,  3.87it/s][A
  3%|▎         | 21/661 [00:16<17:37,  1.65s/it][A
  3%|▎         | 22

KeyboardInterrupt: 

In [None]:
# 

In [44]:
# import datetime as DT
# d = {'case'        : pd.Series([1,1,1,1,2]),
#      'open'        : pd.Series([DT.datetime(2014, 3, 2), DT.datetime(2014, 3, 2),DT.datetime(2014, 3, 2),DT.datetime(2014, 3, 2),DT.datetime(2014, 3, 2)]),
#      'change'      : pd.Series([DT.datetime(2014, 3, 8), DT.datetime(2014, 4, 8),DT.datetime(2014, 5, 8),DT.datetime(2014, 6, 8),DT.datetime(2014, 6, 8)]),
#      'StartEvent'  : pd.Series(['Homeless','Homeless','Homeless','Homeless','Jail']),
#      'ChangeEvent' : pd.Series(['Homeless','irrelevant','Homeless','Jail','Jail']),
#      'close'       : pd.Series([DT.datetime(2015, 3, 2), DT.datetime(2015, 3, 2),DT.datetime(2015, 3, 2),DT.datetime(2015, 3, 2),DT.datetime(2015, 3, 2)])
#     }
# df=pd.DataFrame(d)
# df2 = df
# df['ok'] = True
# df2['notok'] = False
# pd.concat([df,df2],axis=0)

Unnamed: 0,case,open,change,StartEvent,ChangeEvent,close,ok,notok
0,1,2014-03-02,2014-03-08,Homeless,Homeless,2015-03-02,True,False
1,1,2014-03-02,2014-04-08,Homeless,irrelevant,2015-03-02,True,False
2,1,2014-03-02,2014-05-08,Homeless,Homeless,2015-03-02,True,False
3,1,2014-03-02,2014-06-08,Homeless,Jail,2015-03-02,True,False
4,2,2014-03-02,2014-06-08,Jail,Jail,2015-03-02,True,False
0,1,2014-03-02,2014-03-08,Homeless,Homeless,2015-03-02,True,False
1,1,2014-03-02,2014-04-08,Homeless,irrelevant,2015-03-02,True,False
2,1,2014-03-02,2014-05-08,Homeless,Homeless,2015-03-02,True,False
3,1,2014-03-02,2014-06-08,Homeless,Jail,2015-03-02,True,False
4,2,2014-03-02,2014-06-08,Jail,Jail,2015-03-02,True,False


In [None]:
# def get_nearest_words(corpus_df, lemma_to_look_for, pos_class):
#     """
#     given a lemma, and a corpus dataframe, find all instances of the word in the dataframe, as well as the words
#     preceeding and following each. 
#     and if that word is an adjective, update the count 
#     """
    
    
