In [1]:
import os

import pandas as pd
from pandas import DataFrame, Series

In [2]:
#set home directory path
hdir = os.path.expanduser('~')

#pahlavi corpus directory
pah_path = hdir + "/Dropbox/Active_Directories/Digital_Humanities/Corpora/pahlavi_corpus/"

#pickle path
pickle_path = hdir + "/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [3]:
df_pahcorp = pd.read_csv (os.path.join(pickle_path,r'pahlavi_corpus.csv'))

In [4]:
df_pahcorp.head()

Unnamed: 0,title,line,index,token
0,PT36,1,0,PT36.
1,PT36,1,1,Abar
2,PT36,1,2,Madan
3,PT36,1,3,ī
4,PT36,1,4,Šā-Wahrām


----

## Key Word in Context

In [5]:
def index_kwic (term):
    
    """This function returns a dataframe filtered by the search term."""
    
    result = df_pahcorp[df_pahcorp['token'].str.match(term)]
    return result
    
    # str.match; the str part is telling match how to behave; .match is a method specific to pandas
    
    
# add regex functionality: 

In [6]:
def kwic_pah (term):
    
    for i, item in index_kwic(term).iterrows():
        
        title = item["title"]
        line = item["line"]
        
        filtered = df_pahcorp[(df_pahcorp['title']==title)&(df_pahcorp['line']==line)]
        # equivalent syntax: df_pahcorp.query(f'title == "{title}" and line == {line}')
        
        filtered = filtered.sort_values("index")
        # probably already sorted, but better to be on the safe side
                
        text = " ".join(filtered["token"])
        
        print(f'{title}: {line}\n{text}\n')
        
        # task: figure out how to color code results; termcolor package, has to be installed



        
# iterrows(): research what this does exactly, has something to do with dataframes being composed of series
        

In [7]:
#df_pahcorp["token"] == "afsōn"

In [11]:
kwic_pah ("hāmōn")

Pahlavi Rivāyat: 16b.10
u-š guft kū abr hāmōn wārān andar abāyēd +wirōzag kunēd ēk abāg did kōxšišn kārezār andar miyazd kunēnd mardōm ī wad [Tc44] andar miyazd rasēnd ud hamē ka ēdōn bawēd miyazd bē ō dēwān mad bawēd ∵

Pahlavi Rivāyat: 31c.7
tō kē Zarduxšt hē ka tangīh bawād ka frāxīh az-im dēn guftan abāz ma est čē ēn zamīg ka hāmōn bē bawēd āb ka andar zrēy bē estēd ud sēn-murw ka andar wēšag āšyān kunēd ud harw kē pad gēhān [Tc90] ka abāz [D59] āxēzēd ud xwaršēd ka pad ān rāh rawēd harw kē pad gēhān ka a-wināh bē bawēd Ahrimen ud dēwān ka abesīhēd ud hamāg yazišn ud niyāyišn ī Ohrmazd ud amahrspandān pad dēn bawēd

Dādestān ī dēnīg II: 69.2
pāsox ∵ ēd kū gyāg-iz kū kōf paydāgīg nēst wizandag būdan <’y’k'> pad-iz dēn rōšn paydāg kū pēš-iz az rōyišn ī kōfān ka būm hamāg hāmōn būd gēhān čandēnīdārīh hām-gēhān wizandag būd

Dādestān ī dēnīg II: 90.2
pāsox ēd kū āsmān ast gerd ud frāx buland u-š andarrōn hāmōn frāx-xāyagīhā ∵ u-š gētīy-ēw rōšnīh sag ī hamāg sagān saxttom hu

In [29]:
# Conditional frequency

# two problems: (1) find where the words are; (2) find what words are next to a particular location

# collect on i-1 i+1; counter is a function in the collections module that takes a list and turns it into a frequency dictionary
## A column is technically a series, which can work like a list.

In [37]:
indexed = list(index_kwic('hāmōn').index)

cd_index = []
length = len(indexed)
for i in range(length):
    cd_index.append((indexed[i]-1, indexed[i], indexed[i]+1))
    
    
    
# now need to use this (and counter module?) to count frequency of the terms immediately before and after, i.e. positions 0 and 2 in the tuple

In [38]:
cd_index

[(233989, 233990, 233991),
 (240938, 240939, 240940),
 (430296, 430297, 430298),
 (435883, 435884, 435885),
 (510282, 510283, 510284),
 (529918, 529919, 529920)]

In [44]:
for x in cd_index:
    print (cd_index[x][1])

TypeError: list indices must be integers or slices, not tuple

In [43]:
cd_index[1][1]

240939

In [49]:
df_pahcorp.iloc[240938]

title    Pahlavi Rivāyat
line                31c.7
index                  18
token                  ka
Name: 240938, dtype: object